Converts Kindle highlights to Markdown pages

This is a program I wrote in Python that takes highlights and notes exported from Amazon Kindle (in HTML format) and converts them to clean, easy-to-read Markdown for use in Notion.

This helped me transfer all of my highlights from my Kindle and add them to my Reading List, a database in Notion that shows every book I’ve ever read and additional information about/from each book.

Download the source code

Here’s an example of kindle2notion in action:

When I export from Kindle, it gives me an HTML page of my highlights that looks like this:

This is the HTML behind that. kindle2notion takes that HTML, cleans it up, and formats it in Markdown:

Which I can then easily import into Notion:

Source Code


from bs4 import BeautifulSoup
import os
import sys
import re

def clean_up_soup(soup_clean):
    """Tidy stray whitespace around punctuation and quote marks.

    Two rewrites are applied to ``soup_clean``, in this order:

    * One whitespace character sitting between any non-newline character
      and one of ``, . ; : — - – ! ?`` is dropped. This substitution runs
      twice on purpose: matches cannot overlap, so a single pass leaves
      gaps behind (``"a . ."`` needs both passes to become ``"a.."``).
    * One whitespace character directly after any of ``« ‘ “ ' "`` is
      dropped when another non-newline character follows it. The pattern
      does not distinguish opening from closing quote marks.

    Returns the cleaned string.
    """
    space_before_punctuation = r'(.)(\s)(\,|\.|\;|\:|\—|\-|\–|\!|\?)'
    space_after_quote = r'(\«|\‘|\“|\'|\")(\s)(.)'
    keep_outer_groups = r'\1\3'

    tidied = soup_clean
    for _ in range(2):
        tidied = re.sub(space_before_punctuation, keep_outer_groups, tidied)
    return re.sub(space_after_quote, keep_outer_groups, tidied)

def writeNotes(diluted_soup):
    """Append the given highlight elements to ``<bookTitle>.md`` as Markdown.

    ``diluted_soup`` is an iterable of BeautifulSoup tags. Each tag is
    dispatched on its first CSS class:

    * ``sectionHeading`` -- written as a level-1 heading (``# ...``) using
      the tag's first child.
    * ``noteText`` -- cleaned with ``clean_up_soup`` and written as a block
      quote, followed by a line holding a single em dash as separator.
    * ``noteHeading`` -- reduced to its ``Page N`` / ``Location N`` parts,
      joined with ``" - "``.

    Tags whose first class is anything else are ignored.

    The output file name is built from the module-level ``bookTitle``, which
    must be set before this function is called. The file is opened in append
    mode, so running the conversion twice for one book duplicates its notes.
    """
    # Compiled once up front instead of on every noteHeading.
    page_pattern = re.compile(r'Page [0-9]+|Location [0-9]+')

    # The output contains non-ASCII characters (em dash, curly quotes), so the
    # encoding is pinned instead of depending on the platform default. The
    # ``with`` block guarantees the file is flushed and closed, even on error.
    with open(bookTitle + '.md', "a", encoding="utf-8") as notebook:
        for item in diluted_soup:
            kind = item['class'][0]
            if kind == "sectionHeading":
                notebook.write('# ' + item.contents[0] + '\n')
            elif kind == 'noteText':
                # ``.string`` is None when the tag has zero or several
                # children; fall back to the concatenated text so the
                # clean-up regexes always receive a string.
                text = item.string
                if text is None:
                    text = item.get_text()
                notebook.write('> ' + clean_up_soup(text) + '\n\n—\n\n')
            elif kind == 'noteHeading':
                heading = item.get_text(' ', strip=True)
                locations = page_pattern.findall(heading)
                notebook.write(' - '.join(locations) + '\n\n')

# Driver: scan for Kindle HTML exports and parse each one.
# NOTE(review): os.listdir() with no argument lists the current working
# directory, while each file is opened relative to sys.path[0] (normally the
# directory containing the script). These match only when the script is run
# from its own directory -- confirm which directory is intended.
# NOTE(review): nothing in the lines shown here passes diluted_soup to
# writeNotes; presumably that call follows this excerpt -- confirm.
for filename in os.listdir():
    if filename.endswith(".html"): 
        # Parse the export with Python's built-in html.parser backend.
        # NOTE(review): the handle returned by open() is never explicitly
        # closed, and no encoding is given.
        soup = BeautifulSoup(open(os.path.join(sys.path[0], filename), "r"), 'html.parser')
        # Title = first child of the first element with class "bookTitle",
        # with line-feed / carriage-return characters removed. writeNotes
        # reads this module-level name to build the output file name.
        bookTitle = re.sub('\n|\r', '', soup.find(True, {'class':['bookTitle']}).contents[0])
        # Collect every element carrying one of the three classes that
        # writeNotes knows how to render.
        diluted_soup = soup.find_all(True, {'class':['sectionHeading', 'noteHeading', 'noteText']})