joplin-mdbooks-website/joplinexport

386 lines
13 KiB
Plaintext
Raw Normal View History

2020-11-20 13:22:00 -05:00
#!/usr/bin/env python3
2021-01-30 19:02:40 -05:00
import dataclasses
2023-07-18 19:24:20 -04:00
import json
2021-01-30 19:02:40 -05:00
import mimetypes
2020-11-20 13:22:00 -05:00
import re
import sqlite3
from collections import defaultdict
2020-11-30 07:49:56 -05:00
from datetime import datetime
2020-11-20 13:22:00 -05:00
from pathlib import Path
2020-11-20 14:29:02 -05:00
from shutil import copy
2020-11-20 13:22:00 -05:00
from shutil import rmtree
2021-01-30 19:02:40 -05:00
from typing import Dict
from typing import List
2020-11-20 13:22:00 -05:00
from typing import Optional
2021-01-30 19:02:40 -05:00
from typing import Set
2021-11-19 21:13:37 -05:00
from typing import Union
2020-11-20 13:22:00 -05:00
2021-01-07 12:31:18 -05:00
def contains_word(word: str, text: str) -> bool:
    """
    Check whether `text` contains `word`, as a whole word.

    Case insensitive. `word` is matched literally: regex metacharacters in it
    carry no special meaning.

    Args:
        word: The word to look for.
        text: The text to search in.

    Returns:
        True if `word` occurs in `text` without a word character directly
        before or after it. An empty `word` never matches.
    """
    if not word:
        return False
    # Look-arounds instead of `\b`: for ordinary words they are equivalent,
    # but they also give sensible results when `word` starts or ends with a
    # non-word character (e.g. "c++").
    pattern = rf"(?<!\w){re.escape(word)}(?!\w)"
    return re.search(pattern, text, flags=re.IGNORECASE) is not None
2020-11-20 13:22:00 -05:00
def slugify(text: str) -> str:
    """
    Convert `text` into a slug.

    The text is lowercased, every run of non-word characters and underscores
    becomes a single dash, and leading/trailing dashes are removed.
    """
    lowered = text.lower()
    dashed = re.sub(r"[\W_]+", "-", lowered)
    return dashed.strip("-")
2021-11-19 21:13:37 -05:00
@dataclasses.dataclass
class Folder:
    """A helper type for a folder."""

    id: str
    parent_id: str
    title: str
    # Text placed directly before the title in the summary line; may be empty.
    icon: str

    def is_for_blog(self) -> bool:
        """
        Return whether this folder is for posting to blog.

        Currently looks for the word "Blog" in the folder's title.
        """
        return contains_word("Blog", self.title)

    def get_url(self) -> str:
        """Return the folder's relative URL."""
        return slugify(self.title)

    def get_summary_line(self, level: int) -> str:
        """
        Get the appropriate summary file line for this folder.

        Args:
            level: The nesting depth of the folder, 1 for a top-level folder.

        Returns:
            A Markdown list item linking to the folder's `index.md`, indented
            according to `level`.
        """
        # Four spaces per level: a nested Markdown list item must be indented
        # at least as far as its parent's content, and a single space per
        # level would leave every entry at the top level.
        indent = "    " * (level - 1)
        return f"{indent}- [{self.icon or ''}{self.title}]({self.get_url()}/index.md)"

    def __lt__(self, other: Union["Folder", "Note"]) -> bool:
        """Support comparison, for sorting."""
        if not isinstance(other, Folder):
            # Folders always come after notes.
            return False
        return self.title.lower() < other.title.lower()

    def __repr__(self) -> str:
        """Pretty-print this class."""
        return f"Folder: <{self.title}>"
2021-01-30 19:02:40 -05:00
@dataclasses.dataclass
class Note:
    """A helper type for a note."""

    id: str
    folder: "Folder"
    title: str
    body: str
    updated_time: datetime
    tags: List[str] = dataclasses.field(default_factory=list)

    def is_for_blog(self) -> bool:
        """
        Check whether a note is tagged for being published to blog.

        This function checks a note's title and tags: it returns True when
        the title contains a blog keyword as a whole word (case insensitive)
        or when one of the tags is exactly a blog keyword.
        """
        blog_keywords = ["blog"]
        return any(
            contains_word(keyword, self.title) or keyword in self.tags
            for keyword in blog_keywords
        )

    def get_url(self) -> str:
        """Return the note's relative URL."""
        return slugify(self.folder.title) + "/" + slugify(self.title)

    def get_summary_line(self, level: int) -> str:
        """
        Get the appropriate summary file line for this note.

        The introduction is level 0, and is treated differently here: it gets
        neither indentation nor a list bullet.

        Args:
            level: The nesting depth of the note, 0 for the introduction.

        Returns:
            A Markdown link to the note's `.md` file, as a list item indented
            according to `level` when `level` is greater than 0.
        """
        # Four spaces per level: a nested Markdown list item must be indented
        # at least as far as its parent's content, and a single space per
        # level would leave every entry at the top level.
        indent = "    " * (level - 1)
        bullet = "- " if level > 0 else ""
        return f"{indent}{bullet}[{self.title}]({self.get_url()}.md)"

    def __lt__(self, other: Union["Folder", "Note"]) -> bool:
        """Support comparison, for sorting."""
        if not isinstance(other, Note):
            # Notes always come before folders. Folder.__lt__ never sorts a
            # folder before a note, so comparing titles here would make the
            # ordering depend on which operand happens to be on the left.
            return True
        return self.title.lower() < other.title.lower()

    def __repr__(self) -> str:
        """Pretty-print this class."""
        return f"Note: <{self.title}>"
2020-11-20 13:22:00 -05:00
2021-01-30 19:02:40 -05:00
@dataclasses.dataclass
class Resource:
    """A helper type for a resource."""

    title: str
    # The actual extension that the file stored in Joplin has.
    extension: str
    mimetype: str

    @property
    def derived_ext(self):
        """
        Return an extension derived from the resource's mime type.

        The result is an empty string when no extension is known for the
        mime type.
        """
        guessed = mimetypes.guess_extension(self.mimetype, strict=False)
        if guessed is None:
            return ""
        return guessed
2020-11-20 13:22:00 -05:00
class JoplinExporter:
    """
    The main exporter class.

    Reads folders, tags, resources and notes from the Joplin database, writes
    one Markdown file per note plus an `index.md` per folder and the
    `SUMMARY.md` that mdBook needs, and copies the resources that the
    exported notes link to.
    """

    content_dir = Path("content")
    static_dir = Path("static/resources")
    joplin_dir = Path.home() / ".config/joplin-desktop"

    def __init__(self):
        """Set up the empty lookup tables that `read_data` fills in."""
        self.resources: Dict[str, Resource] = {}
        self.used_resources: Set[str] = set()

        # A mapping of {"note_id": Note()}.
        self.note_lookup_dict: Dict[str, Note] = {}
        # A mapping of {"folder_id": Folder()}.
        self.folders: Dict[str, Folder] = {}
        # A mapping of {"folder_id": [Note(), Note()]}.
        self.notes: Dict[str, List[Note]] = defaultdict(list)

    def clean_content_dir(self):
        """Reset the content directory to a known state to begin."""
        rmtree(self.content_dir, ignore_errors=True)
        rmtree(self.static_dir, ignore_errors=True)
        self.content_dir.mkdir(parents=True)
        self.static_dir.mkdir(parents=True)

    def resolve_note_links(self, note: "Note") -> str:
        """
        Resolve the links between notes and replace them in the body.

        Every `](:/<32 hex chars>)` link target, optionally followed by a
        `#anchor`, is rewritten to an absolute path: the linked note's page,
        else the linked resource, else the bare ID when neither is known.

        Returns:
            The note's body with the link targets replaced.
        """

        def replacement(match):
            item_id = match.group(1)
            new_url = self.get_note_url_by_id(item_id)
            if new_url:
                new_url += ".html"
            else:
                new_url = self.get_resource_url_by_id(item_id)
            if not new_url:
                # Neither a known note nor a known resource: keep the ID.
                new_url = item_id
            if match.group(2):
                # Preserve the "#anchor" part of the original link.
                new_url += match.group(2)
            return f"](/{new_url})"

        return re.sub(r"\]\(:/([a-f0-9]{32})(#.*?)?\)", replacement, note.body)

    def get_note_url_by_id(self, note_id: str) -> Optional[str]:
        """Return a note's relative URL by its ID, or None if it is unknown."""
        note = self.note_lookup_dict.get(note_id)
        if not note:
            return None
        return note.get_url()

    def get_resource_url_by_id(self, resource_id: str) -> Optional[str]:
        """
        Return a resource's relative URL by its ID, or None if it is unknown.

        As a side effect, a known resource is recorded as used.
        """
        resource = self.resources.get(resource_id)
        if not resource:
            return None
        # Add the resource to the set of used resources, so we can only copy
        # the resources that are used.
        self.used_resources.add(resource_id)
        return "resources/" + resource_id + resource.derived_ext

    def copy_resources(self):
        """Copy all the used resources to the output directory."""
        for resource_id in self.used_resources:
            resource = self.resources[resource_id]
            source_name = f"{resource_id}.{resource.extension}"
            print(source_name)
            # The copy is renamed to the extension derived from the mime
            # type, which is what the rewritten links point at.
            copy(
                self.joplin_dir / "resources" / source_name,
                self.static_dir / f"{resource_id}{resource.derived_ext}",
            )

    def read_data(self):
        """
        Read the data from the Joplin database.

        Fills `self.folders`, `self.resources`, `self.notes` and
        `self.note_lookup_dict` from the `folders`, `tags`, `note_tags`,
        `resources` and `notes` tables.
        """
        conn = sqlite3.connect(self.joplin_dir / "database.sqlite")
        # Close the connection even if one of the queries below raises.
        try:
            c = conn.cursor()

            # Only keep the folders that are meant for the blog.
            c.execute("""SELECT id, title, parent_id, icon FROM folders;""")
            self.folders = {}
            for folder_id, title, parent_id, icon in c.fetchall():
                # The icon column holds JSON; only its "emoji" key is used.
                emoji = json.loads(icon).get("emoji", "") if icon else ""
                folder = Folder(folder_id, parent_id, title, emoji)
                if folder.is_for_blog():
                    self.folders[folder_id] = folder

            # Get the tags by ID.
            c.execute("""SELECT id, title FROM tags;""")
            tags = {tag_id: title for tag_id, title in c.fetchall()}

            # Get the tag IDs for each note ID.
            c.execute("""SELECT note_id, tag_id FROM note_tags;""")
            note_tags = defaultdict(list)
            for note_id, tag_id in c.fetchall():
                note_tags[note_id].append(tags[tag_id])

            c.execute("""SELECT id, title, mime, file_extension FROM resources;""")
            self.resources = {
                resource_id: Resource(
                    title=title,
                    extension=ext,
                    mimetype=mime,
                )
                for resource_id, title, mime, ext in c.fetchall()
            }

            c.execute("""SELECT id, parent_id, title, body, updated_time FROM notes;""")
            for note_id, parent_id, title, body, updated_time in c.fetchall():
                if parent_id not in self.folders:
                    # This note is in a private folder, continue.
                    continue

                note = Note(
                    note_id,
                    self.folders[parent_id],
                    title,
                    body,
                    # The stored value is divided by 1000 to get seconds.
                    datetime.fromtimestamp(updated_time / 1000),
                    tags=note_tags[note_id],
                )
                # NOTE(review): notes for which is_for_blog() is true are
                # skipped here, which reads as inverted relative to the
                # method's name. Confirm the intended filter before changing
                # it.
                if note.is_for_blog():
                    continue

                self.notes[note.folder.id].append(note)
                self.note_lookup_dict[note.id] = note
        finally:
            conn.close()

    def write_summary(self):
        """Write the SUMMARY.md that mdBook needs."""
        # The note tree is a list of notes with their parents:
        # [
        #     [parent1, parent2, note1]
        #     [parent1, parent3, note2]
        # ]
        # Then, we sort these by alphabetical order, and we're done.
        note_tree: List[List[Union[Folder, Note]]] = []
        introduction: Optional[Note] = None  # The "introduction" note.
        folders: List[Folder] = []

        for note_list in self.notes.values():
            for note in note_list:
                if note.folder.title == "Welcome":
                    introduction = note
                    continue

                # Walk up from the note through its folder and that folder's
                # known ancestors, prepending each one.
                note_item = [note]
                item: Union[Folder, Note] = note
                while True:
                    if isinstance(item, Note):
                        item = item.folder
                    elif isinstance(item, Folder):
                        item = self.folders.get(item.parent_id)
                    if not item:
                        break
                    note_item.insert(0, item)

                # Append the folders to the list if they weren't there before,
                # as that's the only way this algorithm can generate headlines.
                if folders != note_item[:-1]:
                    folders = note_item[:-1]
                    # Append all the parent folders of the current folder, as
                    # otherwise folders without a direct descendant note
                    # wouldn't show up.
                    # This will lead to duplicates, but we'll deduplicate later.
                    for x in range(1, len(folders) + 1):
                        note_tree.append(folders[:x])

                note_tree.append(note_item)

        note_tree.sort()

        # Generate the summary file.
        items = []
        last_list = None
        for note_list in note_tree:
            if last_list == note_list:
                # Remove duplicates from above here.
                continue
            last_list = note_list

            level = len(note_list)
            if isinstance(note_list[-1], Folder):
                # The last item in the list is a folder, which means this is
                # a header.
                items.append(note_list[-1].get_summary_line(level))
            else:
                # This is a regular note.
                note = note_list[-1]
                print(f"Exporting {note.title}...")
                items.append(note.get_summary_line(level))

        # Explicit UTF-8, like the note files: titles and folder icons may
        # contain characters the platform's default encoding cannot represent.
        with (self.content_dir / "SUMMARY.md").open("w", encoding="utf-8") as outfile:
            outfile.write("# Summary\n\n")
            # Special-case the introduction.
            if introduction:
                outfile.write(introduction.get_summary_line(0) + "\n")
            print("\n".join(items))
            outfile.write("\n".join(items))

    def export(self):
        """Export all the notes to a static site."""
        self.read_data()

        folder_list = sorted(self.folders.values())

        self.clean_content_dir()

        for folder in folder_list:
            # (title, url) pairs for the folder's index page.
            contents = []
            folder_dir = self.content_dir / folder.get_url()
            folder_dir.mkdir(parents=True, exist_ok=True)
            for note in sorted(self.notes[folder.id], key=lambda n: n.title):
                print(f"Exporting {folder.title} - {note.title}...")
                contents.append((note.title, f"{note.get_url()}.html"))
                with (self.content_dir / (note.get_url() + ".md")).open(
                    mode="w", encoding="utf-8"
                ) as outfile:
                    outfile.write(
                        f"""# {note.title}
{self.resolve_note_links(note)}
* * *
<p style="font-size:80%; font-style: italic">
Last updated on {note.updated_time:%B %d, %Y}. For any questions/feedback,
email me at <a href="mailto:hi@stavros.io">hi@stavros.io</a>.
</p>
"""
                    )

            # Explicit UTF-8, consistent with the note files written above.
            with (folder_dir / "index.md").open(mode="w", encoding="utf-8") as outfile:
                contents_list = "\n1. ".join(
                    f"[{title}](../../{url})" for title, url in contents
                )
                outfile.write(
                    f"""# Contents
Click on a link in the list below to go to that page:
1. {contents_list}
"""
                )

        self.write_summary()
        self.copy_resources()
2020-11-20 13:22:00 -05:00
if __name__ == "__main__":
    # Script entry point: announce the run, then export everything.
    print("Exporting Joplin database...")
    exporter = JoplinExporter()
    exporter.export()