joplin-mdbooks-website/move_html_to_dir
Stavros Korokithakis fa44220edf
Fix search links
2022-01-18 18:17:13 +02:00

193 lines
5.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Rename files like `foo/bar.html` to `foo/bar/index.html` for prettier URLs.
"""
import json
import re
import sys
from pathlib import Path
from typing import Dict
from typing import List
def get_safe_path(root: Path, candidate: Path) -> Path:
    """
    Return a path that is guaranteed to live under `root`.

    If `candidate` is already under `root`, it is returned unchanged. Otherwise
    the leading components that `candidate` shares with `root` are dropped and
    whatever remains (from the first point of divergence onwards) is re-rooted
    under `root`. If `candidate` is an ancestor of `root`, nothing remains and
    `root` itself is returned.

    Both arguments must be absolute paths; the function compares path
    components only and never touches the filesystem.

    For illustration, some inputs and outputs:

    >>> get_safe_path(Path("/var/www/mydocs"), Path("/var/www/mydocs/foo"))
    PosixPath('/var/www/mydocs/foo')
    >>> get_safe_path(Path("/var/www/mydocs"), Path("/var/www/foo"))
    PosixPath('/var/www/mydocs/foo')
    >>> get_safe_path(Path("/var/www/mydocs"), Path("/foo"))
    PosixPath('/var/www/mydocs/foo')

    Raises:
        ValueError: If either `root` or `candidate` is not absolute.
    """
    # Parenthesised on purpose: `not a and b` would only reject the single
    # combination "relative root, absolute candidate".
    if not (root.is_absolute() and candidate.is_absolute()):
        raise ValueError("Both paths must be absolute")

    try:
        # If the candidate is under the root, we're done.
        candidate.relative_to(root)
    except ValueError:
        pass
    else:
        return candidate

    # Otherwise, look for the first point of divergence from the root.
    # `parts` starts out empty so that it is always bound, including when the
    # candidate is a strict ancestor of the root (nothing to append).
    parts: List[str] = []
    for counter, part in enumerate(root.parts):
        if counter >= len(candidate.parts):
            # Candidate ran out of components before diverging.
            break
        if part != candidate.parts[counter]:
            # Everything past that is what we need.
            parts = list(candidate.parts[counter:])
            break

    # Tack the discovered parts onto the root.
    return root.joinpath(*parts)
def convert_relative_to_absolute(path: Path):
    """
    Convert relative links in files to absolute ones.

    Sometimes files contain links relative to their location, e.g.
    `public/test/foo.html` might contain a link to `bar/baz.html`, which makes it hard
    for us to match `public/test/bar/baz.html` that way. This function rewrites every
    `href="..."`, `src="..."` and `root="..."` value in every `*.html` file under
    `path` into a link relative to `path` with a leading slash
    (`/test/bar/baz.html`) so we can match it later.

    Links containing `://`, the bare `/` link, pure anchors (`#...`) and `mailto`
    links are left untouched.

    `path` - The root directory of the generated site. It must be absolute, because
    link targets are resolved and then confined to it with `get_safe_path`.

    The process exits (via `sys.exit`) with a message naming the file and the link as
    soon as a link target does not exist on disk. Files are rewritten in place, but
    only after all of their links have been converted successfully, so a broken link
    never leaves a half-written or empty file behind.
    """
    link_pattern = re.compile(
        r"""
        ((?:href|src|root)\s*=\s*") # Various tags like href="
        ([^\"]+?) # Anything non-quote, non-greedily.
        (
        (?:\#[^\"]*|) # Either an anchor or nothing.
        ") # Ending quote.
        """,
        flags=re.VERBOSE,
    )

    def replace_wrapper(filename: Path):
        """Build the substitution callback for links found inside `filename`."""

        def replace_link(match: re.Match) -> str:
            attribute, text, suffix = match.groups()
            if (
                "://" in text
                or text == "/"
                or text.startswith("#")
                or text.startswith("mailto")
            ):
                # Not a local file link, return it unchanged.
                return f"{attribute}{text}{suffix}"

            if text.startswith("/"):
                # Site-absolute link: interpret it relative to the site root.
                filepath = (path / text[1:]).resolve()
            else:
                # Relative link: interpret it relative to the containing file.
                filepath = (filename.parent / text).resolve()

            # Links that climb out of the site root are folded back into it.
            filepath = get_safe_path(path, filepath)
            if not filepath.exists():
                # The target is missing; abort instead of emitting a dead link.
                sys.exit(f"Possible broken link in {filename}: {text}")

            replacement_path = filepath.relative_to(path.absolute())
            return f"{attribute}/{replacement_path}{suffix}"

        return replace_link

    for filename in path.glob("**/*.html"):
        print(f"Converting relative links in {filename}...")
        with filename.open("r+") as f:
            contents = f.read()
            # Convert the links BEFORE truncating: the callback may abort the
            # program, and the original file must survive that intact.
            contents = link_pattern.sub(replace_wrapper(filename), contents)
            f.seek(0)
            f.truncate()
            f.write(contents)
def replace_links(path: Path, replacements: Dict[str, str]):
    """
    Rewrite links in every HTML file below `path`.

    `path` - The root of the repo.
    `replacements` - A dictionary of replacement URLs in the form of
    `{"dir/file.html": "dir/file/"}`. Each key is replaced by its value
    wherever it occurs in a file, as a plain substring.

    After the substitutions, any `href`/`src`/`root` value that starts with one
    or more `../` segments (and is not followed by a dot) has those segments
    collapsed into a single leading `/`.
    """
    parent_dir_prefix = re.compile(
        r'((?:href|src|root)\s*=\s*")((?:\.\./)+)([^\.])'
    )

    for html_file in path.glob("**/*.html"):
        print(f"Converting links in {html_file}...")
        with html_file.open("r+") as handle:
            text = handle.read()

            # Apply the rename map first...
            for old_url, new_url in replacements.items():
                text = text.replace(old_url, new_url)
            # ...then convert relative links to absolute.
            text = parent_dir_prefix.sub(r"\1/\3", text)

            # Overwrite the file from the start with the new contents.
            handle.seek(0)
            handle.truncate(0)
            handle.write(text)
def _prettify_doc_url(url: str) -> str:
    """Map `dir/page.html#anchor` to `dir/page/#anchor`, leaving index pages alone."""
    page, separator, anchor = url.partition("#")
    is_index = page == "index.html" or page.endswith("/index.html")
    if page.endswith(".html") and not is_index:
        page = page[: -len(".html")] + "/"
    return f"{page}{separator}{anchor}"


def fix_search_links(path: Path):
    """
    Rewrite the search index so its URLs match the renamed pages.

    Reads `searchindex.json` from `path`, replaces the trailing `.html` of every
    entry in its `doc_urls` list with `/` (`dir/page.html#a` becomes `dir/page/#a`),
    and writes the result back to `searchindex.json` as well as to `searchindex.js`
    (wrapped in `Object.assign(window.search, ...);`).

    Only the page part of each URL is touched; anything after `#` is preserved
    verbatim. URLs that point at an `index.html` are left unchanged, because those
    files are not moved by the renaming step in this script and `dir/index/` would
    therefore not exist.

    `path` - The directory containing `searchindex.json`.
    """
    with (path / "searchindex.json").open() as infile:
        index = json.load(infile)

    index["doc_urls"] = [_prettify_doc_url(url) for url in index["doc_urls"]]

    index_json = json.dumps(index).strip()
    with (path / "searchindex.json").open("w") as outfile:
        outfile.write(index_json)
    with (path / "searchindex.js").open("w") as outfile:
        outfile.write(f"Object.assign(window.search, {index_json});")
def main(path: Path):
    """
    Turn every `dir/page.html` under `path` into `dir/page/index.html`.

    The steps, in order:

    1. Resolve `path` and convert relative links in all HTML files into
       root-relative ones, so they can be matched by plain string replacement.
    2. Move each HTML file that is not already called `index.html` into a
       directory named after it, recording the old and new relative URLs.
    3. Apply the recorded URL replacements to every HTML file.
    4. Rewrite the search index to use the new URLs.

    `path` - The root directory of the generated site.
    """
    path = path.resolve()
    convert_relative_to_absolute(path)

    replacements: Dict[str, str] = {}
    # Snapshot the file list before touching anything: the loop below creates
    # directories and moves files inside the very tree being globbed, and a lazy
    # directory walk is not guaranteed to behave well under such changes.
    for p in list(path.glob("**/*.html")):
        # NOTE(review): `path` was resolved above, so `p.parent` is absolute and can
        # never equal "." - this check is currently a no-op and top-level files ARE
        # converted. Kept as-is to preserve behaviour; confirm the intent before
        # changing it (the search index rewriting assumes the current behaviour).
        if str(p.parent) == ".":
            # Don't convert top-level files.
            continue
        if p.name == "index.html":
            # Already a directory index; there is nothing to move.
            continue

        print(f"Renaming {p}...")
        dir_path = p.parent / p.stem
        dir_path.mkdir(parents=True, exist_ok=True)
        new_path = dir_path / "index.html"
        p.rename(new_path)

        # Construct the dictionary of replacements that have been done.
        replacements[str(p.relative_to(path))] = f"{new_path.parent.relative_to(path)}/"

    replace_links(path, replacements)

    print("Rewriting search links...")
    fix_search_links(path)
if __name__ == "__main__":
    # The single required argument is the root directory of the generated site.
    # Fail with a readable usage message instead of a bare IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <html-root-directory>")
    main(Path(sys.argv[1]))