joplin-mdbooks-website/move_html_to_dir

#!/usr/bin/env python3
"""
Rename files like `foo/bar.html` to `foo/bar/index.html` for prettier URLs.
"""
import json
import re
import sys
from pathlib import Path
from typing import Dict
from typing import List


def get_safe_path(root: Path, candidate: Path) -> Path:
    """
    Return `candidate` re-anchored under `root` if it is not already there.

    The comparison is purely lexical (path components only; the filesystem is not
    consulted). If `candidate` is under `root`, it is returned unchanged. Otherwise
    the leading components that `candidate` shares with `root` are dropped, and
    everything from the first point of divergence onwards is appended to `root`.
    If `candidate` is an ancestor of `root`, nothing is left to append and the
    result equals `root`.

    For illustration, some inputs and outputs (POSIX paths):
    >>> get_safe_path(Path("/var/www/mydocs"), Path("/var/www/mydocs/foo"))
    PosixPath('/var/www/mydocs/foo')

    >>> get_safe_path(Path("/var/www/mydocs"), Path("/var/www/foo"))
    PosixPath('/var/www/mydocs/foo')

    >>> get_safe_path(Path("/var/www/mydocs"), Path("/foo"))
    PosixPath('/var/www/mydocs/foo')

    Raises `ValueError` if either path is not absolute.
    """
    # Parenthesised on purpose: `not a and b` would only reject the single case
    # "relative root, absolute candidate".
    if not (root.is_absolute() and candidate.is_absolute()):
        raise ValueError("Both paths must be absolute")

    try:
        candidate.relative_to(root)
    except ValueError:
        pass
    else:
        # The candidate is under the root, we're done.
        return candidate

    # Look for the first component where the candidate diverges from the root.
    # If the shared prefix is exhausted without a mismatch, the candidate is an
    # ancestor of the root and there is nothing to carry over.
    divergence = len(candidate.parts)
    for index, (root_part, candidate_part) in enumerate(
        zip(root.parts, candidate.parts)
    ):
        if root_part != candidate_part:
            divergence = index
            break

    # Tack everything past the divergence point onto the root.
    return root.joinpath(*candidate.parts[divergence:])


def convert_relative_to_absolute(path: Path):
    """
    Convert relative links in files to absolute ones.

    Sometimes files contain links relative to their location, e.g.
    `public/test/foo.html` might contain a link to `bar/baz.html`, which makes it hard
    for us to match `public/test/bar/baz.html` that way. This function rewrites every
    `href`/`src`/`root` attribute in the HTML files under `path` so that it holds the
    target's path relative to `path`, with a leading slash (`/test/bar/baz.html`).

    Links containing `://`, the bare `/`, anchors (`#...`) and links starting with
    `mailto` are left untouched.

    `path` - The root directory of the generated site.

    Exits the process (via `sys.exit`) if a link points to a file that does not
    exist. No file is modified unless all of its links were converted.
    """
    # Link targets are resolved below, so compare them against a resolved root.
    root = path.resolve()

    link_pattern = re.compile(
        r"""
        ((?:href|src|root)\s*=\s*")  # Various tags like href="
        ([^\"]+?)                    # Anything non-quote, non-greedily.
        (
        (?:\#[^\"]*|)                # Either an anchor or nothing.
        ")                           # Ending quote.
        """,
        flags=re.VERBOSE,
    )

    def replace_wrapper(filename: Path):
        """Build a substitution callback that resolves links relative to `filename`."""

        def replace_link(match: re.Match) -> str:
            prefix, text, suffix = match.groups()
            if "://" in text or text == "/" or text.startswith(("#", "mailto")):
                # Not a link to a file in the site, return it unchanged.
                return f"{prefix}{text}{suffix}"

            if text.startswith("/"):
                # Already relative to the site root.
                filepath = (root / text[1:]).resolve()
            else:
                # Relative to the directory of the file containing the link.
                filepath = (filename.parent / text).resolve()

            # Links that climb out of the site are re-anchored under the root.
            filepath = get_safe_path(root, filepath)
            if not filepath.exists():
                # The target is missing; abort rather than emit a dead link.
                sys.exit(f"Possible broken link in {filename}: {text}")

            replacement_path = filepath.relative_to(root)
            return f"{prefix}/{replacement_path}{suffix}"

        return replace_link

    for filename in path.glob("**/*.html"):
        print(f"Converting relative links in {filename}...")
        with filename.open("r+") as f:
            contents = f.read()
            # Compute the new contents BEFORE touching the file: the callback can
            # abort the whole run, and truncating first would leave the file empty.
            converted = link_pattern.sub(replace_wrapper(filename), contents)
            f.seek(0)
            f.truncate()
            f.write(converted)


def replace_links(path: Path, replacements: Dict[str, str]):
    """
    Rewrite links in every HTML file found under `path`.

    Each file is processed in two passes: first every key of `replacements` is
    substituted by its value wherever it occurs in the text, then any
    `href`/`src`/`root` attribute whose value starts with one or more `../`
    segments has that run collapsed into a single leading `/`.

    `path`         - The root of the repo.
    `replacements` - A dictionary of replacement URLs in the form of
                     `{"dir/file.html": "dir/file/"}`.
    """
    # Group 1: attribute up to the opening quote, group 2: the "../" run,
    # group 3: the first character after it (anything but a dot).
    parent_prefix = re.compile(r'((?:href|src|root)\s*=\s*")((?:\.\./)+)([^\.])')

    for html_file in path.glob("**/*.html"):
        print(f"Converting links in {html_file}...")
        with html_file.open("r+") as handle:
            text = handle.read()
            # Empty the file and rewind so the rewritten text replaces it.
            handle.truncate(0)
            handle.seek(0)

            for old_link, new_link in replacements.items():
                text = text.replace(old_link, new_link)

            # Convert relative links to absolute.
            text = parent_prefix.sub(r"\1/\3", text)

            handle.write(text)


def _prettify_doc_url(url: str) -> str:
    """Turn `dir/page.html#anchor` into `dir/page/#anchor`; keep other URLs as-is."""
    page, hash_sign, anchor = url.partition("#")
    if not page.endswith(".html"):
        return url
    if page.rsplit("/", 1)[-1] == "index.html":
        # `index.html` files are never moved, so their URLs are still valid.
        return url
    return f"{page[: -len('.html')]}/{hash_sign}{anchor}"


def fix_search_links(path: Path):
    """
    Rename search links to remove the ".html" suffix.

    Reads `searchindex.json` under `path`, rewrites each entry of its `doc_urls`
    list so that a page ending in `.html` points at the matching directory URL
    instead (`dir/page.html#a` -> `dir/page/#a`), then writes the result back to
    `searchindex.json` and regenerates `searchindex.js` from it.

    Only the ".html" that ends the page part of a URL (before any `#anchor`) is
    rewritten, and URLs of `index.html` pages are left alone.

    `path` - The directory containing `searchindex.json`.
    """
    index_path = path / "searchindex.json"

    # JSON text is UTF-8; don't depend on the platform's locale encoding.
    with index_path.open(encoding="utf-8") as infile:
        index = json.load(infile)

    index["doc_urls"] = [_prettify_doc_url(url) for url in index["doc_urls"]]
    index_json = json.dumps(index)

    with index_path.open("w", encoding="utf-8") as outfile:
        outfile.write(index_json)

    with (path / "searchindex.js").open("w", encoding="utf-8") as outfile:
        outfile.write(f"Object.assign(window.search, {index_json});")


def main(path: Path):
    """
    Move every `dir/page.html` under `path` to `dir/page/index.html`.

    Steps, in order:
    1. Convert relative links in all HTML files to root-absolute ones.
    2. Move each HTML file not named `index.html` into a directory named after
       its stem, recording the old and new relative URLs.
    3. Rewrite links in all HTML files according to the recorded moves.
    4. Rewrite the search index to use the new URLs.

    `path` - The root directory of the generated site.
    """
    path = path.resolve()
    convert_relative_to_absolute(path)

    replacements: Dict[str, str] = {}
    # Snapshot the matches before the loop: it creates directories and moves files
    # inside the very tree the (lazy) glob would otherwise still be walking.
    for p in list(path.glob("**/*.html")):
        if str(p.parent) == ".":
            # Don't convert top-level files.
            # NOTE(review): `path` was resolved above, so `p.parent` is absolute and
            # this comparison never matches; top-level files are renamed like any
            # other. Confirm the intended behaviour before changing this check.
            continue

        if p.name == "index.html":
            # Already served at its directory's URL; leave it in place.
            continue

        print(f"Renaming {p}...")

        dir_path = p.parent / p.stem
        dir_path.mkdir(parents=True, exist_ok=True)

        new_path = dir_path / "index.html"
        p.rename(new_path)
        # Construct the dictionary of replacements that have been done.
        replacements[str(p.relative_to(path))] = f"{new_path.parent.relative_to(path)}/"

    replace_links(path, replacements)

    print("Rewriting search links...")
    fix_search_links(path)


if __name__ == "__main__":
    # Expect the site root as the first argument; fail with a usage hint rather
    # than an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <site-root>")
    main(Path(sys.argv[1]))