#!/usr/bin/env python3
"""
Rename files like `foo/bar.html` to `foo/bar/index.html` for prettier URLs.
"""
import json
import re
import sys
from pathlib import Path
from typing import Dict
from typing import List

# Matches `href="..."`, `src="..."` and `root="..."` attributes, capturing the
# attribute prefix, the link text and the (optional anchor +) closing quote.
_ATTRIBUTE_LINK_RE = re.compile(
    r"""
    ((?:href|src|root)\s*=\s*")  # Various tags like href="
    ([^\"]+?)  # Anything non-quote, non-greedily.
    (
        (?:\#[^\"]*|)  # Either an anchor or nothing.
    ")  # Ending quote.
    """,
    flags=re.VERBOSE,
)

# Matches attribute values that start with one or more `../` segments followed
# by a character that is not a dot.
_PARENT_LINK_RE = re.compile(r'((?:href|src|root)\s*=\s*")((?:\.\./)+)([^\.])')


def get_safe_path(root: Path, candidate: Path) -> Path:
    """
    Return a version of `candidate` that is guaranteed to live under `root`.

    If `candidate` is already under `root`, it is returned unchanged.
    Otherwise, the components of `candidate` starting at the first point where
    it diverges from `root` are re-anchored onto `root`.

    For illustration, with `root = Path("/var/www/mydocs")`:

        Path("/var/www/mydocs/foo")  ->  Path("/var/www/mydocs/foo")
        Path("/var/www/foo")         ->  Path("/var/www/mydocs/foo")
        Path("/foo")                 ->  Path("/var/www/mydocs/foo")
        Path("/var/www")             ->  Path("/var/www/mydocs")

    Raises `ValueError` if either path is not absolute.
    """
    # Parenthesised on purpose: `not a and b` would only reject the single
    # combination "relative root, absolute candidate".
    if not (root.is_absolute() and candidate.is_absolute()):
        raise ValueError("Both paths must be absolute")

    try:
        candidate.relative_to(root)
    except ValueError:
        pass
    else:
        # The candidate is under the root, we're done.
        return candidate

    # Otherwise, look for the first point of divergence from the root. If the
    # candidate runs out of components first (it is an ancestor of the root),
    # there is nothing to append.
    divergent: List[str] = []
    for index, part in enumerate(root.parts):
        if index >= len(candidate.parts):
            break
        if part != candidate.parts[index]:
            # Everything from here on is what we need.
            divergent = list(candidate.parts[index:])
            break

    # Tack the discovered parts onto the root.
    return root.joinpath(*divergent)


def convert_relative_to_absolute(path: Path):
    """
    Convert relative links in files to absolute ones.

    Sometimes files contain links relative to their location, e.g.
    `public/test/foo.html` might contain a link to `bar/baz.html`, which makes
    it hard for us to match `public/test/bar/baz.html` that way. This function
    converts that into its full path (`/test/bar/baz.html`, relative to `path`)
    so we can match it later.

    `path` - The root directory; every `*.html` file below it is rewritten in
             place.

    Exits the program (via `sys.exit`) if a link points to a file that does
    not exist. The file being processed is left untouched in that case.
    """

    def replace_wrapper(filename: Path):
        """Build the `re.sub` callback for links found in `filename`."""

        def replace_link(match: re.Match) -> str:
            attribute, text, suffix = match.groups()
            if (
                "://" in text
                or text == "/"
                or text.startswith(("#", "mailto"))
            ):
                # Not a local filename (external URL, site root, anchor or
                # mailto link), return it unchanged.
                return f"{attribute}{text}{suffix}"

            if text.startswith("/"):
                # Root-relative link: resolve against the site root.
                filepath = (path / text[1:]).resolve()
            else:
                # Relative link: resolve against the containing file.
                filepath = (filename.parent / text).resolve()

            # Links that climb above the root are re-anchored under it.
            filepath = get_safe_path(path, filepath)

            if not filepath.exists():
                # The target does not exist; abort instead of silently
                # emitting a broken link.
                sys.exit(f"Possible broken link in {filename}: {text}")

            replacement_path = filepath.relative_to(path.absolute())
            return f"{attribute}/{replacement_path}{suffix}"

        return replace_link

    for filename in path.glob("**/*.html"):
        print(f"Converting relative links in {filename}...")
        with filename.open("r+") as f:
            contents = f.read()
            # Compute the new contents *before* truncating: the callback may
            # exit the program, and the file must not be left empty.
            converted = _ATTRIBUTE_LINK_RE.sub(replace_wrapper(filename), contents)
            f.seek(0)
            f.truncate()
            f.write(converted)


def replace_links(path: Path, replacements: Dict[str, str]):
    """
    Convert links in files.

    `path` - The root of the repo.
    `replacements` - A dictionary of replacement URLs in the form of
                     `{"dir/file.html": "dir/file/"}`.

    Every `*.html` file below `path` is rewritten in place: each key of
    `replacements` is replaced by its value wherever it occurs in the file, and
    attribute values that begin with `../` segments are turned into
    root-relative ones.
    """
    for filename in path.glob("**/*.html"):
        print(f"Converting links in {filename}...")
        with filename.open("r+") as f:
            contents = f.read()

            for source, target in replacements.items():
                contents = contents.replace(source, target)

            # Convert relative links to absolute.
            contents = _PARENT_LINK_RE.sub(r"\1/\3", contents)

            # Only touch the file once the new contents are fully computed.
            f.seek(0)
            f.truncate()
            f.write(contents)


def fix_search_links(path: Path):
    """
    Rename search links to remove the ".html" suffix.

    Reads `searchindex.json` under `path`, replaces every ".html" in its
    `doc_urls` entries with "/", then writes the result back to
    `searchindex.json` and, wrapped in an `Object.assign(window.search, ...)`
    call, to `searchindex.js`.
    """
    json_path = path / "searchindex.json"
    with json_path.open() as infile:
        index = json.load(infile)

    # NOTE(review): this also turns "index.html" entries into "index/", even
    # though `main` never renames `index.html` files - confirm that is wanted.
    index["doc_urls"] = [url.replace(".html", "/") for url in index["doc_urls"]]

    index_json = json.dumps(index).strip()
    with json_path.open("w") as outfile:
        outfile.write(index_json)
    with (path / "searchindex.js").open("w") as outfile:
        outfile.write(f"Object.assign(window.search, {index_json});")


def main(path: Path):
    """
    Prettify the URLs of the HTML tree rooted at `path`.

    Makes links absolute, renames every `foo/bar.html` to
    `foo/bar/index.html`, rewrites links to the renamed files and finally
    updates the search index.
    """
    path = path.resolve()
    convert_relative_to_absolute(path)

    replacements: Dict[str, str] = {}
    # Materialise the listing first: files are renamed (and directories
    # created) inside the loop, which must not happen under a live iterator.
    for p in list(path.glob("**/*.html")):
        # NOTE(review): `path` was resolved above, so `p.parent` is absolute
        # and this comparison is never true - top-level files ARE converted.
        # Left as-is to preserve behavior; confirm the intent.
        if str(p.parent) == ".":
            # Don't convert top-level files.
            continue

        if p.name == "index.html":
            # Already in the `dir/index.html` form, nothing to do.
            continue

        print(f"Renaming {p}...")
        dir_path = p.parent / p.stem
        dir_path.mkdir(parents=True, exist_ok=True)
        new_path = dir_path / "index.html"
        p.rename(new_path)

        # Construct the dictionary of replacements that have been done.
        replacements[str(p.relative_to(path))] = f"{new_path.parent.relative_to(path)}/"

    replace_links(path, replacements)

    print("Rewriting search links...")
    fix_search_links(path)


if __name__ == "__main__":
    main(Path(sys.argv[1]))