193 lines
5.9 KiB
Python
Executable File
193 lines
5.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Rename files like `foo/bar.html` to `foo/bar/index.html` for prettier URLs.
|
|
"""
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Dict
|
|
from typing import List
|
|
|
|
|
|
def get_safe_path(root: Path, candidate: Path) -> Path:
    """
    Return a path equivalent to `candidate` that is guaranteed to live under `root`.

    If `candidate` is already under `root` it is returned unchanged. Otherwise the
    leading components that `candidate` shares with `root` are dropped and whatever
    remains is re-anchored beneath `root`. A `candidate` that is itself an ancestor
    of `root` therefore maps to `root`.

    For illustration, some inputs and outputs (POSIX paths):

    >>> str(get_safe_path(Path("/var/www/mydocs"), Path("/var/www/mydocs/foo")))
    '/var/www/mydocs/foo'

    >>> str(get_safe_path(Path("/var/www/mydocs"), Path("/var/www/foo")))
    '/var/www/mydocs/foo'

    >>> str(get_safe_path(Path("/var/www/mydocs"), Path("/foo")))
    '/var/www/mydocs/foo'

    Raises `ValueError` if either path is not absolute.
    """
    # The parentheses matter: `not a and b` parses as `(not a) and b`, which would
    # let a relative candidate (or two relative paths) slip through.
    if not (root.is_absolute() and candidate.is_absolute()):
        raise ValueError("Both paths must be absolute")

    try:
        candidate.relative_to(root)
    except ValueError:
        # Not under the root; fall through and re-anchor it.
        pass
    else:
        # If the candidate is under the root, we're done.
        return candidate

    # Count how many leading components the two paths have in common; that is the
    # point of divergence from the root.
    shared = 0
    for root_part, candidate_part in zip(root.parts, candidate.parts):
        if root_part != candidate_part:
            break
        shared += 1

    # Everything past the shared prefix gets tacked onto the root. When the
    # candidate is an ancestor of the root nothing is left over and the result is
    # the root itself.
    return root.joinpath(*candidate.parts[shared:])
|
|
|
|
|
|
def convert_relative_to_absolute(path: Path):
    """
    Convert relative links in files to absolute ones.

    Sometimes files contain links relative to their location, e.g.
    `public/test/foo.html` might contain a link to `bar/baz.html`, which makes it hard
    for us to match `public/test/bar/baz.html` that way. This function converts that
    into its full path (`public/test/bar/baz.html`) so we can match it later.

    Every `*.html` file under `path` is rewritten in place. Only double-quoted
    `href`, `src` and `root` attribute values are considered. Values containing
    `://`, the bare value `/`, and values starting with `#` or `mailto` are left
    untouched. Values starting with `/` are resolved against `path`; all others are
    resolved against the directory of the file that contains them. The result is
    forced under `path` with `get_safe_path` and written back as a `/`-prefixed
    path relative to `path`.

    Calls `sys.exit` with a message if a link target does not exist on disk.
    """
    # Compiled once; the same pattern is applied to every file.
    link_pattern = re.compile(
        r"""
        ((?:href|src|root)\s*=\s*")  # Various tags like href="
        ([^\"]+?)                    # Anything non-quote, non-greedily.
        (
            (?:\#[^\"]*|)            # Either an anchor or nothing.
            ")                       # Ending quote.
        """,
        flags=re.VERBOSE,
    )

    def replace_wrapper(filename: Path):
        """Build a `re.sub` callback bound to the file currently being rewritten."""

        def replace_link(match: re.Match) -> str:
            prefix, text, suffix = match.groups()
            if "://" in text or text == "/" or text.startswith(("#", "mailto")):
                # Not a local file reference, return it unchanged.
                return f"{prefix}{text}{suffix}"

            if text.startswith("/"):
                # Site-absolute link: resolve against the site root.
                filepath = (path / text[1:]).resolve()
            else:
                # Relative link: resolve against the containing file's directory.
                filepath = (filename.parent / text).resolve()

            filepath = get_safe_path(path, filepath)
            if not filepath.exists():
                # The target is missing on disk: abort rather than emit a dead link.
                sys.exit(f"Possible broken link in {filename}: {text}")

            replacement_path = filepath.relative_to(path.absolute())
            return f"{prefix}/{replacement_path}{suffix}"

        return replace_link

    for filename in path.glob("**/*.html"):
        print(f"Converting relative links in {filename}...")
        with filename.open("r+") as f:
            contents = f.read()
            # Rewrite the text *before* touching the file: replace_link() may call
            # sys.exit(), and truncating first would leave an empty file behind.
            contents = link_pattern.sub(replace_wrapper(filename), contents)
            f.truncate(0)
            f.seek(0)
            f.write(contents)
|
|
|
|
|
|
def replace_links(path: Path, replacements: Dict[str, str]):
    """
    Rewrite links inside every `*.html` file found under `path`.

    `path` - The root of the repo.
    `replacements` - A dictionary of replacement URLs in the form of
                     `{"dir/file.html": "dir/file/"}`. Each key is replaced by its
                     value wherever it occurs in a file's text.

    After the replacements, any double-quoted `href`, `src` or `root` value that
    starts with one or more `../` segments (followed by a non-dot character) has
    those segments collapsed into a single leading `/`.
    """
    # Matches e.g. `href="../../x` and captures the attribute prefix, the run of
    # `../` segments, and the first character after them.
    parent_refs = re.compile(r'((?:href|src|root)\s*=\s*")((?:\.\./)+)([^\.])')

    for html_file in path.glob("**/*.html"):
        print(f"Converting links in {html_file}...")
        with html_file.open("r+") as handle:
            text = handle.read()
            handle.truncate(0)
            handle.seek(0)

            for old_url, new_url in replacements.items():
                text = text.replace(old_url, new_url)

            # Convert relative links to absolute.
            handle.write(parent_refs.sub(r"\1/\3", text))
|
|
|
|
|
|
def fix_search_links(path: Path):
    """
    Point the search index at the renamed pages.

    Reads `searchindex.json` from `path`, replaces every occurrence of `.html`
    in each entry of its `doc_urls` list with `/`, and writes the result back to
    `searchindex.json`. The same JSON is also written to `searchindex.js`, wrapped
    in an `Object.assign(window.search, ...)` call.
    """
    json_file = path / "searchindex.json"
    js_file = path / "searchindex.js"

    search_index = json.loads(json_file.read_text())

    # NOTE: this is a plain substring replacement, so it also affects URLs that
    # carry an anchor after the suffix (`page.html#x` becomes `page/#x`).
    search_index["doc_urls"] = [
        doc_url.replace(".html", "/") for doc_url in search_index["doc_urls"]
    ]

    serialized = json.dumps(search_index).strip()

    json_file.write_text(serialized)
    js_file.write_text(f"Object.assign(window.search, {serialized});")
|
|
|
|
|
|
def main(path: Path):
    """
    Convert the site rooted at `path` to directory-style ("pretty") URLs.

    Steps, in order:

    1. Resolve `path` and rewrite relative links in every HTML file to
       site-absolute ones (`convert_relative_to_absolute`).
    2. Move every `*.html` file that is not already named `index.html` to
       `<stem>/index.html` next to its old location.
    3. Rewrite links to the moved files (`replace_links`).
    4. Rewrite the search index (`fix_search_links`).
    """
    path = path.resolve()
    convert_relative_to_absolute(path)

    replacements: Dict[str, str] = {}
    # Snapshot the listing before touching anything: the loop below creates
    # directories and moves files inside the very tree being globbed, and glob()
    # is a lazy generator.
    for p in list(path.glob("**/*.html")):
        # NOTE(review): an earlier guard here compared `str(p.parent)` to "." in
        # order to skip top-level files. Because `path` is resolved to an absolute
        # path above, that comparison could never be true, so top-level pages have
        # always been renamed as well. That matches fix_search_links(), which
        # rewrites every `.html` URL. Confirm this is the intended behaviour.

        if p.name == "index.html":
            # Already served at its directory URL; nothing to rename.
            continue

        print(f"Renaming {p}...")

        dir_path = p.parent / p.stem
        dir_path.mkdir(parents=True, exist_ok=True)

        new_path = dir_path / "index.html"
        p.rename(new_path)
        # Construct the dictionary of replacements that have been done.
        replacements[str(p.relative_to(path))] = f"{new_path.parent.relative_to(path)}/"

    replace_links(path, replacements)

    print("Rewriting search links...")
    fix_search_links(path)
|
|
|
|
|
|
if __name__ == "__main__":
    # Fail with a readable message instead of an IndexError traceback when the
    # site root argument is missing. Extra arguments are still ignored.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <site-root>")
    main(Path(sys.argv[1]))
|