joplin-mdbooks-website/move_html_to_dir

172 lines
5.3 KiB
Plaintext
Raw Normal View History

2021-11-20 10:22:59 -05:00
#!/usr/bin/env python3
"""
Rename files like `foo/bar.html` to `foo/bar/index.html` for prettier URLs.
"""
import re
import sys
from pathlib import Path
from typing import Dict
2021-12-14 19:41:34 -05:00
from typing import List
def get_safe_path(root: Path, candidate: Path) -> Path:
    """
    Return a path that is guaranteed to be located under `root`.

    If `candidate` is already under `root`, it is returned unchanged. Otherwise, the
    components of `candidate` that follow its common prefix with `root` are
    re-anchored under `root`. If `candidate` is an ancestor of `root`, `root` itself
    is returned.

    For illustration, some inputs and outputs (as paths):

        root="/var/www/mydocs", candidate="/var/www/mydocs/foo" -> "/var/www/mydocs/foo"
        root="/var/www/mydocs", candidate="/var/www/foo"        -> "/var/www/mydocs/foo"
        root="/var/www/mydocs", candidate="/foo"                -> "/var/www/mydocs/foo"

    `root` - The absolute directory the result must live under.
    `candidate` - The absolute path to check.

    Raises `ValueError` if either path is not absolute.
    """
    # Both paths have to be absolute for a component-wise comparison to make sense.
    # (The negation must cover the whole conjunction, otherwise a relative candidate
    # slips through whenever the root is absolute.)
    if not (root.is_absolute() and candidate.is_absolute()):
        raise ValueError("Both paths must be absolute")

    try:
        candidate.relative_to(root)
    except ValueError:
        # Not under the root, fall through and re-anchor it.
        pass
    else:
        # If the candidate is under the root, we're done.
        return candidate

    # Count how many leading components the two paths have in common.
    shared = 0
    for root_part, candidate_part in zip(root.parts, candidate.parts):
        if root_part != candidate_part:
            break
        shared += 1

    # Everything past the point of divergence gets tacked onto the root. If the
    # candidate ran out of components first, nothing is left and we return the root.
    return root.joinpath(*candidate.parts[shared:])
2021-11-20 10:22:59 -05:00
2021-12-14 17:27:35 -05:00
def convert_relative_to_absolute(path: Path):
    """
    Convert relative links in files to absolute ones.

    Sometimes files contain links relative to their location, e.g.
    `public/test/foo.html` might contain a link to `bar/baz.html`, which makes it hard
    for us to match `public/test/bar/baz.html` that way. This function converts that
    into its full path (`/test/bar/baz.html`, relative to the site root) so we can
    match it later.

    `path` - The root directory of the site. Every `*.html` file below it is
    rewritten in place.

    Links containing `://`, the bare `/` link, pure anchors (`#...`) and `mailto`
    links are left untouched. The program exits (via `sys.exit`) on the first link
    whose target does not exist on disk.
    """
    # Link targets are resolved to absolute paths below, so they have to be compared
    # against a resolved root as well.
    root = path.resolve()

    link_pattern = re.compile(
        r"""
        ((?:href|src|root)\s*=\s*")  # Various tags like href="
        ([^\"]+?)                    # Anything non-quote, non-greedily.
        (
            (?:\#[^\"]*|)            # Either an anchor or nothing.
        ")                           # Ending quote.
        """,
        flags=re.VERBOSE,
    )

    def replace_wrapper(filename: Path):
        """Return a `re.sub` callback that rewrites links found in `filename`."""

        def replace_link(match: re.Match) -> str:
            attribute, text, suffix = match.groups()

            if (
                "://" in text
                or text == "/"
                or text.startswith("#")
                or text.startswith("mailto")
            ):
                # Not a valid filename, return it.
                return f"{attribute}{text}{suffix}"

            if text.startswith("/"):
                # Already site-absolute, anchor it at the root.
                filepath = (root / text[1:]).resolve()
            else:
                # Relative to the directory of the file containing the link.
                filepath = (filename.parent / text).resolve()

            # Links that climb above the root get re-anchored under it.
            filepath = get_safe_path(root, filepath)

            if not filepath.exists():
                sys.exit(f"Possible broken link in {filename}: {text}")

            # URLs always use forward slashes, regardless of the platform.
            replacement_path = filepath.relative_to(root).as_posix()
            return f"{attribute}/{replacement_path}{suffix}"

        return replace_link

    for filename in path.glob("**/*.html"):
        print(f"Converting relative links in {filename}...")
        with filename.open("r+") as f:
            contents = f.read()
            # Do the rewrite *before* truncating: a broken link aborts the program
            # from inside the callback, and the file must not be left empty then.
            contents = link_pattern.sub(replace_wrapper(filename), contents)
            f.seek(0)
            f.truncate()
            f.write(contents)
2021-11-20 10:22:59 -05:00
def replace_links(path: Path, replacements: Dict[str, str]):
    """
    Rewrite links in every HTML file below `path`.

    `path` - The root of the repo.
    `replacements` - A dictionary mapping old URLs to new ones, in the form of
    `{"dir/file.html": "dir/file/"}`.

    Besides the plain substitutions, any leading run of `../` in an `href`, `src` or
    `root` attribute is collapsed into a single `/`.
    """
    # A quoted attribute value starting with one or more "../", followed by a
    # character that isn't a dot.
    parent_run = re.compile(r'((?:href|src|root)\s*=\s*")((?:\.\./)+)([^\.])')

    for html_file in path.glob("**/*.html"):
        print(f"Converting links in {html_file}...")

        with html_file.open("r+") as handle:
            text = handle.read()
            handle.truncate(0)
            handle.seek(0)

            for old, new in replacements.items():
                text = text.replace(old, new)

            # Convert relative links to absolute.
            handle.write(parent_run.sub(r"\1/\3", text))
def main(path: Path):
    """
    Move every `foo/bar.html` under `path` to `foo/bar/index.html` and fix up links.

    `path` - The root directory of the site.

    The steps are: make all links in the HTML files site-absolute, rename the files,
    then rewrite the links that pointed at the old file names.
    """
    path = path.resolve()

    convert_relative_to_absolute(path)

    replacements: Dict[str, str] = {}

    # Take a snapshot of the listing first: the loop below renames files and creates
    # directories inside the very tree the (lazy) glob would otherwise be walking.
    for p in list(path.glob("**/*.html")):
        if str(p.parent) == ".":
            # Don't convert top-level files.
            # NOTE(review): `path` is resolved above, so `p` is absolute and this
            # comparison can never be true. If top-level files are meant to be
            # skipped, this should probably be `p.parent == path` - confirm.
            continue

        if p.name == "index.html":
            # Don't convert index files, they are already where they need to be.
            continue

        print(f"Renaming {p}...")

        dir_path = p.parent / p.stem
        dir_path.mkdir(parents=True, exist_ok=True)
        new_path = dir_path / "index.html"
        p.rename(new_path)

        # Construct the dictionary of replacements that have been done. These are
        # matched against URLs, so always use forward slashes.
        old_url = p.relative_to(path).as_posix()
        new_url = f"{new_path.parent.relative_to(path).as_posix()}/"
        replacements[old_url] = new_url

    replace_links(path, replacements)
if __name__ == "__main__":
    # The only argument is the root directory of the site to convert. Fail with a
    # readable message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <html-root-directory>")
    main(Path(sys.argv[1]))