for assorted things
#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.12"
# dependencies = ["anyio", "httpx"]
# ///
6"""
7Check files for bad links.
8
9Usage:
10
11```bash
12./check-files-for-bad-links *.md
13```
14
15Details:
16- uses [`httpx`](https://www.python-httpx.org/) to check links
17- uses [`anyio`](https://anyio.readthedocs.io/en/stable/) to run the checks concurrently
18- pass include globs to scan (e.g. `*.md`)
19- pass exclude globs to skip (e.g. `*.md`)
20- pass ignore-url prefixes to ignore (e.g. `http://localhost` or `https://localhost`)
21- pass concurrency to run the checks concurrently (default is 50)
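
For example, a fuller invocation (the globs, prefixes, and concurrency value here are illustrative):

```bash
./check-files-for-bad-links "**/*.md" \
    --exclude "drafts/*.md" \
    --ignore-url http://localhost https://localhost \
    --concurrency 25
```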
22"""

import argparse
import glob
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

import anyio
import anyio.to_thread
import httpx

GREY = "\033[90m"
GREEN = "\033[92m"
RED = "\033[91m"
_END = "\033[0m"
# Rough URL matcher: grabs http(s) URLs up to whitespace or closing punctuation.
_URL_RE = re.compile(r'https?://[^\s<>"\')\]]+', re.IGNORECASE)


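# One checked link: its URL, HTTP status, pass/fail verdict, and the files that reference it.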
@dataclass(slots=True)
class LinkResult:
    url: str
    status: int | None
    ok: bool
    sources: frozenset[Path]
    error: str | None = None


async def extract_links(path: Path) -> set[str]:
    try:
        # Read in a worker thread so file I/O does not block the event loop.
        content = await anyio.to_thread.run_sync(path.read_text, "utf-8", "ignore")
        return {m.group(0).rstrip(".,)") for m in _URL_RE.finditer(content)}
    except Exception:
        return set()


async def _probe(client: httpx.AsyncClient, url: str) -> LinkResult:
    try:
        r = await client.head(url, follow_redirects=True)
        # Some servers reject or mishandle HEAD; retry those with a full GET.
        if r.status_code in {405, 403, 404}:
            r = await client.get(url, follow_redirects=True)
        return LinkResult(url, r.status_code, 200 <= r.status_code < 400, frozenset())
    except Exception as exc:
        return LinkResult(url, None, False, frozenset(), str(exc))


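# Probe every URL concurrently; the semaphore caps how many requests are in flight at once.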
async def check_links(urls: Iterable[str], concurrency: int) -> list[LinkResult]:
    sem = anyio.Semaphore(concurrency)
    results: list[LinkResult] = []

    async with httpx.AsyncClient(timeout=10) as client:

        async def bound(u: str) -> None:
            async with sem:
                results.append(await _probe(client, u))

        async with anyio.create_task_group() as tg:
            for url in urls:
                tg.start_soon(bound, url)

    return results


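# Map each URL to the files that contain it, then check every unique URL exactly once.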
async def audit(
    paths: set[Path],
    ignored_prefixes: tuple[str, ...],
    concurrency: int,
) -> list[LinkResult]:
    link_to_files: dict[str, set[Path]] = {}

    async def process_file(p: Path) -> None:
        for url in await extract_links(p):
            if any(url.startswith(pref) for pref in ignored_prefixes):
                continue
            if re.search(r"{[^}]+}", url):  # skip template tokens like {var}
                continue
            link_to_files.setdefault(url, set()).add(p)

    # Read files in chunks so a huge file set does not spawn every task at once.
    all_paths = list(paths)
    chunk_size = 100
    for i in range(0, len(all_paths), chunk_size):
        async with anyio.create_task_group() as tg:
            for path in all_paths[i : i + chunk_size]:
                tg.start_soon(process_file, path)

    # Re-attach the source files to each probe result.
    return [
        LinkResult(
            url=r.url,
            status=r.status,
            ok=r.ok,
            sources=frozenset(link_to_files[r.url]),
            error=r.error,
        )
        for r in await check_links(link_to_files, concurrency)
    ]


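# CLI entry point: exits 0 when every link resolves, 1 when any are broken, 2 when no files match.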
async def main() -> None:
    parser = argparse.ArgumentParser(
        description="Fail the build if any HTTP link is unreachable."
    )
    parser.add_argument("include", nargs="+", help="Glob pattern(s) to scan.")
    parser.add_argument(
        "--exclude", nargs="*", default=[], help="Glob pattern(s) to skip."
    )
    parser.add_argument(
        "--ignore-url",
        nargs="*",
        default=("http://localhost", "https://localhost"),
        metavar="PREFIX",
        help="URL prefixes to ignore.",
    )
    parser.add_argument("-c", "--concurrency", type=int, default=50)

    ns = parser.parse_args()

    include = {Path(p) for pat in ns.include for p in glob.glob(pat, recursive=True)}
    exclude = {Path(p) for pat in ns.exclude for p in glob.glob(pat, recursive=True)}

    if not (files := include - exclude):
        print("No files to scan.", file=sys.stderr)
        sys.exit(2)

    links = await audit(files, tuple(ns.ignore_url), concurrency=ns.concurrency)

    broken_links: list[LinkResult] = []
    # Group the report by the first file each URL appears in.
    for r in sorted(links, key=lambda x: sorted(x.sources)[0].as_posix()):
        status = r.status or "ERR"
        icon = f"{GREEN}✓{_END}" if r.ok else f"{RED}✗{_END}"
        url_repr = r.url if r.ok else f"{RED}{r.url}{_END}"
        srcs = ", ".join(s.as_posix() for s in sorted(r.sources))

        print(f"{GREY}{srcs}:{_END} {status:>4} {icon} {url_repr}")

        if not r.ok:
            broken_links.append(r)

    if broken_links:
        print(f"\n{len(broken_links)} broken link(s) detected.", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    anyio.run(main)