#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.12"
# dependencies = ["anyio", "httpx"]
# ///
"""
Check files for bad links.

Usage:

```bash
./check-files-for-bad-links *.md
```

Details:

- uses [`httpx`](https://www.python-httpx.org/) to check links
- uses [`anyio`](https://anyio.readthedocs.io/en/stable/) to run the checks concurrently
- pass include globs to scan (e.g. `*.md`)
- pass `--exclude` globs to skip (e.g. `drafts/*.md`)
- pass `--ignore-url` prefixes to skip (defaults: `http://localhost` and `https://localhost`)
- pass `--concurrency` to cap the number of simultaneous requests (default: 50)
"""
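# For illustration, two invocations exercising each flag (the file paths and
# URL prefix are placeholders; the flags match the argparse definitions below,
# and `**` works because globbing runs with recursive=True):
#
#   ./check-files-for-bad-links '**/*.md' --exclude 'vendor/**/*.md'
#   ./check-files-for-bad-links docs/*.md --ignore-url https://staging.example.com -c 20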
import argparse
import glob
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

import anyio
import anyio.to_thread
import httpx

GREY = "\033[90m"
GREEN = "\033[92m"
RED = "\033[91m"
_END = "\033[0m"

_URL_RE = re.compile(r'https?://[^\s<>"\')\]]+', re.IGNORECASE)


@dataclass(slots=True)
class LinkResult:
    url: str
    status: int | None
    ok: bool
    sources: frozenset[Path]
    error: str | None = None


async def extract_links(path: Path) -> set[str]:
    try:
        content = await anyio.to_thread.run_sync(path.read_text, "utf-8", "ignore")
        # Strip trailing punctuation the regex may have swallowed, e.g. a
        # closing parenthesis or a sentence-ending period.
        return {m.group(0).rstrip(".,)") for m in _URL_RE.finditer(content)}
    except Exception:
        return set()


async def _probe(client: httpx.AsyncClient, url: str) -> LinkResult:
    try:
        r = await client.head(url, follow_redirects=True)
        # Some servers reject or mishandle HEAD (405/403, or a spurious 404),
        # so retry those with a full GET before declaring the link broken.
        if r.status_code in {405, 403, 404}:
            r = await client.get(url, follow_redirects=True)
        return LinkResult(url, r.status_code, 200 <= r.status_code < 400, frozenset())
    except Exception as exc:
        return LinkResult(url, None, False, frozenset(), str(exc))


async def check_links(urls: Iterable[str], concurrency: int) -> list[LinkResult]:
    sem = anyio.Semaphore(concurrency)
    results: list[LinkResult] = []
    async with httpx.AsyncClient(timeout=10) as client:

        async def bound(u: str) -> None:
            async with sem:
                results.append(await _probe(client, u))

        async with anyio.create_task_group() as tg:
            for url in urls:
                tg.start_soon(bound, url)
    return results


async def audit(
    paths: set[Path],
    ignored_prefixes: tuple[str, ...],
    concurrency: int,
) -> list[LinkResult]:
    link_to_files: dict[str, set[Path]] = {}

    async def process_file(p: Path) -> None:
        for url in await extract_links(p):
            if any(url.startswith(pref) for pref in ignored_prefixes):
                continue
            if re.search(r"{[^}]+}", url):  # skip template tokens like {var}
                continue
            link_to_files.setdefault(url, set()).add(p)

    # Materialize the set once so every chunk slices the same ordering, then
    # read files in chunks of 100 to bound the number of open file handles.
    all_paths = list(paths)
    chunk_size = 100
    for i in range(0, len(all_paths), chunk_size):
        async with anyio.create_task_group() as tg:
            for path in all_paths[i : i + chunk_size]:
                tg.start_soon(process_file, path)

    return [
        LinkResult(
            url=r.url,
            status=r.status,
            ok=r.ok,
            sources=frozenset(link_to_files[r.url]),
            error=r.error,
        )
        for r in await check_links(link_to_files, concurrency)
    ]


async def main() -> None:
    parser = argparse.ArgumentParser(
        description="Fail the build if any HTTP link is unreachable."
    )
    parser.add_argument("include", nargs="+", help="Glob pattern(s) to scan.")
    parser.add_argument(
        "--exclude", nargs="*", default=[], help="Glob pattern(s) to skip."
    )
    parser.add_argument(
        "--ignore-url",
        nargs="*",
        default=("http://localhost", "https://localhost"),
        metavar="PREFIX",
        help="URL prefixes to ignore.",
    )
    parser.add_argument("-c", "--concurrency", type=int, default=50)
    ns = parser.parse_args()

    include = {Path(p) for pat in ns.include for p in glob.glob(pat, recursive=True)}
    exclude = {Path(p) for pat in ns.exclude for p in glob.glob(pat, recursive=True)}
    if not (files := include - exclude):
        print("No files to scan.", file=sys.stderr)
        sys.exit(2)

    links = await audit(files, tuple(ns.ignore_url), concurrency=ns.concurrency)

    broken_links: list[LinkResult] = []
    for r in sorted(links, key=lambda x: sorted(x.sources)[0].as_posix()):
        status = r.status or "ERR"
        icon = f"{GREEN}✓{_END}" if r.ok else f"{RED}✗{_END}"
        url_repr = r.url if r.ok else f"{RED}{r.url}{_END}"
        srcs = ", ".join(s.as_posix() for s in sorted(r.sources))
        print(f"{GREY}{srcs}:{_END} {status:>4} {icon} {url_repr}")
        if not r.ok:
            broken_links.append(r)

    if broken_links:
        print(f"\n{len(broken_links)} broken link(s) detected.", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    anyio.run(main)
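# A minimal sketch of programmatic use, assuming this script were saved as an
# importable module named `check_links.py` (the module name and the path are
# hypothetical; `audit` and its signature are defined above, and anyio.run
# forwards positional args to the coroutine function):
#
#     import anyio
#     from pathlib import Path
#
#     from check_links import audit
#
#     results = anyio.run(audit, {Path("README.md")}, ("http://localhost",), 50)
#     broken = [r.url for r in results if not r.ok]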