#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.12"
# dependencies = ["anyio", "httpx"]
# ///
"""
Check files for bad links.

Usage:

```bash
./check-files-for-bad-links *.md
```

Details:
- uses [`httpx`](https://www.python-httpx.org/) to check links
- uses [`anyio`](https://anyio.readthedocs.io/en/stable/) to run the checks concurrently
- pass include globs to scan (e.g. `*.md`)
- pass `--exclude` globs to skip (e.g. `drafts/*.md`)
- pass `--ignore-url` prefixes to skip (default: `http://localhost` and `https://localhost`)
- pass `--concurrency` to cap the number of simultaneous requests (default: 50)
"""

import argparse
import glob
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

import anyio
import anyio.to_thread
import httpx

GREY = "\033[90m"
GREEN = "\033[92m"
RED = "\033[91m"
_END = "\033[0m"
_URL_RE = re.compile(r'https?://[^\s<>"\')\]]+', re.IGNORECASE)


@dataclass(slots=True)
class LinkResult:
    url: str
    status: int | None
    ok: bool
    sources: frozenset[Path]
    error: str | None = None


async def extract_links(path: Path) -> set[str]:
    try:
        # Read off the event loop; errors="ignore" tolerates odd encodings.
        content = await anyio.to_thread.run_sync(path.read_text, "utf-8", "ignore")
        # Trim punctuation that commonly trails URLs in prose.
        return {m.group(0).rstrip(".,)") for m in _URL_RE.finditer(content)}
    except Exception:
        return set()


async def _probe(client: httpx.AsyncClient, url: str) -> LinkResult:
    try:
        # Try a cheap HEAD first; some servers reject HEAD, so fall back to GET.
        r = await client.head(url, follow_redirects=True)
        if r.status_code in {403, 404, 405}:
            r = await client.get(url, follow_redirects=True)
        return LinkResult(url, r.status_code, 200 <= r.status_code < 400, frozenset())
    except Exception as exc:
        return LinkResult(url, None, False, frozenset(), str(exc))


async def check_links(urls: Iterable[str], concurrency: int) -> list[LinkResult]:
    # The semaphore caps how many requests are in flight at once.
    sem = anyio.Semaphore(concurrency)
    results: list[LinkResult] = []

    async with httpx.AsyncClient(timeout=10) as client:

        async def bound(u: str) -> None:
            async with sem:
                results.append(await _probe(client, u))

        async with anyio.create_task_group() as tg:
            for url in urls:
                tg.start_soon(bound, url)

    return results


async def audit(
    paths: set[Path],
    ignored_prefixes: tuple[str, ...],
    concurrency: int,
) -> list[LinkResult]:
    # Map each URL to the set of files that reference it.
    link_to_files: dict[str, set[Path]] = {}

    async def process_file(p: Path) -> None:
        for url in await extract_links(p):
            if any(url.startswith(pref) for pref in ignored_prefixes):
                continue
            if re.search(r"{[^}]+}", url):  # skip template tokens like {var}
                continue
            link_to_files.setdefault(url, set()).add(p)

    # Scan files in chunks so we don't open too many at once.
    chunk_size = 100
    all_paths = list(paths)
    for i in range(0, len(all_paths), chunk_size):
        async with anyio.create_task_group() as tg:
            for path in all_paths[i : i + chunk_size]:
                tg.start_soon(process_file, path)

    return [
        LinkResult(
            url=r.url,
            status=r.status,
            ok=r.ok,
            sources=frozenset(link_to_files[r.url]),
            error=r.error,
        )
        for r in await check_links(link_to_files, concurrency)
    ]


async def main() -> None:
    parser = argparse.ArgumentParser(
        description="Fail the build if any HTTP link is unreachable."
    )
    parser.add_argument("include", nargs="+", help="Glob pattern(s) to scan.")
    parser.add_argument(
        "--exclude", nargs="*", default=[], help="Glob pattern(s) to skip."
    )
    parser.add_argument(
        "--ignore-url",
        nargs="*",
        default=("http://localhost", "https://localhost"),
        metavar="PREFIX",
        help="URL prefixes to ignore.",
    )
    parser.add_argument("-c", "--concurrency", type=int, default=50)

    ns = parser.parse_args()

    include = {Path(p) for pat in ns.include for p in glob.glob(pat, recursive=True)}
    exclude = {Path(p) for pat in ns.exclude for p in glob.glob(pat, recursive=True)}

    if not (files := include - exclude):
        print("No files to scan.", file=sys.stderr)
        sys.exit(2)

    links = await audit(files, tuple(ns.ignore_url), concurrency=ns.concurrency)

    broken_links: list[LinkResult] = []
    for r in sorted(links, key=lambda x: sorted(x.sources)[0].as_posix()):
        status = r.status or "ERR"
        icon = f"{GREEN}✓{_END}" if r.ok else f"{RED}✗{_END}"
        url_repr = r.url if r.ok else f"{RED}{r.url}{_END}"
        srcs = ", ".join(s.as_posix() for s in sorted(r.sources))

        print(f"{GREY}{srcs}:{_END} {status:>4} {icon} {url_repr}")

        if not r.ok:
            broken_links.append(r)

    if broken_links:
        print(f"\n{len(broken_links)} broken link(s) detected.", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    anyio.run(main)
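
# Illustrative invocation (the glob patterns below are placeholders; the flags
# and exit codes match the argument parser and `main()` above):
#
#   ./check-files-for-bad-links '**/*.md' --exclude 'drafts/*.md' \
#       --ignore-url http://localhost https://localhost -c 25
#
# Each URL found is printed with its status code and the files that reference
# it; the script exits 1 if any link is broken and 2 if no files matched.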