#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.12"
# dependencies = ["anyio", "httpx"]
# ///
"""
Scan files for HTTP(S) links and fail if any are unreachable.
"""

import argparse
import glob
import re
import sys
from collections.abc import Iterable
from dataclasses import dataclass
from pathlib import Path

import anyio
import anyio.to_thread
import httpx

GREY = "\033[90m"
GREEN = "\033[92m"
RED = "\033[91m"
_END = "\033[0m"
_URL_RE = re.compile(r'https?://[^\s<>"\')\]]+', re.IGNORECASE)


@dataclass(slots=True)
class LinkResult:
    url: str
    status: int | None
    ok: bool
    sources: frozenset[Path]
    error: str | None = None


async def extract_links(path: Path) -> set[str]:
    """Return every HTTP(S) URL found in *path*, trailing punctuation stripped."""
    try:
        content = await anyio.to_thread.run_sync(path.read_text, "utf-8", "ignore")
        return {m.group(0).rstrip(".,)") for m in _URL_RE.finditer(content)}
    except Exception:
        return set()


async def _probe(client: httpx.AsyncClient, url: str) -> LinkResult:
    try:
        r = await client.head(url, follow_redirects=True)
        # Some servers reject HEAD outright; retry with GET before judging.
        if r.status_code in {403, 405}:
            r = await client.get(url, follow_redirects=True)
        return LinkResult(url, r.status_code, 200 <= r.status_code < 400, frozenset())
    except Exception as exc:
        return LinkResult(url, None, False, frozenset(), str(exc))


async def check_links(urls: Iterable[str], concurrency: int) -> list[LinkResult]:
    """Probe *urls* with at most *concurrency* requests in flight."""
    sem = anyio.Semaphore(concurrency)
    results: list[LinkResult] = []

    async with httpx.AsyncClient(timeout=10) as client:

        async def bound(u: str) -> None:
            async with sem:
                results.append(await _probe(client, u))

        async with anyio.create_task_group() as tg:
            for url in urls:
                tg.start_soon(bound, url)

    return results


async def audit(
    paths: set[Path],
    ignored_prefixes: tuple[str, ...],
    concurrency: int,
) -> list[LinkResult]:
    """Collect links from *paths*, then check each unique URL exactly once."""
    link_to_files: dict[str, set[Path]] = {}

    async def process_file(p: Path) -> None:
        for url in await extract_links(p):
            if any(url.startswith(pref) for pref in ignored_prefixes):
                continue
            if re.search(r"{[^}]+}", url):  # skip template tokens like {var}
                continue
            link_to_files.setdefault(url, set()).add(p)

    # Read files in bounded batches so large trees don't spawn one task per file.
    all_paths = list(paths)
    chunk_size = 100
    for i in range(0, len(all_paths), chunk_size):
        async with anyio.create_task_group() as tg:
            for path in all_paths[i : i + chunk_size]:
                tg.start_soon(process_file, path)

    return [
        LinkResult(
            url=r.url,
            status=r.status,
            ok=r.ok,
            sources=frozenset(link_to_files[r.url]),
            error=r.error,
        )
        for r in await check_links(link_to_files, concurrency)
    ]
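# The helpers above can also be driven without the CLI. A minimal sketch
# (the URL below is a placeholder; this is not executed by the script):
#
#   results = anyio.run(check_links, {"https://example.com"}, 10)
#   broken = [r for r in results if not r.ok]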
async def main() -> None:
    parser = argparse.ArgumentParser(
        description="Fail the build if any HTTP link is unreachable."
    )
    parser.add_argument("include", nargs="+", help="Glob pattern(s) to scan.")
    parser.add_argument(
        "--exclude", nargs="*", default=[], help="Glob pattern(s) to skip."
    )
    parser.add_argument(
        "--ignore-url",
        nargs="*",
        default=("http://localhost", "https://localhost"),
        metavar="PREFIX",
        help="URL prefixes to ignore.",
    )
    parser.add_argument("-c", "--concurrency", type=int, default=50)

    ns = parser.parse_args()

    include = {Path(p) for pat in ns.include for p in glob.glob(pat, recursive=True)}
    exclude = {Path(p) for pat in ns.exclude for p in glob.glob(pat, recursive=True)}

    if not (files := include - exclude):
        print("No files to scan.", file=sys.stderr)
        sys.exit(2)

    links = await audit(files, tuple(ns.ignore_url), concurrency=ns.concurrency)

    broken_links: list[LinkResult] = []
    for r in sorted(links, key=lambda x: sorted(x.sources)[0].as_posix()):
        status = r.status or "ERR"
        icon = f"{GREEN}✓{_END}" if r.ok else f"{RED}✗{_END}"
        url_repr = r.url if r.ok else f"{RED}{r.url}{_END}"
        srcs = ", ".join(s.as_posix() for s in sorted(r.sources))

        print(f"{GREY}{srcs}:{_END} {status:>4} {icon} {url_repr}")

        if not r.ok:
            broken_links.append(r)

    if broken_links:
        print(f"\n{len(broken_links)} broken link(s) detected.", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    anyio.run(main)
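# Example invocation (a sketch: the filename, glob patterns, and ignore prefix
# below are placeholders, not fixed by this script):
#
#   uv run check_links.py "docs/**/*.md" README.md \
#       --exclude "docs/drafts/**" \
#       --ignore-url https://internal.example \
#       --concurrency 20
#
# Exit status: 0 when every link resolves, 1 when broken links are found,
# 2 when no files match the include/exclude patterns.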