#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.12"
# dependencies = ["anyio", "httpx"]
# ///
"""
Check files for bad links.

Scans files matching the given glob patterns for HTTP(S) URLs and fails
if any are unreachable.
"""

import argparse
import glob
import re
import sys
from collections.abc import Iterable
from dataclasses import dataclass
from pathlib import Path

import anyio
import anyio.to_thread
import httpx

# ANSI escape codes for colorized terminal output.
GREY = "\033[90m"
GREEN = "\033[92m"
RED = "\033[91m"
_END = "\033[0m"
_URL_RE = re.compile(r'https?://[^\s<>"\')\]]+', re.IGNORECASE)


@dataclass(slots=True)
class LinkResult:
    """Outcome of probing a single URL, with the files that reference it."""

    url: str
    status: int | None
    ok: bool
    sources: frozenset[Path]
    error: str | None = None


async def extract_links(path: Path) -> set[str]:
    try:
        # Read in a worker thread so file I/O doesn't block the event loop.
        content = await anyio.to_thread.run_sync(path.read_text, "utf-8", "ignore")
        return {m.group(0).rstrip(".,)") for m in _URL_RE.finditer(content)}
    except Exception:
        return set()


async def _probe(client: httpx.AsyncClient, url: str) -> LinkResult:
    try:
        r = await client.head(url, follow_redirects=True)
        if r.status_code in {403, 405}:
            # Some servers reject HEAD; retry with GET before judging the link.
            r = await client.get(url, follow_redirects=True)
        return LinkResult(url, r.status_code, 200 <= r.status_code < 400, frozenset())
    except Exception as exc:
        return LinkResult(url, None, False, frozenset(), str(exc))


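# Probe all URLs concurrently in one task group; the semaphore caps the
# number of in-flight requests at `concurrency`.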
async def check_links(urls: Iterable[str], concurrency: int) -> list[LinkResult]:
    sem = anyio.Semaphore(concurrency)
    results: list[LinkResult] = []

    async with httpx.AsyncClient(timeout=10) as client:

        async def bound(u: str) -> None:
            async with sem:
                results.append(await _probe(client, u))

        async with anyio.create_task_group() as tg:
            for url in urls:
                tg.start_soon(bound, url)

    return results


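# Two phases: first build a URL -> source-files index from the scanned paths,
# then probe each unique URL exactly once and re-attach its sources.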
async def audit(
    paths: set[Path],
    ignored_prefixes: tuple[str, ...],
    concurrency: int,
) -> list[LinkResult]:
    link_to_files: dict[str, set[Path]] = {}

    async def process_file(p: Path) -> None:
        for url in await extract_links(p):
            if any(url.startswith(pref) for pref in ignored_prefixes):
                continue
            if re.search(r"{[^}]+}", url):  # skip template tokens like {var}
                continue
            link_to_files.setdefault(url, set()).add(p)

    # Extract in bounded batches so huge file sets don't spawn one task each.
    chunk_size = 100
    all_paths = sorted(paths)
    for i in range(0, len(all_paths), chunk_size):
        async with anyio.create_task_group() as tg:
            for path in all_paths[i : i + chunk_size]:
                tg.start_soon(process_file, path)

    return [
        LinkResult(
            url=r.url,
            status=r.status,
            ok=r.ok,
            sources=frozenset(link_to_files[r.url]),
            error=r.error,
        )
        for r in await check_links(link_to_files, concurrency)
    ]


async def main() -> None:
    parser = argparse.ArgumentParser(
        description="Fail the build if any HTTP link is unreachable."
    )
    parser.add_argument("include", nargs="+", help="Glob pattern(s) to scan.")
    parser.add_argument(
        "--exclude", nargs="*", default=[], help="Glob pattern(s) to skip."
    )
    parser.add_argument(
        "--ignore-url",
        nargs="*",
        default=("http://localhost", "https://localhost"),
        metavar="PREFIX",
        help="URL prefixes to ignore.",
    )
    parser.add_argument(
        "-c", "--concurrency", type=int, default=50, help="Max in-flight requests."
    )

    ns = parser.parse_args()

    include = {Path(p) for pat in ns.include for p in glob.glob(pat, recursive=True)}
    exclude = {Path(p) for pat in ns.exclude for p in glob.glob(pat, recursive=True)}

    if not (files := include - exclude):
        print("No files to scan.", file=sys.stderr)
        sys.exit(2)

    links = await audit(files, tuple(ns.ignore_url), concurrency=ns.concurrency)

    broken_links: list[LinkResult] = []
    # Sort output by each link's first source file so the report is deterministic.
    for r in sorted(links, key=lambda x: sorted(x.sources)[0].as_posix()):
        status = r.status or "ERR"
        icon = f"{GREEN}✓{_END}" if r.ok else f"{RED}✗{_END}"
        url_repr = r.url if r.ok else f"{RED}{r.url}{_END}"
        srcs = ", ".join(s.as_posix() for s in sorted(r.sources))

        print(f"{GREY}{srcs}:{_END} {status:>4} {icon} {url_repr}")

        if not r.ok:
            broken_links.append(r)

    if broken_links:
        print(f"\n{len(broken_links)} broken link(s) detected.", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    anyio.run(main)