for assorted things
#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.12"
# dependencies = ["anyio", "httpx"]
# ///
6"""
7Check files for bad links.
8
9Usage:
10
11```bash
12./check-files-for-bad-links *.md
13```
14
15Details:
16- uses [`httpx`](https://www.python-httpx.org/) to check links
17- uses [`anyio`](https://anyio.readthedocs.io/en/stable/) to run the checks concurrently
18- pass include globs to scan (e.g. `*.md`)
19- pass exclude globs to skip (e.g. `*.md`)
20- pass ignore-url prefixes to ignore (e.g. `http://localhost` or `https://localhost`)
21- pass concurrency to run the checks concurrently (default is 50)
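
For example, a fuller invocation (the globs, prefixes, and concurrency value here are illustrative):

```bash
./check-files-for-bad-links "**/*.md" \
    --exclude "drafts/*.md" \
    --ignore-url http://localhost https://localhost \
    --concurrency 25
```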
22"""

import argparse
import glob
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

import anyio
import anyio.to_thread
import httpx

GREY = "\033[90m"
GREEN = "\033[92m"
RED = "\033[91m"
_END = "\033[0m"
# Rough URL matcher: grabs http(s) URLs up to whitespace or closing punctuation.
_URL_RE = re.compile(r'https?://[^\s<>"\')\]]+', re.IGNORECASE)


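# One checked link: its URL, HTTP status, pass/fail verdict, and the files that reference it.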
@dataclass(slots=True)
class LinkResult:
    url: str
    status: int | None
    ok: bool
    sources: frozenset[Path]
    error: str | None = None


async def extract_links(path: Path) -> set[str]:
    try:
        # Read in a worker thread so file I/O does not block the event loop.
        content = await anyio.to_thread.run_sync(path.read_text, "utf-8", "ignore")
        return {m.group(0).rstrip(".,)") for m in _URL_RE.finditer(content)}
    except Exception:
        return set()


async def _probe(client: httpx.AsyncClient, url: str) -> LinkResult:
    try:
        r = await client.head(url, follow_redirects=True)
        # Some servers reject or mishandle HEAD; retry those with a full GET.
        if r.status_code in {405, 403, 404}:
            r = await client.get(url, follow_redirects=True)
        return LinkResult(url, r.status_code, 200 <= r.status_code < 400, frozenset())
    except Exception as exc:
        return LinkResult(url, None, False, frozenset(), str(exc))


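# Probe every URL concurrently; the semaphore caps how many requests are in flight at once.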
async def check_links(urls: Iterable[str], concurrency: int) -> list[LinkResult]:
    sem = anyio.Semaphore(concurrency)
    results: list[LinkResult] = []

    async with httpx.AsyncClient(timeout=10) as client:

        async def bound(u: str) -> None:
            async with sem:
                results.append(await _probe(client, u))

        async with anyio.create_task_group() as tg:
            for url in urls:
                tg.start_soon(bound, url)

    return results


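# Map each URL to the files that contain it, then check every unique URL exactly once.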
async def audit(
    paths: set[Path],
    ignored_prefixes: tuple[str, ...],
    concurrency: int,
) -> list[LinkResult]:
    link_to_files: dict[str, set[Path]] = {}

    async def process_file(p: Path) -> None:
        for url in await extract_links(p):
            if any(url.startswith(pref) for pref in ignored_prefixes):
                continue
            if re.search(r"{[^}]+}", url):  # skip template tokens like {var}
                continue
            link_to_files.setdefault(url, set()).add(p)

    # Read files in chunks so a huge file set does not spawn every task at once.
    all_paths = list(paths)
    chunk_size = 100
    for i in range(0, len(all_paths), chunk_size):
        async with anyio.create_task_group() as tg:
            for path in all_paths[i : i + chunk_size]:
                tg.start_soon(process_file, path)

    # Re-attach the source files to each probe result.
    return [
        LinkResult(
            url=r.url,
            status=r.status,
            ok=r.ok,
            sources=frozenset(link_to_files[r.url]),
            error=r.error,
        )
        for r in await check_links(link_to_files, concurrency)
    ]


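# CLI entry point: exits 0 when every link resolves, 1 when any are broken, 2 when no files match.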
async def main() -> None:
    parser = argparse.ArgumentParser(
        description="Fail the build if any HTTP link is unreachable."
    )
    parser.add_argument("include", nargs="+", help="Glob pattern(s) to scan.")
    parser.add_argument(
        "--exclude", nargs="*", default=[], help="Glob pattern(s) to skip."
    )
    parser.add_argument(
        "--ignore-url",
        nargs="*",
        default=("http://localhost", "https://localhost"),
        metavar="PREFIX",
        help="URL prefixes to ignore.",
    )
    parser.add_argument("-c", "--concurrency", type=int, default=50)

    ns = parser.parse_args()

    include = {Path(p) for pat in ns.include for p in glob.glob(pat, recursive=True)}
    exclude = {Path(p) for pat in ns.exclude for p in glob.glob(pat, recursive=True)}

    if not (files := include - exclude):
        print("No files to scan.", file=sys.stderr)
        sys.exit(2)

    links = await audit(files, tuple(ns.ignore_url), concurrency=ns.concurrency)

    broken_links: list[LinkResult] = []
    # Group the report by the first file each URL appears in.
    for r in sorted(links, key=lambda x: sorted(x.sources)[0].as_posix()):
        status = r.status or "ERR"
        icon = f"{GREEN}✓{_END}" if r.ok else f"{RED}✗{_END}"
        url_repr = r.url if r.ok else f"{RED}{r.url}{_END}"
        srcs = ", ".join(s.as_posix() for s in sorted(r.sources))

        print(f"{GREY}{srcs}:{_END} {status:>4} {icon} {url_repr}")

        if not r.ok:
            broken_links.append(r)

    if broken_links:
        print(f"\n{len(broken_links)} broken link(s) detected.", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    anyio.run(main)