for assorted things
at main 11 kB view raw
1#!/usr/bin/env -S uv run --script --quiet 2# /// script 3# requires-python = ">=3.12" 4# dependencies = ["httpx", "rich", "numpy", "scikit-learn", "python-dateutil", "pandas", "pydantic-settings"] 5# /// 6""" 7Predict when a GitHub repository will reach a target number of stars. 8 9Usage: 10 ./predict-github-stars owner/repo 10000 11 12Details: 13- Uses GitHub REST API to fetch star history (with timestamps). 14- Fits polynomial regression (degree 1–3) to full history. 15- Falls back to recent‑trend linear extrapolation if the polynomial 16 cannot reach the target within ten years. 17- Shows recent growth rate and a caution for long‑range estimates. 18- Requires `GITHUB_TOKEN` in the environment for higher rate limits (optional). 19""" 20 21from __future__ import annotations 22 23import argparse 24import os 25import sys 26from datetime import datetime, timezone 27 28import httpx 29import numpy as np 30import pandas as pd 31from dateutil import parser as date_parser 32from pydantic import Field 33from pydantic_settings import BaseSettings, SettingsConfigDict 34from rich.console import Console 35from rich.panel import Panel 36from rich.table import Table 37from sklearn.linear_model import LinearRegression 38from sklearn.metrics import r2_score 39from sklearn.preprocessing import PolynomialFeatures 40 41console = Console() 42 43 44class Settings(BaseSettings): 45 """Load settings (e.g. GitHub token) from environment.""" 46 47 model_config = SettingsConfigDict( 48 env_file=os.environ.get("ENV_FILE", ".env"), extra="ignore" 49 ) 50 github_token: str = Field(default="", description="GitHub API token") 51 52 53# ──────────────────────────────── GitHub helpers ──────────────────────────────── 54 55 56def _headers(token: str | None = None) -> dict[str, str]: 57 h = {"Accept": "application/vnd.github.v3+json"} 58 if token: 59 h["Authorization"] = f"token {token}" 60 return h 61 62 63def get_repo_data(owner: str, repo: str, token: str | None = None) -> dict: 64 url = f"https://api.github.com/repos/{owner}/{repo}" 65 with httpx.Client() as c: 66 r = c.get(url, headers=_headers(token)) 67 r.raise_for_status() 68 return r.json() 69 70 71def get_star_history( 72 owner: str, repo: str, token: str | None, total_stars: int 73) -> list[tuple[datetime, int]]: 74 """Return (timestamp, cumulative_star_count) pairs, sampled if repo is huge.""" 75 hdrs = _headers(token) 76 hdrs["Accept"] = "application/vnd.github.v3.star+json" # need starred_at 77 78 history: list[tuple[datetime, int]] = [] 79 80 if total_stars > 10_000: 81 # sample ~200 evenly‑spaced star indices 82 sample_points = 200 83 step = max(1, total_stars // sample_points) 84 pages_needed: dict[int, list[int]] = {} 85 for s in range(1, total_stars, step): 86 pg = (s - 1) // 100 + 1 87 idx = (s - 1) % 100 88 pages_needed.setdefault(pg, []).append(idx) 89 90 # always include final star 91 last_pg = (total_stars - 1) // 100 + 1 92 last_idx = (total_stars - 1) % 100 93 pages_needed.setdefault(last_pg, []).append(last_idx) 94 95 with httpx.Client() as c: 96 for pg, idxs in pages_needed.items(): 97 url = f"https://api.github.com/repos/{owner}/{repo}/stargazers?page={pg}&per_page=100" 98 r = c.get(url, headers=hdrs) 99 r.raise_for_status() 100 data = r.json() 101 for i in sorted(set(idxs)): 102 if i < len(data) and "starred_at" in data[i]: 103 ts = date_parser.parse(data[i]["starred_at"]) 104 history.append((ts, (pg - 1) * 100 + i + 1)) 105 106 console.print(f"[dim]sampled {len(history)} points across star history[/dim]") 107 108 else: 109 # fetch all pages 110 page = 1 111 with httpx.Client() as c: 112 while True: 113 url = f"https://api.github.com/repos/{owner}/{repo}/stargazers?page={page}&per_page=100" 114 r = c.get(url, headers=hdrs) 115 r.raise_for_status() 116 data = r.json() 117 if not data: 118 break 119 for i, star in enumerate(data): 120 if "starred_at" in star: 121 ts = date_parser.parse(star["starred_at"]) 122 history.append((ts, (page - 1) * 100 + i + 1)) 123 page += 1 124 125 # ensure order and anchor today’s count 126 history.sort(key=lambda t: t[0]) 127 if history and history[-1][1] < total_stars: 128 history.append((datetime.now(timezone.utc), total_stars)) 129 return history 130 131 132# ──────────────────────────────── modelling ───────────────────────────────────── 133 134 135def best_poly_fit( 136 X: np.ndarray, y: np.ndarray 137) -> tuple[LinearRegression, PolynomialFeatures, int, float]: 138 best_r2 = -1.0 139 best_model: LinearRegression | None = None 140 best_poly: PolynomialFeatures | None = None 141 best_deg = 1 142 for deg in (1, 2, 3): 143 poly = PolynomialFeatures(degree=deg) 144 Xpoly = poly.fit_transform(X) 145 model = LinearRegression().fit(Xpoly, y) 146 r2 = r2_score(y, model.predict(Xpoly)) 147 if r2 > best_r2: 148 best_r2, best_model, best_poly, best_deg = r2, model, poly, deg 149 return best_model, best_poly, best_deg, best_r2 # type: ignore 150 151 152def predict_date(history: list[tuple[datetime, int]], target: int) -> datetime | None: 153 if len(history) < 10: 154 return None 155 origin = history[0][0] 156 X = np.array([(t - origin).total_seconds() / 86400 for t, _ in history]).reshape( 157 -1, 1 158 ) 159 y = np.array([s for _, s in history]) 160 161 model, poly, deg, r2 = best_poly_fit(X, y) 162 console.print(f"[dim]best fit: degree {deg} polynomial (r² = {r2:.3f})[/dim]") 163 164 current_day = X[-1, 0] 165 for d in range(0, 3650): # up to 10 years 166 future = current_day + d 167 if model.predict(poly.transform([[future]]))[0] >= target: 168 return origin + pd.Timedelta(days=future) 169 return None 170 171 172# ──────────────────────────────── utils ───────────────────────────────────────── 173 174 175def timeframe_str(dt: datetime) -> str: 176 now = datetime.now(timezone.utc) 177 if dt <= now: 178 return "already reached" 179 days = (dt - now).days 180 if days == 0: 181 return "today" 182 if days == 1: 183 return "tomorrow" 184 if days < 7: 185 return f"in {days} days" 186 if days < 30: 187 return f"in {days // 7} week(s)" 188 if days < 365: 189 return f"in {days // 30} month(s)" 190 return f"in {days // 365} year(s)" 191 192 193# ──────────────────────────────── main ────────────────────────────────────────── 194 195 196def main() -> None: 197 p = argparse.ArgumentParser( 198 description="Predict when a GitHub repo will reach a target number of stars" 199 ) 200 p.add_argument("repo", help="owner/repo") 201 p.add_argument("stars", type=int, help="target star count") 202 args = p.parse_args() 203 204 if "/" not in args.repo: 205 console.print("[red]error: repo must be owner/repo[/red]") 206 sys.exit(1) 207 owner, repo = args.repo.split("/", 1) 208 209 try: 210 settings = Settings() # load token 211 except Exception as e: # pragma: no cover 212 console.print(f"[red]error loading settings: {e}[/red]") 213 sys.exit(1) 214 token = settings.github_token.strip() or None 215 216 try: 217 repo_data = get_repo_data(owner, repo, token) 218 current_stars = repo_data["stargazers_count"] 219 created_at = date_parser.parse(repo_data["created_at"]) 220 221 console.print( 222 Panel.fit( 223 f"[bold cyan]{owner}/{repo}[/bold cyan]\n" 224 f"[dim]current stars: {current_stars:,}\ncreated: {created_at:%Y-%m-%d}[/dim]", 225 border_style="blue", 226 ) 227 ) 228 229 if current_stars >= args.stars: 230 console.print("\n[green]✓ already at or above target![/green]") 231 sys.exit(0) 232 233 console.print("\nfetching star history…") 234 history = get_star_history(owner, repo, token, current_stars) 235 if not history: 236 console.print("[red]error: no star history[/red]") 237 sys.exit(1) 238 if len(history) > 1000: # down‑sample for speed 239 step = len(history) // 1000 240 history = history[::step] + [history[-1]] 241 242 console.print(f"[dim]analysing {len(history)} data points…[/dim]") 243 poly_date = predict_date(history, args.stars) 244 245 def recent_rate(window: int = 30) -> float: 246 cutoff = datetime.now(timezone.utc) - pd.Timedelta(days=window) 247 pts = [s for t, s in history if t >= cutoff] 248 return (pts[-1] - pts[0]) / window if len(pts) >= 2 else 0.0 249 250 rate = recent_rate() or recent_rate(90) 251 252 if poly_date: 253 out_date, tf = poly_date, timeframe_str(poly_date) 254 elif rate > 0: 255 days_needed = (args.stars - current_stars) / rate 256 out_date = datetime.now(timezone.utc) + pd.Timedelta(days=days_needed) 257 tf = timeframe_str(out_date) 258 console.print( 259 "[dim]poly model pessimistic; using recent growth trend[/dim]" 260 ) 261 else: 262 console.print( 263 f"[red]✗ unlikely to reach {args.stars:,} stars in the next 10 years[/red]" 264 ) 265 sys.exit(0) 266 267 table = Table(show_header=True, header_style="bold magenta") 268 table.add_column("metric") 269 table.add_column("value", style="white") 270 table.add_row("target stars", f"{args.stars:,}") 271 table.add_row("current stars", f"{current_stars:,}") 272 table.add_row("stars needed", f"{args.stars - current_stars:,}") 273 table.add_row("predicted date", out_date.strftime("%Y-%m-%d")) 274 table.add_row("timeframe", tf) 275 if rate: 276 table.add_row("recent growth", f"{rate:.1f} stars/day") 277 278 console.print() 279 console.print(table) 280 if "year" in tf and "1 year" not in tf: 281 console.print("\n[dim]⚠ prediction far in future; uncertainty high[/dim]") 282 283 except httpx.HTTPStatusError as e: 284 if e.response.status_code == 404: 285 msg = "repository not found" 286 elif e.response.status_code == 403: 287 msg = "rate limit exceeded (set GITHUB_TOKEN)" 288 else: 289 msg = f"GitHub API error {e.response.status_code}" 290 console.print(f"[red]error: {msg}[/red]") 291 sys.exit(1) 292 except Exception as e: # pragma: no cover 293 console.print(f"[red]error: {e}[/red]") 294 sys.exit(1) 295 296 297if __name__ == "__main__": 298 main()