predict-github-stars at main · zzstoatzz.io/scripts

zzstoatzz.io / scripts
fork atom
for assorted things
fork atom
scripts / predict-github-stars
at main 298 lines 11 kB view raw
wrap content
zzstoatzz.io tell me when planes are above 7mo ago
e50060ad
  1#!/usr/bin/env -S uv run --script --quiet
  2# /// script
  3# requires-python = ">=3.12"
  4# dependencies = ["httpx", "rich", "numpy", "scikit-learn", "python-dateutil", "pandas", "pydantic-settings"]
  5# ///
  6"""
  7Predict when a GitHub repository will reach a target number of stars.
  8
  9Usage:
 10    ./predict-github-stars owner/repo 10000
 11
 12Details:
 13- Uses GitHub REST API to fetch star history (with timestamps).
 14- Fits polynomial regression (degree 1–3) to full history.
 15- Falls back to recent‑trend linear extrapolation if the polynomial
 16  cannot reach the target within ten years.
 17- Shows recent growth rate and a caution for long‑range estimates.
 18- Optionally reads `GITHUB_TOKEN` from the environment (or a `.env` file) for higher rate limits.
 19"""
 20
 21from __future__ import annotations
 22
 23import argparse
 24import os
 25import sys
 26from datetime import datetime, timezone
 27
 28import httpx
 29import numpy as np
 30import pandas as pd
 31from dateutil import parser as date_parser
 32from pydantic import Field
 33from pydantic_settings import BaseSettings, SettingsConfigDict
 34from rich.console import Console
 35from rich.panel import Panel
 36from rich.table import Table
 37from sklearn.linear_model import LinearRegression
 38from sklearn.metrics import r2_score
 39from sklearn.preprocessing import PolynomialFeatures
 40
# Module-wide rich console used for the terminal output of this script.
console = Console()
 42
 43
 44class Settings(BaseSettings):
 45    """Load settings (e.g. GitHub token) from environment."""
 46
 47    model_config = SettingsConfigDict(
 48        env_file=os.environ.get("ENV_FILE", ".env"), extra="ignore"
 49    )
 50    github_token: str = Field(default="", description="GitHub API token")
 51
 52
 53# ──────────────────────────────── GitHub helpers ────────────────────────────────
 54
 55
 56def _headers(token: str | None = None) -> dict[str, str]:
 57    h = {"Accept": "application/vnd.github.v3+json"}
 58    if token:
 59        h["Authorization"] = f"token {token}"
 60    return h
 61
 62
 63def get_repo_data(owner: str, repo: str, token: str | None = None) -> dict:
 64    url = f"https://api.github.com/repos/{owner}/{repo}"
 65    with httpx.Client() as c:
 66        r = c.get(url, headers=_headers(token))
 67        r.raise_for_status()
 68        return r.json()
 69
 70
 71def get_star_history(
 72    owner: str, repo: str, token: str | None, total_stars: int
 73) -> list[tuple[datetime, int]]:
 74    """Return (timestamp, cumulative_star_count) pairs, sampled if repo is huge."""
 75    hdrs = _headers(token)
 76    hdrs["Accept"] = "application/vnd.github.v3.star+json"  # need starred_at
 77
 78    history: list[tuple[datetime, int]] = []
 79
 80    if total_stars > 10_000:
 81        # sample ~200 evenly‑spaced star indices
 82        sample_points = 200
 83        step = max(1, total_stars // sample_points)
 84        pages_needed: dict[int, list[int]] = {}
 85        for s in range(1, total_stars, step):
 86            pg = (s - 1) // 100 + 1
 87            idx = (s - 1) % 100
 88            pages_needed.setdefault(pg, []).append(idx)
 89
 90        # always include final star
 91        last_pg = (total_stars - 1) // 100 + 1
 92        last_idx = (total_stars - 1) % 100
 93        pages_needed.setdefault(last_pg, []).append(last_idx)
 94
 95        with httpx.Client() as c:
 96            for pg, idxs in pages_needed.items():
 97                url = f"https://api.github.com/repos/{owner}/{repo}/stargazers?page={pg}&per_page=100"
 98                r = c.get(url, headers=hdrs)
 99                r.raise_for_status()
100                data = r.json()
101                for i in sorted(set(idxs)):
102                    if i < len(data) and "starred_at" in data[i]:
103                        ts = date_parser.parse(data[i]["starred_at"])
104                        history.append((ts, (pg - 1) * 100 + i + 1))
105
106        console.print(f"[dim]sampled {len(history)} points across star history[/dim]")
107
108    else:
109        # fetch all pages
110        page = 1
111        with httpx.Client() as c:
112            while True:
113                url = f"https://api.github.com/repos/{owner}/{repo}/stargazers?page={page}&per_page=100"
114                r = c.get(url, headers=hdrs)
115                r.raise_for_status()
116                data = r.json()
117                if not data:
118                    break
119                for i, star in enumerate(data):
120                    if "starred_at" in star:
121                        ts = date_parser.parse(star["starred_at"])
122                        history.append((ts, (page - 1) * 100 + i + 1))
123                page += 1
124
125    # ensure order and anchor today’s count
126    history.sort(key=lambda t: t[0])
127    if history and history[-1][1] < total_stars:
128        history.append((datetime.now(timezone.utc), total_stars))
129    return history
130
131
132# ──────────────────────────────── modelling ─────────────────────────────────────
133
134
def best_poly_fit(
    X: np.ndarray, y: np.ndarray
) -> tuple[LinearRegression, PolynomialFeatures, int, float]:
    """Fit polynomials of degree 1, 2 and 3 and keep the best-scoring one.

    Each candidate is scored with R² on the same data it was fitted to. A
    candidate replaces the current winner only when its score is strictly
    higher, so ties keep the lower degree.

    Returns:
        ``(model, feature_transformer, degree, r2)`` for the winning fit.

    NOTE(review): in-sample R² cannot decrease as the degree grows, so this
    selection will usually land on degree 3 — confirm that is intended.
    """
    winner = (None, None, 1, -1.0)
    for degree in range(1, 4):
        features = PolynomialFeatures(degree=degree)
        design = features.fit_transform(X)
        fitted = LinearRegression().fit(design, y)
        score = r2_score(y, fitted.predict(design))
        if score > winner[3]:
            winner = (fitted, features, degree, score)
    return winner  # type: ignore
150
151
def predict_date(
    history: list[tuple[datetime, int]], target: int, horizon_days: int = 3650
) -> datetime | None:
    """Estimate when the fitted polynomial first reaches *target* stars.

    Args:
        history: ``(timestamp, cumulative_stars)`` pairs. The last entry is
            used as "the current day", so the list is assumed to be in
            chronological order — verify against the caller.
        target: Star count to reach.
        horizon_days: Number of whole-day steps to search, starting at the
            last history point. The default (3650) is roughly ten years.

    Returns:
        The first searched date whose predicted count is ``>= target``, or
        ``None`` when there are fewer than 10 points, the horizon is not
        positive, or the target is not reached within the horizon.
    """
    if len(history) < 10 or horizon_days <= 0:
        return None
    origin = history[0][0]
    # Feature: days elapsed since the first data point.
    X = np.array(
        [(t - origin).total_seconds() / 86400 for t, _ in history]
    ).reshape(-1, 1)
    y = np.array([s for _, s in history])

    model, poly, deg, r2 = best_poly_fit(X, y)
    console.print(f"[dim]best fit: degree {deg} polynomial (r² = {r2:.3f})[/dim]")

    # Evaluate the whole horizon in one batched call rather than one
    # transform/predict round-trip per candidate day.
    future_days = X[-1, 0] + np.arange(horizon_days)
    predicted = model.predict(poly.transform(future_days.reshape(-1, 1)))
    hits = np.flatnonzero(predicted >= target)
    if hits.size == 0:
        return None
    return origin + pd.Timedelta(days=float(future_days[hits[0]]))
170
171
172# ──────────────────────────────── utils ─────────────────────────────────────────
173
174
def timeframe_str(dt: datetime) -> str:
    """Describe how far *dt* lies in the future, in coarse human terms.

    *dt* is compared against the current UTC time. Anything at or before now
    yields ``"already reached"``; otherwise the whole-day difference is
    bucketed into days, weeks (7 days), months (30 days) or years (365 days),
    always rounding down.
    """
    now = datetime.now(timezone.utc)
    if dt <= now:
        return "already reached"

    days = (dt - now).days
    if days < 2:
        return ("today", "tomorrow")[days]

    # (exclusive upper bound in days, days per unit, unit label)
    buckets = ((7, 1, "days"), (30, 7, "week(s)"), (365, 30, "month(s)"))
    for upper, unit_days, label in buckets:
        if days < upper:
            return f"in {days // unit_days} {label}"
    return f"in {days // 365} year(s)"
191
192
193# ──────────────────────────────── main ──────────────────────────────────────────
194
195
def main() -> None:
    """Parse CLI arguments, fetch star data, and print the prediction.

    Exits with status 1 on malformed input, a settings error, or an API/other
    error, and with status 0 when the target is already met or no estimate
    can be produced. Otherwise prints a summary table and returns.
    """
    p = argparse.ArgumentParser(
        description="Predict when a GitHub repo will reach a target number of stars"
    )
    p.add_argument("repo", help="owner/repo")
    p.add_argument("stars", type=int, help="target star count")
    args = p.parse_args()

    # Split on the first "/" and require both halves to be non-empty, so
    # inputs such as "/repo" or "owner/" are rejected before any API call.
    owner, sep, repo = args.repo.partition("/")
    if not (sep and owner and repo):
        console.print("[red]error: repo must be owner/repo[/red]")
        sys.exit(1)

    try:
        settings = Settings()  # load token
    except Exception as e:  # pragma: no cover
        console.print(f"[red]error loading settings: {e}[/red]")
        sys.exit(1)
    token = settings.github_token.strip() or None

    try:
        repo_data = get_repo_data(owner, repo, token)
        current_stars = repo_data["stargazers_count"]
        created_at = date_parser.parse(repo_data["created_at"])

        console.print(
            Panel.fit(
                f"[bold cyan]{owner}/{repo}[/bold cyan]\n"
                f"[dim]current stars: {current_stars:,}\ncreated: {created_at:%Y-%m-%d}[/dim]",
                border_style="blue",
            )
        )

        if current_stars >= args.stars:
            console.print("\n[green]✓ already at or above target![/green]")
            sys.exit(0)

        console.print("\nfetching star history…")
        history = get_star_history(owner, repo, token, current_stars)
        if not history:
            console.print("[red]error: no star history[/red]")
            sys.exit(1)
        if len(history) > 1000:  # down-sample for speed
            # Ceiling division keeps the result at roughly 1000 points even
            # when the history is only slightly over the threshold.
            step = -(-len(history) // 1000)
            newest = history[-1]
            history = history[::step]
            # Keep the newest point, but never twice: a duplicate would be
            # double-weighted in the regression.
            if history[-1] is not newest:
                history.append(newest)

        console.print(f"[dim]analysing {len(history)} data points…[/dim]")
        poly_date = predict_date(history, args.stars)

        def recent_rate(window: int = 30) -> float:
            """Stars gained between the first and last point in the window, per window day."""
            cutoff = datetime.now(timezone.utc) - pd.Timedelta(days=window)
            pts = [s for t, s in history if t >= cutoff]
            return (pts[-1] - pts[0]) / window if len(pts) >= 2 else 0.0

        # Fall back to a 90-day window when the 30-day rate is zero.
        rate = recent_rate() or recent_rate(90)

        if poly_date:
            out_date, tf = poly_date, timeframe_str(poly_date)
        elif rate > 0:
            days_needed = (args.stars - current_stars) / rate
            out_date = datetime.now(timezone.utc) + pd.Timedelta(days=days_needed)
            tf = timeframe_str(out_date)
            console.print(
                "[dim]poly model pessimistic; using recent growth trend[/dim]"
            )
        else:
            console.print(
                f"[red]✗ unlikely to reach {args.stars:,} stars in the next 10 years[/red]"
            )
            sys.exit(0)

        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("metric")
        table.add_column("value", style="white")
        table.add_row("target stars", f"{args.stars:,}")
        table.add_row("current stars", f"{current_stars:,}")
        table.add_row("stars needed", f"{args.stars - current_stars:,}")
        table.add_row("predicted date", out_date.strftime("%Y-%m-%d"))
        table.add_row("timeframe", tf)
        if rate:
            table.add_row("recent growth", f"{rate:.1f} stars/day")

        console.print()
        console.print(table)
        # Warn from two years out. This is a numeric check because matching
        # the text "1 year" would also match "11 year(s)", "21 year(s)", ...
        days_out = (out_date - datetime.now(timezone.utc)).days
        if days_out >= 2 * 365:
            console.print("\n[dim]⚠ prediction far in future; uncertainty high[/dim]")

    except httpx.HTTPStatusError as e:
        if e.response.status_code == 404:
            msg = "repository not found"
        elif e.response.status_code == 403:
            msg = "rate limit exceeded (set GITHUB_TOKEN)"
        else:
            msg = f"GitHub API error {e.response.status_code}"
        console.print(f"[red]error: {msg}[/red]")
        sys.exit(1)
    except Exception as e:  # pragma: no cover
        console.print(f"[red]error: {e}[/red]")
        sys.exit(1)
295
296
297if __name__ == "__main__":
298    main()