mirror of
https://github.com/speed47/spectre-meltdown-checker.git
synced 2026-04-23 00:53:23 +02:00
init: daily vulnerability watch automation
scripts used by the cron vuln-watch workflow from the master branch.
This commit is contained in:
0
scripts/vuln_watch/__init__.py
Normal file
0
scripts/vuln_watch/__init__.py
Normal file
350
scripts/vuln_watch/fetch_and_diff.py
Normal file
350
scripts/vuln_watch/fetch_and_diff.py
Normal file
@@ -0,0 +1,350 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Fetch all configured sources, dedup against state/seen.json, emit new_items.json.
|
||||
|
||||
Writes updated per-source HTTP cache metadata (etag, last_modified, hwm_*) back
|
||||
into state/seen.json. Does NOT touch state.seen / state.aliases — that is the
|
||||
merge step's job, after Claude has classified the new items.
|
||||
|
||||
Usage:
|
||||
SCAN_DATE=2026-04-18T14:24:43Z python -m scripts.vuln_watch.fetch_and_diff
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import gzip
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import Any, Iterable
|
||||
|
||||
import feedparser # type: ignore[import-untyped]
|
||||
|
||||
from .sources import REQUEST_TIMEOUT, SOURCES, Source, USER_AGENT
|
||||
from . import state
|
||||
|
||||
|
||||
CVE_RE = re.compile(r"CVE-\d{4}-\d{4,7}")
|
||||
DEFAULT_WINDOW_HOURS = 25
|
||||
MAX_ITEMS_PER_FEED = 200
|
||||
SNIPPET_MAX = 400
|
||||
NEW_ITEMS_PATH = pathlib.Path("new_items.json")
|
||||
|
||||
|
||||
def parse_iso(ts: str | None) -> datetime.datetime | None:
    """Parse an ISO-8601 timestamp, tolerating a trailing 'Z'.

    Returns None for empty/None input or an unparseable string.

    Fix: results are always timezone-aware. Previously a tz-less input
    (e.g. SCAN_DATE=2026-04-18T14:24:43) produced a naive datetime, and any
    later comparison against an aware datetime (feed publish dates carry
    UTC offsets) raised TypeError. Naive inputs are now assumed to be UTC.
    """
    if not ts:
        return None
    try:
        dt = datetime.datetime.fromisoformat(ts.replace("Z", "+00:00"))
    except ValueError:
        return None
    if dt.tzinfo is None:
        # Assume UTC so callers can safely compare against aware datetimes.
        dt = dt.replace(tzinfo=datetime.timezone.utc)
    return dt
|
||||
|
||||
|
||||
def now_from_scan_date(scan_date: str) -> datetime.datetime:
    """Interpret *scan_date* as the scan's "now"; fall back to the real UTC now."""
    parsed = parse_iso(scan_date)
    if parsed is not None:
        return parsed
    return datetime.datetime.now(datetime.timezone.utc)
|
||||
|
||||
|
||||
def conditional_get(
    url: str,
    etag: str | None,
    last_modified: str | None,
    user_agent: str = USER_AGENT,
) -> tuple[int | str, bytes | None, str | None, str | None]:
    """Perform a conditional GET using the cached ETag / Last-Modified values.

    Args:
        url: endpoint to fetch.
        etag: previously stored ETag, sent as If-None-Match (if any).
        last_modified: previously stored Last-Modified, sent as
            If-Modified-Since (if any).
        user_agent: UA header; defaults to the module-level USER_AGENT.

    Returns (status, body, new_etag, new_last_modified).

    status is:
      - 200 with body on success
      - 304 with body=None when unchanged
      - an int HTTP error code on server-side errors
      - a string describing a network/transport failure

    On 304 and on errors the caller's etag/last_modified are echoed back
    unchanged so cache metadata is not lost.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": user_agent,
        # AMD's CDN stalls on non-gzip clients; asking for gzip speeds up
        # every source and is strictly beneficial (we decompress locally).
        "Accept-Encoding": "gzip",
    })
    if etag:
        req.add_header("If-None-Match", etag)
    if last_modified:
        req.add_header("If-Modified-Since", last_modified)
    try:
        with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
            body = resp.read()
            # Decompress only when the server says the payload is gzipped.
            if resp.headers.get("Content-Encoding", "").lower() == "gzip":
                try:
                    body = gzip.decompress(body)
                except OSError:
                    pass  # server lied about encoding; use as-is
            return (
                resp.status,
                body,
                resp.headers.get("ETag"),
                resp.headers.get("Last-Modified"),
            )
    except urllib.error.HTTPError as e:
        # urlopen raises HTTPError for 304 responses, so "not modified" is
        # handled here rather than in the success path above.
        if e.code == 304:
            return (304, None, etag, last_modified)
        return (e.code, None, etag, last_modified)
    except (urllib.error.URLError, TimeoutError, OSError) as e:
        # DNS failures, timeouts, TLS errors, connection resets, etc.
        return (f"network:{type(e).__name__}", None, etag, last_modified)
|
||||
|
||||
|
||||
def extract_cves(text: str) -> list[str]:
    """Return every CVE id found in *text*, deduplicated, first-seen order."""
    # dict.fromkeys keeps insertion order while dropping repeats.
    return list(dict.fromkeys(CVE_RE.findall(text or "")))
|
||||
|
||||
|
||||
def extract_vendor_ids(text: str, patterns: Iterable[str]) -> list[str]:
    """Return all matches of *patterns* in *text*, deduplicated, in match order."""
    ordered: dict[str, None] = {}
    for pattern in patterns:
        for hit in re.findall(pattern, text or ""):
            ordered.setdefault(hit, None)
    return list(ordered)
|
||||
|
||||
|
||||
def pick_stable_id(vendor_ids: list[str], cves: list[str], guid: str, link: str) -> str:
    """Pick canonical-ish stable ID: vendor advisory → CVE → guid → permalink.

    CVE is preferred over guid/URL so that the same CVE seen via different
    feeds collapses on its stable_id alone (in addition to the alias map).
    """
    for preferred in (vendor_ids, cves):
        if preferred:
            return preferred[0]
    return guid or link
|
||||
|
||||
|
||||
def clean_snippet(s: str) -> str:
    """Strip HTML tags and collapse all whitespace runs to single spaces."""
    untagged = re.sub(r"<[^>]+>", " ", s or "")
    collapsed = re.sub(r"\s+", " ", untagged)
    return collapsed.strip()
|
||||
|
||||
|
||||
def _struct_time_to_iso(st: Any) -> str | None:
|
||||
if not st:
|
||||
return None
|
||||
try:
|
||||
return datetime.datetime(*st[:6], tzinfo=datetime.timezone.utc).isoformat()
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def parse_feed_body(src: Source, body: bytes) -> list[dict[str, Any]]:
    """Normalize an RSS/Atom payload into item dicts (at most MAX_ITEMS_PER_FEED)."""
    feed = feedparser.parse(body)
    out: list[dict[str, Any]] = []
    for entry in feed.entries[:MAX_ITEMS_PER_FEED]:
        title = (entry.get("title") or "").strip()
        summary = entry.get("summary") or ""
        link = (entry.get("link") or "").strip()
        guid = (entry.get("id") or entry.get("guid") or "").strip()
        # Prefer the publish date; fall back to the last-updated date.
        published = _struct_time_to_iso(entry.get("published_parsed"))
        if published is None:
            published = _struct_time_to_iso(entry.get("updated_parsed"))
        searchable = f"{title}\n{summary}"
        cves = extract_cves(searchable)
        vendor_ids = extract_vendor_ids(searchable, src.advisory_id_patterns)
        out.append({
            "source": src.name,
            "stable_id": pick_stable_id(vendor_ids, cves, guid, link),
            "title": title,
            "permalink": link,
            "guid": guid,
            "published_at": published,
            "extracted_cves": cves,
            "vendor_ids": vendor_ids,
            "snippet": clean_snippet(summary)[:SNIPPET_MAX],
        })
    return out
|
||||
|
||||
|
||||
def parse_html_body(src: Source, body: bytes) -> list[dict[str, Any]]:
    """Scrape a non-RSS HTML page for advisory IDs and their nearest anchor."""
    page = body.decode("utf-8", errors="replace")
    base_url = src.display_url or src.url
    results: list[dict[str, Any]] = []
    found: set[str] = set()
    for pattern in src.advisory_id_patterns:
        for match in re.finditer(pattern, page):
            advisory_id = match.group(0)
            if advisory_id in found:
                continue
            found.add(advisory_id)
            # Context window: up to 400 chars on each side of the match.
            context = page[max(0, match.start() - 400): match.end() + 400]
            anchor = re.search(r'href="([^"#]+)"', context)
            if anchor:
                permalink = urllib.parse.urljoin(base_url, anchor.group(1))
            else:
                permalink = base_url
            nearby_cves = extract_cves(context)
            is_cve = advisory_id.startswith("CVE-")
            results.append({
                "source": src.name,
                "stable_id": advisory_id,
                "title": advisory_id,
                "permalink": permalink,
                "guid": "",
                "published_at": None,  # HTML pages often lack reliable dates
                "extracted_cves": list({advisory_id, *nearby_cves}) if is_cve else nearby_cves,
                "vendor_ids": [] if is_cve else [advisory_id],
                "snippet": clean_snippet(context)[:SNIPPET_MAX],
            })
    return results
|
||||
|
||||
|
||||
def parse_body(src: Source, body: bytes) -> list[dict[str, Any]]:
    """Dispatch to the feed or HTML parser based on the source's kind."""
    if src.kind in ("rss", "atom"):
        return parse_feed_body(src, body)
    return parse_html_body(src, body)
|
||||
|
||||
|
||||
def compute_cutoff(scan_now: datetime.datetime, last_run: str | None) -> datetime.datetime:
    """Return the oldest publish time still considered "in window".

    The default window reaches DEFAULT_WINDOW_HOURS back from scan_now; if
    the previous run is older than that, the window widens back to one hour
    before it so items published while runs were missed are not dropped.
    """
    default_cutoff = scan_now - datetime.timedelta(hours=DEFAULT_WINDOW_HOURS)
    previous = parse_iso(last_run)
    if previous is None:
        return default_cutoff
    # Original form was scan_now - (scan_now - previous + 1h), which is
    # exactly previous - 1h; min() picks the earlier (wider) cutoff.
    return min(default_cutoff, previous - datetime.timedelta(hours=1))
|
||||
|
||||
|
||||
def candidate_ids(item: dict[str, Any]) -> list[str]:
    """All identifiers under which this item might already be known."""
    raw = [
        *(item.get("extracted_cves") or []),
        *(item.get("vendor_ids") or []),
        item.get("stable_id"),
        item.get("guid"),
        item.get("permalink"),
    ]
    # Deduplicate preserving order, then drop empty/None entries.
    return [c for c in dict.fromkeys(raw) if c]
|
||||
|
||||
|
||||
def main() -> int:
    """Fetch every configured source, dedup against known state, and write
    new_items.json plus updated per-source HTTP cache metadata.

    Returns 0 always; per-source fetch failures are recorded, not fatal.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--scan-date", default=os.environ.get("SCAN_DATE", ""))
    ap.add_argument("--output", type=pathlib.Path, default=NEW_ITEMS_PATH)
    args = ap.parse_args()

    scan_now = now_from_scan_date(args.scan_date)
    scan_date_iso = scan_now.isoformat()
    data = state.load()
    cutoff = compute_cutoff(scan_now, data.get("last_run"))

    per_source: dict[str, dict[str, Any]] = {}  # per-source fetch summary for output/logs
    all_new: list[dict[str, Any]] = []          # not-yet-seen items across all sources

    for src in SOURCES:
        # Copy so a failed fetch doesn't half-mutate the stored metadata.
        meta = dict(data["sources"].get(src.name, {}))
        status, body, etag, last_modified = conditional_get(
            src.url, meta.get("etag"), meta.get("last_modified"),
            user_agent=src.user_agent or USER_AGENT,
        )
        meta["last_fetched_at"] = scan_date_iso
        meta["last_status"] = status

        # String status = transport failure; int >= 400 = HTTP error.
        if isinstance(status, str) or (isinstance(status, int) and status >= 400 and status != 304):
            per_source[src.name] = {"status": status, "new": 0}
            data["sources"][src.name] = meta
            continue

        # 304 (or a 200 with no body) means nothing changed upstream.
        if status == 304 or body is None:
            per_source[src.name] = {"status": 304, "new": 0}
            data["sources"][src.name] = meta
            continue

        # Refresh cache headers only on successful 200.
        if etag:
            meta["etag"] = etag
        if last_modified:
            meta["last_modified"] = last_modified

        items = parse_body(src, body)
        total = len(items)

        # Keep items published after the cutoff; undated items pass through.
        in_window = []
        for it in items:
            pub = parse_iso(it.get("published_at"))
            if pub is None or pub >= cutoff:
                in_window.append(it)

        # Drop anything already known under any of its candidate ids, and
        # advance the per-source high-water mark (ISO strings compare
        # lexicographically in timestamp order).
        new: list[dict[str, Any]] = []
        hwm_pub = meta.get("hwm_published_at")
        hwm_id = meta.get("hwm_id")
        for it in in_window:
            if state.lookup(data, candidate_ids(it)) is not None:
                continue
            new.append(it)
            pub = it.get("published_at")
            if pub and (not hwm_pub or pub > hwm_pub):
                hwm_pub = pub
                hwm_id = it.get("stable_id")

        if new:
            meta["hwm_published_at"] = hwm_pub
            meta["hwm_id"] = hwm_id

        data["sources"][src.name] = meta
        per_source[src.name] = {"status": status, "new": len(new), "total_in_feed": total}
        all_new.extend(new)

    # Persist updated HTTP cache metadata regardless of whether Claude runs.
    state.save(data)

    out = {
        "scan_date": scan_date_iso,
        "window_cutoff": cutoff.isoformat(),
        "per_source": per_source,
        "items": all_new,
    }
    args.output.write_text(json.dumps(out, indent=2, sort_keys=True) + "\n")

    # GitHub Actions step outputs
    gh_out = os.environ.get("GITHUB_OUTPUT")
    if gh_out:
        with open(gh_out, "a") as f:
            f.write(f"new_count={len(all_new)}\n")
            # A "failure" is anything that is not a clean 200/304.
            failures = [
                s for s, v in per_source.items()
                if not (isinstance(v["status"], int) and v["status"] in (200, 304))
            ]
            f.write(f"fetch_failures_count={len(failures)}\n")

    print(f"Scan date: {scan_date_iso}")
    print(f"Cutoff: {cutoff.isoformat()}")
    print(f"New items: {len(all_new)}")
    for s, v in per_source.items():
        print(f"  {s:14s} status={str(v['status']):>16} new={v['new']}")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
208
scripts/vuln_watch/merge_state.py
Normal file
208
scripts/vuln_watch/merge_state.py
Normal file
@@ -0,0 +1,208 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Merge Claude's classifications.json into state/seen.json.
|
||||
|
||||
Inputs:
|
||||
state/seen.json (already has updated .sources from fetch_and_diff)
|
||||
classifications.json (written by the Claude step; list of records)
|
||||
new_items.json (fallback source of per-item metadata, if Claude
|
||||
omitted urls/sources in a record)
|
||||
|
||||
Each classification record has shape:
|
||||
{
|
||||
"stable_id": "...", # required (the key used in new_items.json)
|
||||
"canonical_id": "...", # optional; defaults to first extracted_cves, else stable_id
|
||||
"bucket": "toimplement|tocheck|unrelated",
|
||||
"extracted_cves": ["...", ...], # optional
|
||||
"sources": ["...", ...], # optional
|
||||
"urls": ["...", ...] # optional
|
||||
}
|
||||
|
||||
Behavior:
|
||||
- Upsert seen[canonical_id], union sources/urls, promote bucket strength.
|
||||
- For every alt_id in (stable_id, vendor_ids, extracted_cves) that differs
|
||||
from canonical_id, set aliases[alt_id] = canonical_id.
|
||||
- Update last_run to SCAN_DATE.
|
||||
- Prune entries older than RETENTION_DAYS (180) before writing.
|
||||
- Also writes the three daily watch_*.md files as stubs if Claude didn't run
|
||||
(i.e. when new_items.json was empty and the classify step was skipped).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
from . import state
|
||||
|
||||
|
||||
RETENTION_DAYS = 180
|
||||
NEW_ITEMS_PATH = pathlib.Path("new_items.json")
|
||||
CLASSIFICATIONS_PATH = pathlib.Path("classifications.json")
|
||||
|
||||
|
||||
def _load_json(path: pathlib.Path, default: Any) -> Any:
|
||||
if not path.exists():
|
||||
return default
|
||||
return json.loads(path.read_text())
|
||||
|
||||
|
||||
def _canonical(record: dict[str, Any], fallback_meta: dict[str, Any] | None) -> str:
|
||||
if record.get("canonical_id"):
|
||||
return record["canonical_id"]
|
||||
cves = record.get("extracted_cves") or (fallback_meta or {}).get("extracted_cves") or []
|
||||
if cves:
|
||||
return cves[0]
|
||||
return record["stable_id"]
|
||||
|
||||
|
||||
def _alt_ids(record: dict[str, Any], fallback_meta: dict[str, Any] | None) -> list[str]:
|
||||
ids: list[str] = []
|
||||
ids.append(record.get("stable_id", ""))
|
||||
ids.extend(record.get("extracted_cves") or [])
|
||||
if fallback_meta:
|
||||
ids.extend(fallback_meta.get("extracted_cves") or [])
|
||||
ids.extend(fallback_meta.get("vendor_ids") or [])
|
||||
guid = fallback_meta.get("guid")
|
||||
if guid:
|
||||
ids.append(guid)
|
||||
link = fallback_meta.get("permalink")
|
||||
if link:
|
||||
ids.append(link)
|
||||
return [i for i in ids if i]
|
||||
|
||||
|
||||
def _unique(seq: list[str]) -> list[str]:
|
||||
seen: set[str] = set()
|
||||
out: list[str] = []
|
||||
for x in seq:
|
||||
if x and x not in seen:
|
||||
seen.add(x)
|
||||
out.append(x)
|
||||
return out
|
||||
|
||||
|
||||
def merge(
    data: dict[str, Any],
    classifications: list[dict[str, Any]],
    new_items_by_stable_id: dict[str, dict[str, Any]],
    scan_date: str,
) -> None:
    """Fold classification records into data["seen"] / data["aliases"] in place.

    For each record: upsert seen[canonical_id], union sources/urls, keep the
    stronger bucket (via state.promote_bucket), and alias every alternate id
    to the canonical key. Records without a stable_id are skipped.
    """
    for rec in classifications:
        stable_id = rec.get("stable_id")
        if not stable_id:
            continue
        # Fallback metadata from new_items.json, used when Claude's record
        # omitted sources/urls/CVEs.
        meta = new_items_by_stable_id.get(stable_id, {})
        canonical = _canonical(rec, meta)
        bucket = rec.get("bucket", "unrelated")

        title = (meta.get("title") or "").strip()

        existing = data["seen"].get(canonical)
        if existing is None:
            data["seen"][canonical] = {
                "bucket": bucket,
                "first_seen": scan_date,
                "seen_at": scan_date,
                "title": title,
                "sources": _unique(list(rec.get("sources") or []) + ([meta.get("source")] if meta.get("source") else [])),
                "urls": _unique(list(rec.get("urls") or []) + ([meta.get("permalink")] if meta.get("permalink") else [])),
            }
        else:
            # Never downgrade a classification; only strengthen it.
            existing["bucket"] = state.promote_bucket(existing["bucket"], bucket)
            existing["seen_at"] = scan_date
            # NOTE(review): seen_at was just set above, so for pre-existing
            # entries lacking first_seen this backfills scan_date — looks
            # intentional for migrated v1 entries, but confirm.
            existing.setdefault("first_seen", existing.get("seen_at") or scan_date)
            if not existing.get("title") and title:
                existing["title"] = title
            existing["sources"] = _unique(list(existing.get("sources") or []) + list(rec.get("sources") or []) + ([meta.get("source")] if meta.get("source") else []))
            existing["urls"] = _unique(list(existing.get("urls") or []) + list(rec.get("urls") or []) + ([meta.get("permalink")] if meta.get("permalink") else []))

        # Aliases: every alt id that is not the canonical key points at it.
        for alt in _alt_ids(rec, meta):
            if alt != canonical:
                data["aliases"][alt] = canonical
|
||||
|
||||
|
||||
def ensure_stub_reports(scan_date: str) -> None:
    """Write placeholder watch_*.md files for any missing bucket report so the
    report artifact set stays consistent even when the Claude step was skipped.

    Existing report files are left untouched.
    """
    day = scan_date[:10]  # YYYY-MM-DD prefix of the ISO timestamp
    placeholder = "(no new items in this window)\n"
    for bucket in ("toimplement", "tocheck", "unrelated"):
        report = pathlib.Path(f"watch_{day}_{bucket}.md")
        if not report.exists():
            report.write_text(placeholder)
|
||||
|
||||
|
||||
def write_snapshots(data: dict[str, Any], scan_date: str) -> None:
    """Write current_toimplement.md and current_tocheck.md — full backlog
    snapshots reflecting every entry in state under those buckets. A human
    who reads only the latest run's artifact sees the complete picture
    without having to consult prior runs.

    Args:
        data: the loaded state document (reads data["seen"] only).
        scan_date: ISO timestamp stamped into the snapshot header.
    """
    for bucket in ("toimplement", "tocheck"):
        entries = [
            (cid, rec) for cid, rec in data["seen"].items()
            if rec.get("bucket") == bucket
        ]
        # Oldest first — long-lingering items stay at the top as a reminder.
        # (ISO strings sort chronologically; missing dates sort first.)
        entries.sort(key=lambda kv: kv[1].get("first_seen") or kv[1].get("seen_at") or "")
        out = [
            f"# Current `{bucket}` backlog",
            "",
            f"_Snapshot as of {scan_date}. "
            f"{len(entries)} item(s). Oldest first._",
            "",
        ]
        if not entries:
            out.append("(backlog is empty)")
        else:
            for cid, rec in entries:
                title = rec.get("title") or ""
                # Date portion only (first 10 chars of the ISO string).
                first_seen = (rec.get("first_seen") or rec.get("seen_at") or "")[:10]
                sources = ", ".join(rec.get("sources") or []) or "(none)"
                out.append(f"- **{cid}**" + (f" — {title}" if title else ""))
                out.append(f"  first seen {first_seen} · sources: {sources}")
                for u in rec.get("urls") or []:
                    out.append(f"  - {u}")
                out.append("")
        pathlib.Path(f"current_{bucket}.md").write_text("\n".join(out))
|
||||
|
||||
|
||||
def main() -> int:
    """Merge classifications into state, prune old entries, and write reports.

    Returns 0 always; a malformed classifications file is warned about and
    treated as empty rather than failing the run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--scan-date", default=os.environ.get("SCAN_DATE", ""))
    ap.add_argument("--classifications", type=pathlib.Path, default=CLASSIFICATIONS_PATH)
    ap.add_argument("--new-items", type=pathlib.Path, default=NEW_ITEMS_PATH)
    args = ap.parse_args()

    # Fall back to the real current time when SCAN_DATE wasn't provided.
    scan_date = args.scan_date or datetime.datetime.now(datetime.timezone.utc).isoformat()

    data = state.load()
    classifications = _load_json(args.classifications, [])
    new_items_doc = _load_json(args.new_items, {"items": []})
    # Index the fetch step's items so merge() can backfill metadata Claude omitted.
    new_items_by_stable_id = {it["stable_id"]: it for it in new_items_doc.get("items", []) if it.get("stable_id")}

    if not isinstance(classifications, list):
        print(f"warning: {args.classifications} is not a list; ignoring", file=sys.stderr)
        classifications = []

    merge(data, classifications, new_items_by_stable_id, scan_date)
    data["last_run"] = scan_date

    # Prune entries older than the retention window, then persist.
    scan_now = datetime.datetime.fromisoformat(scan_date.replace("Z", "+00:00"))
    before, after = state.prune(data, RETENTION_DAYS, scan_now)
    state.save(data)
    ensure_stub_reports(scan_date)
    write_snapshots(data, scan_date)

    print(f"Merged {len(classifications)} classifications.")
    print(f"Pruned seen: {before} -> {after} entries (retention={RETENTION_DAYS}d).")
    print(f"Aliases: {len(data['aliases'])}.")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
59
scripts/vuln_watch/sources.py
Normal file
59
scripts/vuln_watch/sources.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""Declarative list of sources polled by the daily vuln scan."""
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal
|
||||
|
||||
Kind = Literal["rss", "atom", "html"]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Source:
    """One upstream feed/page polled by the daily scan.

    Purely declarative configuration consumed by fetch_and_diff; instances
    carry no behavior.
    """
    # Short unique key; also used as the per-source key in state metadata.
    name: str
    # Endpoint actually fetched.
    url: str
    # "rss"/"atom" are parsed with feedparser; "html" is regex-scraped.
    kind: Kind
    # For HTML sources: regexes used to extract advisory IDs from the page.
    advisory_id_patterns: tuple[str, ...] = ()
    # Human-facing URL to use as permalink fallback when `url` points at a
    # non-browsable endpoint (e.g. a JS data file). Empty = use `url`.
    display_url: str = ""
    # Per-source UA override. AMD's CDN drops connections when the UA string
    # contains a parenthesized URL, while Intel/ARM's WAF rejects UAs that
    # don't identify themselves — so we can't use one UA everywhere.
    # Empty = use the module-level USER_AGENT.
    user_agent: str = ""
|
||||
|
||||
|
||||
# The full polling roster. Order is also the fetch order in fetch_and_diff.
SOURCES: tuple[Source, ...] = (
    Source("phoronix", "https://www.phoronix.com/rss.php", "rss"),
    Source("oss-sec", "https://seclists.org/rss/oss-sec.rss", "rss"),
    Source("lwn", "https://lwn.net/headlines/newrss", "rss"),
    Source("project-zero", "https://googleprojectzero.blogspot.com/feeds/posts/default", "atom"),
    Source("vusec", "https://www.vusec.net/feed/", "rss"),
    Source("comsec-eth", "https://comsec.ethz.ch/category/news/feed/", "rss"),
    # api.msrc.microsoft.com/update-guide/rss is the real RSS endpoint; the
    # msrc.microsoft.com/... URL returns the SPA shell (2.7 KB) instead.
    Source("msrc", "https://api.msrc.microsoft.com/update-guide/rss", "rss"),
    Source("cisa", "https://www.cisa.gov/cybersecurity-advisories/all.xml", "rss"),
    Source("cert-cc", "https://www.kb.cert.org/vuls/atomfeed/", "atom"),
    Source("intel-psirt", "https://www.intel.com/content/www/us/en/security-center/default.html", "html",
           (r"INTEL-SA-\d+",)),
    Source("amd-psirt", "https://www.amd.com/en/resources/product-security.html", "html",
           (r"AMD-SB-\d+",),
           user_agent="spectre-meltdown-checker/vuln-watch"),
    Source("arm-spec", "https://developer.arm.com/Arm%20Security%20Center/Speculative%20Processor%20Vulnerability", "html",
           (r"CVE-\d{4}-\d{4,7}",)),
    # transient.fail renders its attack table from tree.js client-side; we
    # pull the JS file directly (CVE regex works on its JSON-ish body).
    Source("transient-fail", "https://transient.fail/tree.js", "html",
           (r"CVE-\d{4}-\d{4,7}",),
           display_url="https://transient.fail/"),
)

# Identify ourselves honestly. Akamai/Cloudflare WAFs fronting intel.com,
# developer.arm.com, and cisa.gov return 403 when the UA claims "Mozilla"
# but TLS/HTTP fingerprint doesn't match a real browser — an honest bot UA
# passes those rules cleanly.
USER_AGENT = (
    "spectre-meltdown-checker/vuln-watch "
    "(+https://github.com/speed47/spectre-meltdown-checker)"
)
# Seconds; passed as the urlopen timeout by fetch_and_diff.conditional_get.
REQUEST_TIMEOUT = 30
|
||||
128
scripts/vuln_watch/state.py
Normal file
128
scripts/vuln_watch/state.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""Load/save/migrate/lookup helpers for state/seen.json.
|
||||
|
||||
Schema v2:
|
||||
{
|
||||
"schema_version": 2,
|
||||
"last_run": "<iso8601>|null",
|
||||
"sources": {
|
||||
"<name>": {
|
||||
"etag": "...",
|
||||
"last_modified": "...",
|
||||
"hwm_id": "...",
|
||||
"hwm_published_at": "<iso8601>",
|
||||
"last_fetched_at": "<iso8601>",
|
||||
"last_status": 200|304|<http-err>|"<str-err>"
|
||||
}
|
||||
},
|
||||
"seen": {
|
||||
"<canonical_id>": {
|
||||
"bucket": "toimplement|tocheck|unrelated",
|
||||
"seen_at": "<iso8601>",
|
||||
"sources": ["<source-name>", ...],
|
||||
"urls": ["<permalink>", ...]
|
||||
}
|
||||
},
|
||||
"aliases": { "<alt_id>": "<canonical_id>" }
|
||||
}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Where the state document lives, relative to the repo root.
STATE_PATH = pathlib.Path("state/seen.json")
SCHEMA_VERSION = 2


def empty() -> dict[str, Any]:
    """Return a fresh, schema-current state document."""
    return {
        "schema_version": SCHEMA_VERSION,
        "last_run": None,
        "sources": {},
        "seen": {},
        "aliases": {},
    }
|
||||
|
||||
|
||||
def load(path: pathlib.Path = STATE_PATH) -> dict[str, Any]:
    """Read and migrate the state file; return a fresh document if absent."""
    if not path.exists():
        return empty()
    return _migrate(json.loads(path.read_text()))
|
||||
|
||||
|
||||
def save(data: dict[str, Any], path: pathlib.Path = STATE_PATH) -> None:
    """Serialize *data* to *path* with stable key order, creating parent dirs."""
    serialized = json.dumps(data, indent=2, sort_keys=True) + "\n"
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(serialized)
|
||||
|
||||
|
||||
def _migrate(data: dict[str, Any]) -> dict[str, Any]:
    """Bring any older schema up to SCHEMA_VERSION."""
    if data.get("schema_version") == SCHEMA_VERSION:
        # Already current: just backfill any missing top-level containers.
        for key in ("sources", "aliases", "seen"):
            data.setdefault(key, {})
        return data

    # v1 shape: {"last_run": ..., "seen": {<id>: {bucket, seen_at, source, cve?}}}
    seen_v2: dict[str, Any] = {}
    alias_map: dict[str, str] = {}
    for key, entry in (data.get("seen") or {}).items():
        seen_v2[key] = {
            "bucket": entry.get("bucket", "unrelated"),
            "seen_at": entry.get("seen_at"),
            "sources": [entry["source"]] if entry.get("source") else [],
            # URL-shaped keys double as the entry's permalink.
            "urls": [key] if isinstance(key, str) and key.startswith("http") else [],
        }
        # If a v1 entry had a CVE that differs from the key, alias the CVE -> key.
        cve = entry.get("cve")
        if cve and cve != key:
            alias_map[cve] = key

    return {
        "schema_version": SCHEMA_VERSION,
        "last_run": data.get("last_run"),
        "sources": {},
        "seen": seen_v2,
        "aliases": alias_map,
    }
|
||||
|
||||
|
||||
def lookup(data: dict[str, Any], candidate_ids: list[str]) -> str | None:
    """Return the canonical key if any candidate is already known, else None.

    Each candidate is tried first as a direct key of data["seen"], then
    through the alias map (an alias only counts if its target still exists).
    """
    known = data["seen"]
    alias_map = data["aliases"]
    for candidate in filter(None, candidate_ids):
        if candidate in known:
            return candidate
        target = alias_map.get(candidate)
        if target and target in known:
            return target
    return None
|
||||
|
||||
|
||||
# Classification strength ordering; higher rank wins when merging.
_BUCKET_STRENGTH = {"unrelated": 0, "tocheck": 1, "toimplement": 2}


def promote_bucket(current: str, incoming: str) -> str:
    """Return whichever of two buckets represents the 'stronger' classification.

    Unknown bucket names rank lowest; ties keep the current bucket.
    """
    current_rank = _BUCKET_STRENGTH.get(current, 0)
    incoming_rank = _BUCKET_STRENGTH.get(incoming, 0)
    return incoming if incoming_rank > current_rank else current
|
||||
|
||||
|
||||
def prune(data: dict[str, Any], days: int, now: datetime.datetime) -> tuple[int, int]:
    """Drop seen entries older than `days`, and aliases pointing at dropped keys.

    Comparison is lexicographic on ISO timestamp strings; entries without a
    seen_at compare as "9999" and are therefore always retained.

    Returns (count_before, count_after).
    """
    cutoff_iso = (now - datetime.timedelta(days=days)).isoformat()
    before = len(data["seen"])
    survivors = {
        key: entry
        for key, entry in data["seen"].items()
        if (entry.get("seen_at") or "9999") >= cutoff_iso
    }
    data["seen"] = survivors
    data["aliases"] = {
        alias: target for alias, target in data["aliases"].items() if target in survivors
    }
    return before, len(survivors)
|
||||
Reference in New Issue
Block a user