reconsider prior backlog each run + recognize CVEs from context

This commit is contained in:
Stéphane Lesimple
2026-04-19 10:41:52 +00:00
parent 12f545dc45
commit b305cc48c3
3 changed files with 234 additions and 41 deletions

View File

@@ -14,11 +14,22 @@ Each classification record has shape:
"bucket": "toimplement|tocheck|unrelated",
"extracted_cves": ["...", ...], # optional
"sources": ["...", ...], # optional
"urls": ["...", ...] # optional
"urls": ["...", ...], # optional
"reconsider": true # optional; set by Claude for reconsidered
# backlog entries — merge overwrites
# the stored bucket (incl. demotions)
# instead of promoting
}
Behavior:
- Upsert seen[canonical_id], union sources/urls, promote bucket strength.
- For records WITHOUT `reconsider: true` (fresh items):
upsert seen[canonical_id], union sources/urls, promote bucket strength.
- For records WITH `reconsider: true` (previously-classified entries):
overwrite the stored bucket unconditionally (permits demotions), union
sources/urls. If Claude's canonical_id differs from the stable_id (the
previous canonical), rekey the seen entry under the new ID and leave
the old as an alias — used when a CVE has since been assigned to what
was previously a bare vendor-ID entry.
- For every alt_id in (stable_id, vendor_ids, extracted_cves) that differs
from canonical_id, set aliases[alt_id] = canonical_id.
- Update last_run to SCAN_DATE.
@@ -92,38 +103,117 @@ def merge(
scan_date: str,
) -> None:
for rec in classifications:
stable_id = rec.get("stable_id")
if not stable_id:
if not rec.get("stable_id"):
continue
meta = new_items_by_stable_id.get(stable_id, {})
canonical = _canonical(rec, meta)
bucket = rec.get("bucket", "unrelated")
title = (meta.get("title") or "").strip()
existing = data["seen"].get(canonical)
if existing is None:
data["seen"][canonical] = {
"bucket": bucket,
"first_seen": scan_date,
"seen_at": scan_date,
"title": title,
"sources": _unique(list(rec.get("sources") or []) + ([meta.get("source")] if meta.get("source") else [])),
"urls": _unique(list(rec.get("urls") or []) + ([meta.get("permalink")] if meta.get("permalink") else [])),
}
if rec.get("reconsider"):
_apply_reconsider(data, rec, scan_date)
else:
existing["bucket"] = state.promote_bucket(existing["bucket"], bucket)
existing["seen_at"] = scan_date
existing.setdefault("first_seen", existing.get("seen_at") or scan_date)
if not existing.get("title") and title:
existing["title"] = title
existing["sources"] = _unique(list(existing.get("sources") or []) + list(rec.get("sources") or []) + ([meta.get("source")] if meta.get("source") else []))
existing["urls"] = _unique(list(existing.get("urls") or []) + list(rec.get("urls") or []) + ([meta.get("permalink")] if meta.get("permalink") else []))
_apply_new_item(data, rec, new_items_by_stable_id, scan_date)
# Aliases: every alt id that is not the canonical key points at it.
for alt in _alt_ids(rec, meta):
if alt != canonical:
data["aliases"][alt] = canonical
def _apply_new_item(
    data: dict[str, Any],
    rec: dict[str, Any],
    new_items_by_stable_id: dict[str, dict[str, Any]],
    scan_date: str,
) -> None:
    """Upsert a freshly-classified item into ``data["seen"]``.

    A first sighting creates a full entry stamped with *scan_date*; a
    repeat sighting only ever strengthens the stored bucket (via
    ``state.promote_bucket``), refreshes ``seen_at``, backfills a missing
    title, and unions sources/urls. Every alternate ID is aliased to the
    canonical key.
    """
    meta = new_items_by_stable_id.get(rec["stable_id"], {})
    canonical = _canonical(rec, meta)
    bucket = rec.get("bucket", "unrelated")
    title = (meta.get("title") or "").strip()

    # Extras contributed by the scan metadata, if present.
    meta_sources = [meta["source"]] if meta.get("source") else []
    meta_urls = [meta["permalink"]] if meta.get("permalink") else []

    entry = data["seen"].get(canonical)
    if entry is None:
        # Never seen before: record everything from scratch.
        data["seen"][canonical] = {
            "bucket": bucket,
            "first_seen": scan_date,
            "seen_at": scan_date,
            "title": title,
            "sources": _unique(list(rec.get("sources") or []) + meta_sources),
            "urls": _unique(list(rec.get("urls") or []) + meta_urls),
        }
    else:
        # Known item: promote-only on the bucket, then merge metadata.
        entry["bucket"] = state.promote_bucket(entry["bucket"], bucket)
        entry["seen_at"] = scan_date
        entry.setdefault("first_seen", entry.get("seen_at") or scan_date)
        if title and not entry.get("title"):
            entry["title"] = title
        entry["sources"] = _unique(
            list(entry.get("sources") or [])
            + list(rec.get("sources") or [])
            + meta_sources
        )
        entry["urls"] = _unique(
            list(entry.get("urls") or [])
            + list(rec.get("urls") or [])
            + meta_urls
        )

    # Every non-canonical ID points at the canonical key.
    for alt in _alt_ids(rec, meta):
        if alt != canonical:
            data["aliases"][alt] = canonical
def _apply_reconsider(
    data: dict[str, Any],
    rec: dict[str, Any],
    scan_date: str,
) -> None:
    """Re-review of a previously-classified entry. The record's stable_id
    is the entry's current canonical key in state; `canonical_id` may name
    a new key (e.g. a freshly-assigned CVE) — in which case we rekey."""
    old_key = rec["stable_id"]
    new_canonical = _canonical(rec, None)
    bucket = rec.get("bucket", "unrelated")

    # Locate the live entry; the backlog snapshot the classifier reviewed
    # may be slightly out of sync, so fall back to the alias table.
    current_key = old_key if old_key in data["seen"] else data["aliases"].get(old_key)
    if not current_key or current_key not in data["seen"]:
        print(f"warning: reconsider record for {old_key!r} points at no "
              f"state entry; skipping.", file=sys.stderr)
        return
    entry = data["seen"][current_key]

    # Unconditional overwrite: a reconsideration may demote as well as
    # promote. Stamp the date so growth can be throttled later.
    entry["bucket"] = bucket
    entry["seen_at"] = scan_date
    entry["reconsidered_at"] = scan_date

    # Union any fresh sources/urls the classifier surfaced.
    fresh_sources = rec.get("sources")
    if fresh_sources:
        entry["sources"] = _unique(list(entry.get("sources") or []) + list(fresh_sources))
    fresh_urls = rec.get("urls")
    if fresh_urls:
        entry["urls"] = _unique(list(entry.get("urls") or []) + list(fresh_urls))

    # Alias the classifier's alt IDs to the current key first; a rekey
    # below will retarget them along with everything else.
    for alt in _alt_ids(rec, None):
        if alt != current_key:
            data["aliases"][alt] = current_key

    # No new canonical ID (or unchanged): nothing left to do.
    if not new_canonical or new_canonical == current_key:
        return

    # Rekey under the newly-identified canonical ID (e.g. a CVE assigned
    # to what was a bare vendor-ID entry).
    dest = data["seen"].get(new_canonical)
    if dest is not None:
        # Destination already exists: fold our entry into it.
        dest["bucket"] = state.promote_bucket(dest.get("bucket", "unrelated"),
                                              entry.get("bucket", "unrelated"))
        dest["sources"] = _unique(list(dest.get("sources") or []) + list(entry.get("sources") or []))
        dest["urls"] = _unique(list(dest.get("urls") or []) + list(entry.get("urls") or []))
        if entry.get("title") and not dest.get("title"):
            dest["title"] = entry["title"]
        dest["seen_at"] = scan_date
        dest["reconsidered_at"] = scan_date
        dest.setdefault("first_seen", entry.get("first_seen") or scan_date)
    else:
        # Destination free: move the entry wholesale.
        data["seen"][new_canonical] = entry
    del data["seen"][current_key]

    # Retarget every alias that pointed at the old key, then leave the old
    # key itself behind as an alias.
    for alias_key, target in list(data["aliases"].items()):
        if target == current_key:
            data["aliases"][alias_key] = new_canonical
    data["aliases"][current_key] = new_canonical

    # Drop any self-aliases the retarget may have produced.
    for self_alias in [k for k, v in data["aliases"].items() if k == v]:
        del data["aliases"][self_alias]
def ensure_stub_reports(scan_date: str) -> None: