reconsider prior backlog each run + recognize CVEs from context

This commit is contained in:
Stéphane Lesimple
2026-04-19 10:41:52 +00:00
parent 12f545dc45
commit b305cc48c3
3 changed files with 234 additions and 41 deletions

View File

@@ -14,11 +14,22 @@ Each classification record has shape:
"bucket": "toimplement|tocheck|unrelated",
"extracted_cves": ["...", ...], # optional
"sources": ["...", ...], # optional
"urls": ["...", ...] # optional
"urls": ["...", ...], # optional
"reconsider": true # optional; set by Claude for reconsidered
# backlog entries — merge overwrites
# the stored bucket (incl. demotions)
# instead of promoting
}
Behavior:
- Upsert seen[canonical_id], union sources/urls, promote bucket strength.
- For records WITHOUT `reconsider: true` (fresh items):
upsert seen[canonical_id], union sources/urls, promote bucket strength.
- For records WITH `reconsider: true` (previously-classified entries):
overwrite the stored bucket unconditionally (permits demotions), union
sources/urls. If Claude's canonical_id differs from the stable_id (the
previous canonical), rekey the seen entry under the new ID and leave
the old as an alias — used when a CVE has since been assigned to what
was previously a bare vendor-ID entry.
- For every alt_id in (stable_id, vendor_ids, extracted_cves) that differs
from canonical_id, set aliases[alt_id] = canonical_id.
- Update last_run to SCAN_DATE.
@@ -92,38 +103,117 @@ def merge(
scan_date: str,
) -> None:
for rec in classifications:
stable_id = rec.get("stable_id")
if not stable_id:
if not rec.get("stable_id"):
continue
meta = new_items_by_stable_id.get(stable_id, {})
canonical = _canonical(rec, meta)
bucket = rec.get("bucket", "unrelated")
title = (meta.get("title") or "").strip()
existing = data["seen"].get(canonical)
if existing is None:
data["seen"][canonical] = {
"bucket": bucket,
"first_seen": scan_date,
"seen_at": scan_date,
"title": title,
"sources": _unique(list(rec.get("sources") or []) + ([meta.get("source")] if meta.get("source") else [])),
"urls": _unique(list(rec.get("urls") or []) + ([meta.get("permalink")] if meta.get("permalink") else [])),
}
if rec.get("reconsider"):
_apply_reconsider(data, rec, scan_date)
else:
existing["bucket"] = state.promote_bucket(existing["bucket"], bucket)
existing["seen_at"] = scan_date
existing.setdefault("first_seen", existing.get("seen_at") or scan_date)
if not existing.get("title") and title:
existing["title"] = title
existing["sources"] = _unique(list(existing.get("sources") or []) + list(rec.get("sources") or []) + ([meta.get("source")] if meta.get("source") else []))
existing["urls"] = _unique(list(existing.get("urls") or []) + list(rec.get("urls") or []) + ([meta.get("permalink")] if meta.get("permalink") else []))
_apply_new_item(data, rec, new_items_by_stable_id, scan_date)
# Aliases: every alt id that is not the canonical key points at it.
for alt in _alt_ids(rec, meta):
if alt != canonical:
data["aliases"][alt] = canonical
def _apply_new_item(
    data: dict[str, Any],
    rec: dict[str, Any],
    new_items_by_stable_id: dict[str, dict[str, Any]],
    scan_date: str,
) -> None:
    """Upsert a freshly-classified item into ``data["seen"]``.

    A first sighting creates a full entry stamped with *scan_date*; a
    repeat sighting only ever strengthens the stored bucket (via
    ``state.promote_bucket``), refreshes ``seen_at``, backfills a missing
    title, and unions sources/urls. Every alternate ID is aliased to the
    canonical key.
    """
    meta = new_items_by_stable_id.get(rec["stable_id"], {})
    canonical = _canonical(rec, meta)
    bucket = rec.get("bucket", "unrelated")
    title = (meta.get("title") or "").strip()

    # Extras contributed by the scan metadata, if present.
    meta_sources = [meta["source"]] if meta.get("source") else []
    meta_urls = [meta["permalink"]] if meta.get("permalink") else []

    entry = data["seen"].get(canonical)
    if entry is None:
        # Never seen before: record everything from scratch.
        data["seen"][canonical] = {
            "bucket": bucket,
            "first_seen": scan_date,
            "seen_at": scan_date,
            "title": title,
            "sources": _unique(list(rec.get("sources") or []) + meta_sources),
            "urls": _unique(list(rec.get("urls") or []) + meta_urls),
        }
    else:
        # Known item: promote-only on the bucket, then merge metadata.
        entry["bucket"] = state.promote_bucket(entry["bucket"], bucket)
        entry["seen_at"] = scan_date
        entry.setdefault("first_seen", entry.get("seen_at") or scan_date)
        if title and not entry.get("title"):
            entry["title"] = title
        entry["sources"] = _unique(
            list(entry.get("sources") or [])
            + list(rec.get("sources") or [])
            + meta_sources
        )
        entry["urls"] = _unique(
            list(entry.get("urls") or [])
            + list(rec.get("urls") or [])
            + meta_urls
        )

    # Every non-canonical ID points at the canonical key.
    for alt in _alt_ids(rec, meta):
        if alt != canonical:
            data["aliases"][alt] = canonical
def _apply_reconsider(
    data: dict[str, Any],
    rec: dict[str, Any],
    scan_date: str,
) -> None:
    """Re-review of a previously-classified entry. The record's stable_id
    is the entry's current canonical key in state; `canonical_id` may name
    a new key (e.g. a freshly-assigned CVE) — in which case we rekey."""
    old_key = rec["stable_id"]
    new_canonical = _canonical(rec, None)
    bucket = rec.get("bucket", "unrelated")

    # Locate the live entry; the backlog snapshot the classifier reviewed
    # may be slightly out of sync, so fall back to the alias table.
    current_key = old_key if old_key in data["seen"] else data["aliases"].get(old_key)
    if not current_key or current_key not in data["seen"]:
        print(f"warning: reconsider record for {old_key!r} points at no "
              f"state entry; skipping.", file=sys.stderr)
        return
    entry = data["seen"][current_key]

    # Unconditional overwrite: a reconsideration may demote as well as
    # promote. Stamp the date so growth can be throttled later.
    entry["bucket"] = bucket
    entry["seen_at"] = scan_date
    entry["reconsidered_at"] = scan_date

    # Union any fresh sources/urls the classifier surfaced.
    fresh_sources = rec.get("sources")
    if fresh_sources:
        entry["sources"] = _unique(list(entry.get("sources") or []) + list(fresh_sources))
    fresh_urls = rec.get("urls")
    if fresh_urls:
        entry["urls"] = _unique(list(entry.get("urls") or []) + list(fresh_urls))

    # Alias the classifier's alt IDs to the current key first; a rekey
    # below will retarget them along with everything else.
    for alt in _alt_ids(rec, None):
        if alt != current_key:
            data["aliases"][alt] = current_key

    # No new canonical ID (or unchanged): nothing left to do.
    if not new_canonical or new_canonical == current_key:
        return

    # Rekey under the newly-identified canonical ID (e.g. a CVE assigned
    # to what was a bare vendor-ID entry).
    dest = data["seen"].get(new_canonical)
    if dest is not None:
        # Destination already exists: fold our entry into it.
        dest["bucket"] = state.promote_bucket(dest.get("bucket", "unrelated"),
                                              entry.get("bucket", "unrelated"))
        dest["sources"] = _unique(list(dest.get("sources") or []) + list(entry.get("sources") or []))
        dest["urls"] = _unique(list(dest.get("urls") or []) + list(entry.get("urls") or []))
        if entry.get("title") and not dest.get("title"):
            dest["title"] = entry["title"]
        dest["seen_at"] = scan_date
        dest["reconsidered_at"] = scan_date
        dest.setdefault("first_seen", entry.get("first_seen") or scan_date)
    else:
        # Destination free: move the entry wholesale.
        data["seen"][new_canonical] = entry
    del data["seen"][current_key]

    # Retarget every alias that pointed at the old key, then leave the old
    # key itself behind as an alias.
    for alias_key, target in list(data["aliases"].items()):
        if target == current_key:
            data["aliases"][alias_key] = new_canonical
    data["aliases"][current_key] = new_canonical

    # Drop any self-aliases the retarget may have produced.
    for self_alias in [k for k, v in data["aliases"].items() if k == v]:
        del data["aliases"][self_alias]
def ensure_stub_reports(scan_date: str) -> None: