mirror of
https://github.com/speed47/spectre-meltdown-checker.git
synced 2026-04-27 02:53:24 +02:00
reconsider prior backlog each run + recognize CVEs from context
This commit is contained in:
@@ -362,6 +362,46 @@ def _resolve_window_hours() -> float:
|
||||
return float(DEFAULT_WINDOW_HOURS)
|
||||
|
||||
|
||||
def backlog_to_reconsider(data: dict[str, Any]) -> list[dict[str, Any]]:
    """Emit re-review entries for every toimplement/tocheck item in state.seen.

    Items bucketed `unrelated` are settled and skipped.  Each emitted
    record carries enough context (title, sources, urls, every known CVE
    alias) that Claude can re-grep ./checker/ and decide whether the
    prior classification still holds, without consulting the full alias
    map: all CVE aliases pointing at the canonical id are surfaced in
    `extracted_cves`.
    """
    seen = data.get("seen", {})
    # Invert the alias map: canonical id -> all alternate ids pointing at it.
    alts_for: dict[str, list[str]] = {}
    for alt_id, canonical_id in data.get("aliases", {}).items():
        alts_for.setdefault(canonical_id, []).append(alt_id)

    entries: list[dict[str, Any]] = []
    for key, record in seen.items():
        if record.get("bucket") not in ("toimplement", "tocheck"):
            continue
        # Canonical id first, then its aliases; keep only CVE ids,
        # de-duplicated while preserving first-seen order.
        candidates = [key, *alts_for.get(key, [])]
        cve_ids = list(dict.fromkeys(c for c in candidates if c.startswith("CVE-")))
        entries.append({
            "canonical_id": key,
            "current_bucket": record.get("bucket"),
            "title": record.get("title") or "",
            "sources": list(record.get("sources") or []),
            "urls": list(record.get("urls") or []),
            "extracted_cves": cve_ids,
            "first_seen": record.get("first_seen"),
        })
    return entries
|
||||
|
||||
|
||||
def candidate_ids(item: dict[str, Any]) -> list[str]:
|
||||
"""All identifiers under which this item might already be known."""
|
||||
seen: set[str] = set()
|
||||
@@ -451,19 +491,25 @@ def main() -> int:
|
||||
# Persist updated HTTP cache metadata regardless of whether Claude runs.
|
||||
state.save(data)
|
||||
|
||||
reconsider = backlog_to_reconsider(data)
|
||||
|
||||
out = {
|
||||
"scan_date": scan_date_iso,
|
||||
"window_cutoff": cutoff.isoformat(),
|
||||
"per_source": per_source,
|
||||
"items": all_new,
|
||||
"reconsider": reconsider,
|
||||
}
|
||||
args.output.write_text(json.dumps(out, indent=2, sort_keys=True) + "\n")
|
||||
|
||||
# GitHub Actions step outputs. Downstream `if:` conditions gate the
# classify step on `new_count || reconsider_count`; both must be 0
# for Claude to be skipped.
|
||||
gh_out = os.environ.get("GITHUB_OUTPUT")
|
||||
if gh_out:
|
||||
with open(gh_out, "a") as f:
|
||||
f.write(f"new_count={len(all_new)}\n")
|
||||
f.write(f"reconsider_count={len(reconsider)}\n")
|
||||
failures = [
|
||||
s for s, v in per_source.items()
|
||||
if not (isinstance(v["status"], int) and v["status"] in (200, 304))
|
||||
@@ -474,6 +520,7 @@ def main() -> int:
|
||||
print(f"Window: {window_hours:g} h")
|
||||
print(f"Cutoff: {cutoff.isoformat()}")
|
||||
print(f"New items: {len(all_new)}")
|
||||
print(f"Reconsider: {len(reconsider)} existing toimplement/tocheck entries")
|
||||
for s, v in per_source.items():
|
||||
print(f" {s:14s} status={str(v['status']):>16} new={v['new']}")
|
||||
|
||||
|
||||
@@ -14,11 +14,22 @@ Each classification record has shape:
|
||||
"bucket": "toimplement|tocheck|unrelated",
|
||||
"extracted_cves": ["...", ...], # optional
|
||||
"sources": ["...", ...], # optional
|
||||
"urls": ["...", ...],              # optional
|
||||
"reconsider": true # optional; set by Claude for reconsidered
|
||||
# backlog entries — merge overwrites
|
||||
# the stored bucket (incl. demotions)
|
||||
# instead of promoting
|
||||
}
|
||||
|
||||
Behavior:
- For records WITHOUT `reconsider: true` (fresh items):
  upsert seen[canonical_id], union sources/urls, promote bucket strength.
|
||||
- For records WITH `reconsider: true` (previously-classified entries):
|
||||
overwrite the stored bucket unconditionally (permits demotions), union
|
||||
sources/urls. If Claude's canonical_id differs from the stable_id (the
|
||||
previous canonical), rekey the seen entry under the new ID and leave
|
||||
the old as an alias — used when a CVE has since been assigned to what
|
||||
was previously a bare vendor-ID entry.
|
||||
- For every alt_id in (stable_id, vendor_ids, extracted_cves) that differs
|
||||
from canonical_id, set aliases[alt_id] = canonical_id.
|
||||
- Update last_run to SCAN_DATE.
|
||||
@@ -92,38 +103,117 @@ def merge(
|
||||
scan_date: str,
|
||||
) -> None:
|
||||
for rec in classifications:
|
||||
stable_id = rec.get("stable_id")
|
||||
if not stable_id:
|
||||
if not rec.get("stable_id"):
|
||||
continue
|
||||
meta = new_items_by_stable_id.get(stable_id, {})
|
||||
canonical = _canonical(rec, meta)
|
||||
bucket = rec.get("bucket", "unrelated")
|
||||
|
||||
title = (meta.get("title") or "").strip()
|
||||
|
||||
existing = data["seen"].get(canonical)
|
||||
if existing is None:
|
||||
data["seen"][canonical] = {
|
||||
"bucket": bucket,
|
||||
"first_seen": scan_date,
|
||||
"seen_at": scan_date,
|
||||
"title": title,
|
||||
"sources": _unique(list(rec.get("sources") or []) + ([meta.get("source")] if meta.get("source") else [])),
|
||||
"urls": _unique(list(rec.get("urls") or []) + ([meta.get("permalink")] if meta.get("permalink") else [])),
|
||||
}
|
||||
if rec.get("reconsider"):
|
||||
_apply_reconsider(data, rec, scan_date)
|
||||
else:
|
||||
existing["bucket"] = state.promote_bucket(existing["bucket"], bucket)
|
||||
existing["seen_at"] = scan_date
|
||||
existing.setdefault("first_seen", existing.get("seen_at") or scan_date)
|
||||
if not existing.get("title") and title:
|
||||
existing["title"] = title
|
||||
existing["sources"] = _unique(list(existing.get("sources") or []) + list(rec.get("sources") or []) + ([meta.get("source")] if meta.get("source") else []))
|
||||
existing["urls"] = _unique(list(existing.get("urls") or []) + list(rec.get("urls") or []) + ([meta.get("permalink")] if meta.get("permalink") else []))
|
||||
_apply_new_item(data, rec, new_items_by_stable_id, scan_date)
|
||||
|
||||
# Aliases: every alt id that is not the canonical key points at it.
|
||||
for alt in _alt_ids(rec, meta):
|
||||
if alt != canonical:
|
||||
data["aliases"][alt] = canonical
|
||||
|
||||
def _apply_new_item(
    data: dict[str, Any],
    rec: dict[str, Any],
    new_items_by_stable_id: dict[str, dict[str, Any]],
    scan_date: str,
) -> None:
    """Upsert a freshly-classified item into state.seen and record aliases.

    New entries are created with first_seen/seen_at stamped to *scan_date*;
    existing entries only have their bucket promoted (never demoted),
    their timestamps refreshed, and their sources/urls unioned.
    """
    meta = new_items_by_stable_id.get(rec["stable_id"], {})
    canonical = _canonical(rec, meta)
    bucket = rec.get("bucket", "unrelated")
    title = (meta.get("title") or "").strip()

    # Sources/urls contributed by this record: classifier-supplied lists
    # plus the feed item's own source/permalink, when present.
    contributed_sources = list(rec.get("sources") or [])
    if meta.get("source"):
        contributed_sources.append(meta["source"])
    contributed_urls = list(rec.get("urls") or [])
    if meta.get("permalink"):
        contributed_urls.append(meta["permalink"])

    entry = data["seen"].get(canonical)
    if entry is None:
        data["seen"][canonical] = {
            "bucket": bucket,
            "first_seen": scan_date,
            "seen_at": scan_date,
            "title": title,
            "sources": _unique(contributed_sources),
            "urls": _unique(contributed_urls),
        }
    else:
        # Buckets only move toward "stronger" classifications here;
        # demotions are reserved for the reconsider path.
        entry["bucket"] = state.promote_bucket(entry["bucket"], bucket)
        entry["seen_at"] = scan_date
        entry.setdefault("first_seen", entry.get("seen_at") or scan_date)
        if title and not entry.get("title"):
            entry["title"] = title
        entry["sources"] = _unique(list(entry.get("sources") or []) + contributed_sources)
        entry["urls"] = _unique(list(entry.get("urls") or []) + contributed_urls)

    # Every alternate id that is not the canonical key points at it.
    for alt_id in _alt_ids(rec, meta):
        if alt_id != canonical:
            data["aliases"][alt_id] = canonical
|
||||
|
||||
|
||||
def _apply_reconsider(
    data: dict[str, Any],
    rec: dict[str, Any],
    scan_date: str,
) -> None:
    """Re-review of a previously-classified entry. The record's stable_id
    is the entry's current canonical key in state; `canonical_id` may name
    a new key (e.g. a freshly-assigned CVE) — in which case we rekey.

    Args:
        data: mutable state dict with "seen" and "aliases" maps; modified
            in place.
        rec: classifier record; must contain "stable_id", may contain
            "bucket", "sources", "urls" and alt ids.
        scan_date: ISO date string stamped onto the touched entries.
    """
    old_key = rec["stable_id"]
    new_canonical = _canonical(rec, None)
    # Missing bucket defaults to the weakest classification.
    bucket = rec.get("bucket", "unrelated")

    # Resolve the current record — may need to follow an alias if the
    # backlog snapshot the classifier reviewed is slightly out of sync.
    current_key = old_key if old_key in data["seen"] else data["aliases"].get(old_key)
    if not current_key or current_key not in data["seen"]:
        # Nothing to reconsider: the id is unknown even via aliases.
        print(f"warning: reconsider record for {old_key!r} points at no "
              f"state entry; skipping.", file=sys.stderr)
        return

    existing = data["seen"][current_key]

    # Overwrite bucket unconditionally (allows demotions) and stamp the
    # reconsideration date so we can later throttle if this grows.
    existing["bucket"] = bucket
    existing["seen_at"] = scan_date
    existing["reconsidered_at"] = scan_date

    # Union any fresh sources/urls the classifier surfaced.
    if rec.get("sources"):
        existing["sources"] = _unique(list(existing.get("sources") or []) + list(rec["sources"]))
    if rec.get("urls"):
        existing["urls"] = _unique(list(existing.get("urls") or []) + list(rec["urls"]))

    # Alias every alt ID the classifier provided to the current key
    # (before a possible rekey below redirects them).
    for alt in _alt_ids(rec, None):
        if alt != current_key:
            data["aliases"][alt] = current_key

    # Rekey if Claude newly identified a canonical ID (e.g., a CVE for a
    # vendor-ID entry). If the destination already exists, merge; else
    # move. In both cases, retarget all aliases and leave the old key
    # itself as an alias.
    if new_canonical and new_canonical != current_key:
        if new_canonical in data["seen"]:
            # Merge: destination wins on title/first_seen; buckets are
            # combined via promote (the stronger classification survives).
            dest = data["seen"][new_canonical]
            dest["bucket"] = state.promote_bucket(dest.get("bucket", "unrelated"), existing.get("bucket", "unrelated"))
            dest["sources"] = _unique(list(dest.get("sources") or []) + list(existing.get("sources") or []))
            dest["urls"] = _unique(list(dest.get("urls") or []) + list(existing.get("urls") or []))
            if not dest.get("title") and existing.get("title"):
                dest["title"] = existing["title"]
            dest["seen_at"] = scan_date
            dest["reconsidered_at"] = scan_date
            dest.setdefault("first_seen", existing.get("first_seen") or scan_date)
            del data["seen"][current_key]
        else:
            # Move: same entry object, new canonical key.
            data["seen"][new_canonical] = existing
            del data["seen"][current_key]

        # Retarget every alias that pointed at the old key, then make the
        # old key itself an alias of the new canonical.
        for alias_key, target in list(data["aliases"].items()):
            if target == current_key:
                data["aliases"][alias_key] = new_canonical
        data["aliases"][current_key] = new_canonical
        # Clean up any self-aliases the retarget may have produced.
        for k in [k for k, v in data["aliases"].items() if k == v]:
            del data["aliases"][k]
|
||||
|
||||
|
||||
def ensure_stub_reports(scan_date: str) -> None:
|
||||
|
||||
Reference in New Issue
Block a user