throttle reconsider pass by last-review age (default 7 days)

This commit is contained in:
Stéphane Lesimple
2026-04-19 12:17:31 +00:00
parent 31cf549c75
commit 7a3224ad61

View File

@@ -31,6 +31,7 @@ from . import state
CVE_RE = re.compile(r"CVE-\d{4}-\d{4,7}") CVE_RE = re.compile(r"CVE-\d{4}-\d{4,7}")
DEFAULT_WINDOW_HOURS = 25 DEFAULT_WINDOW_HOURS = 25
DEFAULT_RECONSIDER_AGE_DAYS = 7
MAX_ITEMS_PER_FEED = 200 MAX_ITEMS_PER_FEED = 200
SNIPPET_MAX = 400 SNIPPET_MAX = 400
NEW_ITEMS_PATH = pathlib.Path("new_items.json") NEW_ITEMS_PATH = pathlib.Path("new_items.json")
@@ -362,28 +363,60 @@ def _resolve_window_hours() -> float:
return float(DEFAULT_WINDOW_HOURS) return float(DEFAULT_WINDOW_HOURS)
def _resolve_reconsider_age_days() -> float:
    """Return the reconsider throttle age, in days, from the environment.

    Reads RECONSIDER_AGE_DAYS. Entries whose last review (reconsidered_at,
    or first_seen if never reconsidered) is more recent than this many days
    ago are skipped. 0 = reconsider everything every run (no throttle).

    An unset or blank variable yields DEFAULT_RECONSIDER_AGE_DAYS. A value
    that is not a number, is negative, or is not finite ("nan", "inf") is
    ignored with a warning on stderr, and the default is returned instead.
    """
    import math  # local import: only needed for the finiteness check below

    raw = os.environ.get("RECONSIDER_AGE_DAYS", "").strip()
    if not raw:
        return float(DEFAULT_RECONSIDER_AGE_DAYS)

    try:
        days = float(raw)
    except ValueError:
        days = math.nan  # funnel unparsable input into the rejection path

    # float() happily parses "nan" / "inf" / "infinity", and NaN compares
    # False against everything, so a plain `days < 0` test lets them through.
    # A non-finite day count cannot be converted to a datetime.timedelta, so
    # treat it as invalid here rather than letting it fail later in the run.
    if math.isfinite(days) and days >= 0:
        return days

    print(f"warning: ignoring invalid RECONSIDER_AGE_DAYS={raw!r}, "
          f"using {DEFAULT_RECONSIDER_AGE_DAYS}", file=sys.stderr)
    return float(DEFAULT_RECONSIDER_AGE_DAYS)
def backlog_to_reconsider(
data: dict[str, Any],
scan_now: datetime.datetime,
min_age_days: float = DEFAULT_RECONSIDER_AGE_DAYS,
) -> list[dict[str, Any]]:
"""Walk state.seen and emit toimplement/tocheck entries for re-review. """Walk state.seen and emit toimplement/tocheck entries for re-review.
Each entry carries enough context that Claude can re-grep ./checker/ Throttle: skip entries whose "last review" timestamp is more recent
and decide whether the prior classification still holds. Items in than `min_age_days` ago. "Last review" is `reconsidered_at` if Claude
`unrelated` are skipped — those are settled. has already reconsidered the entry at least once, otherwise
`first_seen` (the initial classification was itself a review). With
`min_age_days=0` the throttle is disabled — every qualifying entry
is emitted on every run.
Items in `unrelated` are never emitted — those are settled.
A CVE alias pointing at this canonical is included in `extracted_cves` A CVE alias pointing at this canonical is included in `extracted_cves`
so Claude sees every known CVE for the item without having to consult so Claude sees every known CVE for the item without having to consult
the full alias map. the full alias map.
""" """
seen = data.get("seen", {}) seen = data.get("seen", {})
aliases = data.get("aliases", {}) aliases = data.get("aliases", {})
# Reverse-index aliases: canonical -> [alt, ...]
by_canonical: dict[str, list[str]] = {} by_canonical: dict[str, list[str]] = {}
for alt, canon in aliases.items(): for alt, canon in aliases.items():
by_canonical.setdefault(canon, []).append(alt) by_canonical.setdefault(canon, []).append(alt)
# Any entry whose last review is newer than this ISO cutoff is throttled.
cutoff = (scan_now - datetime.timedelta(days=min_age_days)).isoformat()
out: list[dict[str, Any]] = [] out: list[dict[str, Any]] = []
for canonical, rec in seen.items(): for canonical, rec in seen.items():
if rec.get("bucket") not in ("toimplement", "tocheck"): if rec.get("bucket") not in ("toimplement", "tocheck"):
continue continue
last_reviewed = rec.get("reconsidered_at") or rec.get("first_seen") or ""
if min_age_days > 0 and last_reviewed and last_reviewed > cutoff:
continue
cves: list[str] = [] cves: list[str] = []
if canonical.startswith("CVE-"): if canonical.startswith("CVE-"):
cves.append(canonical) cves.append(canonical)
@@ -398,6 +431,7 @@ def backlog_to_reconsider(data: dict[str, Any]) -> list[dict[str, Any]]:
"urls": list(rec.get("urls") or []), "urls": list(rec.get("urls") or []),
"extracted_cves": cves, "extracted_cves": cves,
"first_seen": rec.get("first_seen"), "first_seen": rec.get("first_seen"),
"reconsidered_at": rec.get("reconsidered_at"),
}) })
return out return out
@@ -428,6 +462,7 @@ def main() -> int:
scan_now = now_from_scan_date(args.scan_date) scan_now = now_from_scan_date(args.scan_date)
scan_date_iso = scan_now.isoformat() scan_date_iso = scan_now.isoformat()
window_hours = _resolve_window_hours() window_hours = _resolve_window_hours()
reconsider_age_days = _resolve_reconsider_age_days()
data = state.load() data = state.load()
cutoff = compute_cutoff(scan_now, data.get("last_run"), window_hours) cutoff = compute_cutoff(scan_now, data.get("last_run"), window_hours)
@@ -491,7 +526,7 @@ def main() -> int:
# Persist updated HTTP cache metadata regardless of whether Claude runs. # Persist updated HTTP cache metadata regardless of whether Claude runs.
state.save(data) state.save(data)
reconsider = backlog_to_reconsider(data) reconsider = backlog_to_reconsider(data, scan_now, reconsider_age_days)
out = { out = {
"scan_date": scan_date_iso, "scan_date": scan_date_iso,
@@ -520,7 +555,11 @@ def main() -> int:
print(f"Window: {window_hours:g} h") print(f"Window: {window_hours:g} h")
print(f"Cutoff: {cutoff.isoformat()}") print(f"Cutoff: {cutoff.isoformat()}")
print(f"New items: {len(all_new)}") print(f"New items: {len(all_new)}")
print(f"Reconsider: {len(reconsider)} existing toimplement/tocheck entries") if reconsider_age_days == 0:
print(f"Reconsider: {len(reconsider)} (throttle disabled)")
else:
print(f"Reconsider: {len(reconsider)} (throttle: "
f"skip entries reviewed <{reconsider_age_days:g}d ago)")
for s, v in per_source.items(): for s, v in per_source.items():
print(f" {s:14s} status={str(v['status']):>16} new={v['new']}") print(f" {s:14s} status={str(v['status']):>16} new={v['new']}")