Files
spectre-meltdown-checker/scripts/vuln_watch/fetch_and_diff.py
2026-04-19 13:04:39 +02:00

532 lines
18 KiB
Python

#!/usr/bin/env python3
"""Fetch all configured sources, dedup against state/seen.json, emit new_items.json.
Writes updated per-source HTTP cache metadata (etag, last_modified, hwm_*) back
into state/seen.json. Does NOT touch state.seen / state.aliases — that is the
merge step's job, after Claude has classified the new items.
Usage:
SCAN_DATE=2026-04-18T14:24:43Z python -m scripts.vuln_watch.fetch_and_diff
"""
from __future__ import annotations
import argparse
import datetime
import gzip
import json
import os
import pathlib
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from typing import Any, Iterable
import feedparser # type: ignore[import-untyped]
from .sources import REQUEST_TIMEOUT, SOURCES, Source, USER_AGENT
from . import state
# Matches CVE identifiers anywhere in free text (year + 4-to-7 digit serial).
CVE_RE = re.compile(r"CVE-\d{4}-\d{4,7}")
# Nominal lookback window in hours; 25 presumably gives a daily cron one
# hour of overlap slack — TODO confirm against the workflow schedule.
DEFAULT_WINDOW_HOURS = 25
# Hard cap on items taken from any single feed/page, bounding output size.
MAX_ITEMS_PER_FEED = 200
# Maximum length of the cleaned text snippet carried per item.
SNIPPET_MAX = 400
# Default path for the diff output consumed by the downstream classify step.
NEW_ITEMS_PATH = pathlib.Path("new_items.json")
def parse_iso(ts: str | None) -> datetime.datetime | None:
if not ts:
return None
try:
return datetime.datetime.fromisoformat(ts.replace("Z", "+00:00"))
except ValueError:
return None
def now_from_scan_date(scan_date: str) -> datetime.datetime:
    """Resolve the scan's "now": the parsed SCAN_DATE if valid, else current UTC."""
    parsed = parse_iso(scan_date)
    if parsed is not None:
        return parsed
    return datetime.datetime.now(datetime.timezone.utc)
def conditional_get(
    url: str,
    etag: str | None,
    last_modified: str | None,
    user_agent: str = USER_AGENT,
) -> tuple[int | str, bytes | None, str | None, str | None]:
    """Fetch *url* using HTTP conditional-request headers.

    Returns (status, body, new_etag, new_last_modified) where status is:
    - 200 with body on success
    - 304 with body=None when unchanged
    - an int HTTP error code on server-side errors
    - a string describing a network/transport failure
    """
    headers = {
        "User-Agent": user_agent,
        # AMD's CDN stalls on non-gzip clients; asking for gzip speeds up
        # every source and is strictly beneficial (we decompress locally).
        "Accept-Encoding": "gzip",
    }
    if etag:
        headers["If-None-Match"] = etag
    if last_modified:
        headers["If-Modified-Since"] = last_modified
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as resp:
            payload = resp.read()
            if resp.headers.get("Content-Encoding", "").lower() == "gzip":
                try:
                    payload = gzip.decompress(payload)
                except OSError:
                    pass  # server lied about encoding; use as-is
            return (
                resp.status,
                payload,
                resp.headers.get("ETag"),
                resp.headers.get("Last-Modified"),
            )
    except urllib.error.HTTPError as exc:
        # urllib surfaces 304 Not Modified as an HTTPError; its code flows
        # through here unchanged, keeping the cached validators.
        return (exc.code, None, etag, last_modified)
    except (urllib.error.URLError, TimeoutError, OSError) as exc:
        return (f"network:{type(exc).__name__}", None, etag, last_modified)
def extract_cves(text: str) -> list[str]:
    """Return the unique CVE identifiers found in *text*, in first-seen order."""
    # dict.fromkeys preserves insertion order while deduplicating.
    return list(dict.fromkeys(CVE_RE.findall(text or "")))
def extract_vendor_ids(text: str, patterns: Iterable[str]) -> list[str]:
    """Collect vendor advisory IDs matching any of *patterns*, deduplicated
    across all patterns in first-seen order."""
    found: list[str] = []
    already: set[str] = set()
    blob = text or ""
    for pattern in patterns:
        for match in re.findall(pattern, blob):
            if match in already:
                continue
            already.add(match)
            found.append(match)
    return found
def pick_stable_id(vendor_ids: list[str], cves: list[str], guid: str, link: str) -> str:
    """Choose the most canonical identifier available.

    Priority: vendor advisory ID, then CVE, then feed guid, then permalink.
    CVE outranks guid/URL so the same CVE seen via different feeds collapses
    on its stable_id alone (in addition to the alias map).
    """
    for id_list in (vendor_ids, cves):
        if id_list:
            return id_list[0]
    return guid or link
def clean_snippet(s: str) -> str:
    """Strip HTML tags and collapse whitespace runs into single spaces."""
    without_tags = re.sub(r"<[^>]+>", " ", s or "")
    collapsed = re.sub(r"\s+", " ", without_tags)
    return collapsed.strip()
def _struct_time_to_iso(st: Any) -> str | None:
if not st:
return None
try:
return datetime.datetime(*st[:6], tzinfo=datetime.timezone.utc).isoformat()
except (TypeError, ValueError):
return None
def parse_feed_body(src: Source, body: bytes) -> list[dict[str, Any]]:
    """Normalize an RSS/Atom body into item dicts (first MAX_ITEMS_PER_FEED
    entries only)."""
    feed = feedparser.parse(body)
    results: list[dict[str, Any]] = []
    for entry in feed.entries[:MAX_ITEMS_PER_FEED]:
        title = (entry.get("title") or "").strip()
        summary = entry.get("summary") or ""
        link = (entry.get("link") or "").strip()
        guid = (entry.get("id") or entry.get("guid") or "").strip()
        # Prefer the published timestamp, falling back to the updated one.
        published_at = _struct_time_to_iso(entry.get("published_parsed"))
        if published_at is None:
            published_at = _struct_time_to_iso(entry.get("updated_parsed"))
        searchable = f"{title}\n{summary}"
        cves = extract_cves(searchable)
        vendor_ids = extract_vendor_ids(searchable, src.advisory_id_patterns)
        results.append({
            "source": src.name,
            "stable_id": pick_stable_id(vendor_ids, cves, guid, link),
            "title": title,
            "permalink": link,
            "guid": guid,
            "published_at": published_at,
            "extracted_cves": cves,
            "vendor_ids": vendor_ids,
            "snippet": clean_snippet(summary)[:SNIPPET_MAX],
        })
    return results
def _parse_intel_psirt(src: Source, text: str) -> list[dict[str, Any]]:
    """Extract advisories from Intel's security-center HTML table.

    Rows have the shape:
        <tr class="data" ...>
          <td ...><a href="/.../intel-sa-NNNNN.html">Title</a></td>
          <td>INTEL-SA-NNNNN</td>
          <td>March 10, 2026</td>   <- Last updated
          <td>March 10, 2026</td>   <- First published
        </tr>
    The later of the two dates becomes `published_at` (most recent activity)
    so updates to older advisories also show up in the window.
    """
    base_url = src.display_url or src.url
    collected: list[dict[str, Any]] = []
    emitted: set[str] = set()
    for row_match in re.finditer(r'<tr class="data"[^>]*>(.*?)</tr>', text, re.DOTALL):
        row_html = row_match.group(1)
        id_match = re.search(r'INTEL-SA-\d+', row_html)
        if id_match is None:
            continue
        advisory_id = id_match.group(0)
        if advisory_id in emitted:
            continue
        emitted.add(advisory_id)
        href = re.search(r'href="([^"#]+)"', row_html)
        permalink = urllib.parse.urljoin(base_url, href.group(1)) if href else base_url
        anchor = re.search(r'<a[^>]*>([^<]+)</a>', row_html)
        title = anchor.group(1).strip() if anchor else advisory_id
        latest: str | None = None
        for date_text in re.findall(r'<td[^>]*>\s*([A-Z][a-z]+ \d{1,2}, \d{4})\s*</td>', row_html):
            try:
                stamped = datetime.datetime.strptime(date_text, "%B %d, %Y").replace(
                    tzinfo=datetime.timezone.utc)
            except ValueError:
                continue
            candidate = stamped.isoformat()
            # ISO strings with identical formatting compare chronologically.
            if latest is None or candidate > latest:
                latest = candidate
        collected.append({
            "source": src.name,
            "stable_id": advisory_id,
            "title": title,
            "permalink": permalink,
            "guid": "",
            "published_at": latest,
            "extracted_cves": extract_cves(row_html),
            "vendor_ids": [advisory_id],
            "snippet": clean_snippet(row_html)[:SNIPPET_MAX],
        })
    return collected[:MAX_ITEMS_PER_FEED]
def _parse_amd_psirt(src: Source, text: str) -> list[dict[str, Any]]:
    """Extract bulletins from AMD's product-security HTML table.

    Each row ends with two `<td data-sort="YYYY-MM-DD HHMMSS">` cells
    (Published Date, Last Updated Date). The machine-readable `data-sort`
    attribute is far easier to parse than the human-readable text alongside
    it; the later of the two timestamps becomes `published_at`.
    """
    base_url = src.display_url or src.url
    collected: list[dict[str, Any]] = []
    emitted: set[str] = set()
    for row_match in re.finditer(r'<tr[^>]*>(.*?AMD-SB-\d+.*?)</tr>', text, re.DOTALL):
        row_html = row_match.group(1)
        id_match = re.search(r'AMD-SB-\d+', row_html)
        if id_match is None:
            continue
        advisory_id = id_match.group(0)
        if advisory_id in emitted:
            continue
        emitted.add(advisory_id)
        href = re.search(r'href="([^"#]+)"', row_html)
        permalink = urllib.parse.urljoin(base_url, href.group(1)) if href else base_url
        anchor = re.search(r'<a[^>]*>([^<]+)</a>', row_html)
        title = anchor.group(1).strip() if anchor else advisory_id
        latest: str | None = None
        for (y, mo, d, h, mi, s) in re.findall(
            r'data-sort="(\d{4})-(\d{2})-(\d{2})\s+(\d{2})(\d{2})(\d{2})"', row_html
        ):
            stamp = f"{y}-{mo}-{d}T{h}:{mi}:{s}+00:00"
            if latest is None or stamp > latest:
                latest = stamp
        collected.append({
            "source": src.name,
            "stable_id": advisory_id,
            "title": title,
            "permalink": permalink,
            "guid": "",
            "published_at": latest,
            "extracted_cves": extract_cves(row_html),
            "vendor_ids": [advisory_id],
            "snippet": clean_snippet(row_html)[:SNIPPET_MAX],
        })
    return collected[:MAX_ITEMS_PER_FEED]
def _parse_html_generic(src: Source, text: str) -> list[dict[str, Any]]:
    """Fallback regex-only extractor for HTML sources with no known table
    layout (arm-spec, transient-fail's tree.js). Emits `published_at=None`
    — items pass the window filter as fail-safe, but state.seen dedup
    prevents re-emission across runs."""
    base_url = src.display_url or src.url
    collected: list[dict[str, Any]] = []
    emitted: set[str] = set()
    for pattern in src.advisory_id_patterns:
        for match in re.finditer(pattern, text):
            advisory_id = match.group(0)
            if advisory_id in emitted:
                continue
            emitted.add(advisory_id)
            # Look at surrounding text for a link and co-mentioned CVEs.
            context = text[max(0, match.start() - 400): match.end() + 400]
            href = re.search(r'href="([^"#]+)"', context)
            permalink = urllib.parse.urljoin(base_url, href.group(1)) if href else base_url
            nearby_cves = extract_cves(context)
            if advisory_id.startswith("CVE-"):
                cves = list({advisory_id, *nearby_cves})
                vendor_ids: list[str] = []
            else:
                cves = nearby_cves
                vendor_ids = [advisory_id]
            collected.append({
                "source": src.name,
                "stable_id": advisory_id,
                "title": advisory_id,
                "permalink": permalink,
                "guid": "",
                "published_at": None,
                "extracted_cves": cves,
                "vendor_ids": vendor_ids,
                "snippet": clean_snippet(context)[:SNIPPET_MAX],
            })
    return collected[:MAX_ITEMS_PER_FEED]
# Registry of source-specific HTML parsers, keyed by Source.name.
# Sources not listed here fall back to the generic regex extractor
# (dispatch happens in parse_html_body).
_HTML_PARSERS = {
    "intel-psirt": _parse_intel_psirt,
    "amd-psirt": _parse_amd_psirt,
}
def parse_html_body(src: Source, body: bytes) -> list[dict[str, Any]]:
    """Dispatch to a per-source HTML parser when one is registered;
    fall back to the generic regex-over-advisory-IDs extractor."""
    decoded = body.decode("utf-8", errors="replace")
    handler = _HTML_PARSERS.get(src.name, _parse_html_generic)
    return handler(src, decoded)
def parse_body(src: Source, body: bytes) -> list[dict[str, Any]]:
    """Route to the feed or HTML parser based on the source's kind."""
    if src.kind in ("rss", "atom"):
        return parse_feed_body(src, body)
    return parse_html_body(src, body)
def compute_cutoff(
    scan_now: datetime.datetime,
    last_run: str | None,
    window_hours: float = DEFAULT_WINDOW_HOURS,
) -> datetime.datetime:
    """Return the timestamp before which items are considered out of scope.

    Normally the cutoff is `scan_now - window_hours`. When the previous
    run is older than that (missed cron runs, failures), widen the window
    back to one hour before that run so nothing published during the gap
    is skipped.

    Args:
        scan_now: timezone-aware "now" for this scan.
        last_run: ISO timestamp of the previous run, or None if unknown.
        window_hours: nominal lookback window in hours.
    """
    base = scan_now - datetime.timedelta(hours=window_hours)
    lr = parse_iso(last_run)
    if lr is None:
        return base
    # Simplified from scan_now - (scan_now - lr + 1h), which is just
    # lr - 1h: anchor one hour before the last run for overlap slack.
    widened = lr - datetime.timedelta(hours=1)
    return min(base, widened)
def _resolve_window_hours() -> float:
    """Pick up WINDOW_HOURS from the environment (set by workflow_dispatch).

    Falls back to DEFAULT_WINDOW_HOURS for cron runs, local invocations,
    or invalid (non-numeric / non-positive) values, warning on stderr in
    the invalid case."""
    raw = os.environ.get("WINDOW_HOURS", "").strip()
    if not raw:
        return float(DEFAULT_WINDOW_HOURS)
    try:
        hours = float(raw)
        if hours <= 0:
            raise ValueError("must be > 0")
    except ValueError:
        print(f"warning: ignoring invalid WINDOW_HOURS={raw!r}, using {DEFAULT_WINDOW_HOURS}",
              file=sys.stderr)
        return float(DEFAULT_WINDOW_HOURS)
    return hours
def backlog_to_reconsider(data: dict[str, Any]) -> list[dict[str, Any]]:
    """Walk state.seen and emit toimplement/tocheck entries for re-review.

    Each entry carries enough context that Claude can re-grep ./checker/
    and decide whether the prior classification still holds. Items in
    `unrelated` are skipped — those are settled.

    Any CVE alias pointing at a canonical ID is folded into that entry's
    `extracted_cves`, so every known CVE is visible without consulting the
    full alias map.
    """
    seen = data.get("seen", {})
    alias_map = data.get("aliases", {})
    # Invert the alias map: canonical -> [alternate, ...]
    alt_ids_for: dict[str, list[str]] = {}
    for alt_id, canonical in alias_map.items():
        alt_ids_for.setdefault(canonical, []).append(alt_id)
    entries: list[dict[str, Any]] = []
    for canonical, record in seen.items():
        bucket = record.get("bucket")
        if bucket not in ("toimplement", "tocheck"):
            continue
        cve_list: list[str] = [canonical] if canonical.startswith("CVE-") else []
        for alt_id in alt_ids_for.get(canonical, []):
            if alt_id.startswith("CVE-") and alt_id not in cve_list:
                cve_list.append(alt_id)
        entries.append({
            "canonical_id": canonical,
            "current_bucket": bucket,
            "title": record.get("title") or "",
            "sources": list(record.get("sources") or []),
            "urls": list(record.get("urls") or []),
            "extracted_cves": cve_list,
            "first_seen": record.get("first_seen"),
        })
    return entries
def candidate_ids(item: dict[str, Any]) -> list[str]:
    """All identifiers under which this item might already be known,
    deduplicated in priority order (CVEs, vendor IDs, stable_id, guid,
    permalink)."""
    ordered: dict[str, None] = {}
    pool = (
        *(item.get("extracted_cves") or []),
        *(item.get("vendor_ids") or []),
        item.get("stable_id"),
        item.get("guid"),
        item.get("permalink"),
    )
    for candidate in pool:
        if candidate:
            ordered.setdefault(candidate, None)
    return list(ordered)
def main() -> int:
    """Fetch all sources, diff against known state, and write new_items.json.

    Always returns 0; fetch failures are surfaced via per-source status
    fields and the fetch_failures_count step output, not the exit code.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--scan-date", default=os.environ.get("SCAN_DATE", ""))
    ap.add_argument("--output", type=pathlib.Path, default=NEW_ITEMS_PATH)
    args = ap.parse_args()
    # Pinning "now" to SCAN_DATE (when set) makes reruns reproducible.
    scan_now = now_from_scan_date(args.scan_date)
    scan_date_iso = scan_now.isoformat()
    window_hours = _resolve_window_hours()
    data = state.load()
    cutoff = compute_cutoff(scan_now, data.get("last_run"), window_hours)
    per_source: dict[str, dict[str, Any]] = {}  # per-source fetch/diff summary
    all_new: list[dict[str, Any]] = []  # items unknown under every candidate ID
    for src in SOURCES:
        # Work on a copy so a failed fetch can't half-mutate stored metadata.
        meta = dict(data["sources"].get(src.name, {}))
        status, body, etag, last_modified = conditional_get(
            src.url, meta.get("etag"), meta.get("last_modified"),
            user_agent=src.user_agent or USER_AGENT,
        )
        meta["last_fetched_at"] = scan_date_iso
        meta["last_status"] = status
        # Transport failure (string status) or HTTP >= 400: record and skip,
        # keeping the old cache validators for the next attempt.
        if isinstance(status, str) or (isinstance(status, int) and status >= 400 and status != 304):
            per_source[src.name] = {"status": status, "new": 0}
            data["sources"][src.name] = meta
            continue
        if status == 304 or body is None:
            per_source[src.name] = {"status": 304, "new": 0}
            data["sources"][src.name] = meta
            continue
        # Refresh cache headers only on successful 200.
        if etag:
            meta["etag"] = etag
        if last_modified:
            meta["last_modified"] = last_modified
        items = parse_body(src, body)
        total = len(items)
        in_window = []
        for it in items:
            pub = parse_iso(it.get("published_at"))
            # Items without a parseable timestamp pass the window fail-safe;
            # state.seen dedup below prevents repeat emission.
            if pub is None or pub >= cutoff:
                in_window.append(it)
        new: list[dict[str, Any]] = []
        # High-water mark: newest published_at emitted for this source
        # (ISO-string comparison; uniform format sorts chronologically).
        hwm_pub = meta.get("hwm_published_at")
        hwm_id = meta.get("hwm_id")
        for it in in_window:
            # Skip anything already known under any of its identifiers.
            if state.lookup(data, candidate_ids(it)) is not None:
                continue
            new.append(it)
            pub = it.get("published_at")
            if pub and (not hwm_pub or pub > hwm_pub):
                hwm_pub = pub
                hwm_id = it.get("stable_id")
        if new:
            meta["hwm_published_at"] = hwm_pub
            meta["hwm_id"] = hwm_id
        data["sources"][src.name] = meta
        per_source[src.name] = {"status": status, "new": len(new), "total_in_feed": total}
        all_new.extend(new)
    # Persist updated HTTP cache metadata regardless of whether Claude runs.
    state.save(data)
    reconsider = backlog_to_reconsider(data)
    out = {
        "scan_date": scan_date_iso,
        "window_cutoff": cutoff.isoformat(),
        "per_source": per_source,
        "items": all_new,
        "reconsider": reconsider,
    }
    args.output.write_text(json.dumps(out, indent=2, sort_keys=True) + "\n")
    # GitHub Actions step outputs. Downstream `if:` conditions gate the
    # classify step on `new_count || reconsider_count`; both must be 0
    # for Claude to be skipped.
    gh_out = os.environ.get("GITHUB_OUTPUT")
    if gh_out:
        with open(gh_out, "a") as f:
            f.write(f"new_count={len(all_new)}\n")
            f.write(f"reconsider_count={len(reconsider)}\n")
            # A source counts as failed unless it ended in a clean 200/304.
            failures = [
                s for s, v in per_source.items()
                if not (isinstance(v["status"], int) and v["status"] in (200, 304))
            ]
            f.write(f"fetch_failures_count={len(failures)}\n")
    # Human-readable run summary for the job log.
    print(f"Scan date: {scan_date_iso}")
    print(f"Window: {window_hours:g} h")
    print(f"Cutoff: {cutoff.isoformat()}")
    print(f"New items: {len(all_new)}")
    print(f"Reconsider: {len(reconsider)} existing toimplement/tocheck entries")
    for s, v in per_source.items():
        print(f" {s:14s} status={str(v['status']):>16} new={v['new']}")
    return 0
if __name__ == "__main__":
    # Propagate main()'s exit code to the shell / CI runner.
    sys.exit(main())