flairNLP · MaxDall · Jun 16, 2026
diff --git a/skills/review-publisher/PLAYBOOK.md b/skills/review-publisher/PLAYBOOK.md
diff --git a/skills/review-publisher/SKILL.md b/skills/review-publisher/SKILL.md
@@ -0,0 +1,33 @@
+---
+name: review-publisher
+description: >-
+  Review a Fundus publisher PR — one that adds a new publisher or adds/changes a parser version.
+  Crawls live articles to verify the extracted ArticleBody mirrors the real article (no missing or
+  leaked content), checks VALID_UNTIL / version bumps / validate=False attributes / free_access, and
+  drafts a single GitHub review. Use when asked to review a publisher or parser PR.
+---
+
+# Review a Publisher PR
+
+**This skill's directory is `${CLAUDE_SKILL_DIR}`** — that placeholder is substituted *only here in
+SKILL.md*, so note the literal path above now. The procedure lives in the sibling
+[`PLAYBOOK.md`](PLAYBOOK.md); wherever its commands write `<skill>`, substitute the literal path.
+The bundled driver — the only tool you need — is:
+
+    python "<skill>/scripts/review.py" {crawl,sweep,show,adjudicate,status,payload} <cc>.<Class>
+
+Open the PLAYBOOK and work through §1–§5; it's the source of truth. These are the guardrails to
+hold onto even before you open it:
+
+- **No PR named? Resolve it first** — read the current branch's PR (`gh pr view`), confirm it, or ask.
+  Never guess or default to the latest.
+- **Don't run `pytest` / `mypy` / `ruff`** — CI covers them; your value-add is live-article correctness.
+- **The body must mirror the article** — no dropped paragraphs, no leaked boilerplate. The driver's
+  crawl-once-then-sweep flow (§2) is a hard gate on **every** publisher: every candidate it surfaces
+  must be explicitly adjudicated (`adjudicate <id> ok|blocker --note ...`), and `status` must report
+  READY before any verdict. The judgment calls stay yours; skipping them silently does not.
+- **Each publisher is its own review** — its own crawl, sweep, and `status READY`; a blocker on one
+  does not discharge the checks on the rest.
+- **Verdict:** any blocker → `REQUEST_CHANGES`, else `COMMENT`. Never `APPROVE`; your own PR → `COMMENT`.
+- **Keep the review tight, and confirm before posting** — one skimmable review, no evidence stated
+  twice (§5); show `review.json` to the user before the `gh api` POST.
diff --git a/skills/review-publisher/requirements.txt b/skills/review-publisher/requirements.txt
@@ -0,0 +1,5 @@
+# Sampler dependencies beyond Fundus itself. Not part of the shipped package's runtime deps.
+# Pinned for Python 3.8 (the repo's CI target).
+numpy==1.24.4
+scikit-learn==1.3.2
+# lxml is already a Fundus dependency.
diff --git a/skills/review-publisher/scripts/_store.py b/skills/review-publisher/scripts/_store.py
@@ -0,0 +1,196 @@
+"""Persistence for the publisher-review driver (`review.py`): one cache dir per review.
+
+The cache dir holds the raw crawled bytes (`NN.html`) plus a single `state.json` that is
+the source of truth for the whole review: crawl parameters, the per-article records, the
+sweep's candidates, and the agent's adjudications. Everything `review.py` knows it knows
+from here, which is what makes the workflow crash-safe (state is rewritten after every
+article) and gateable (`payload_gaps` can name exactly what is still missing).
+
+Layout of a cache dir (default: <tempdir>/fundus-review/<cc>.<Class>/):
+
+    state.json   # crawl meta + article records + sweep candidates + adjudications
+    01.html      # raw html.content bytes for article 1 (exact crawled bytes)
+    02.html      # ...
+"""
+
+import hashlib
+import json
+import shutil
+import tempfile
+import time
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from fundus import Article, PublisherCollection
+from fundus.publishers.base_objects import Publisher
+
+STATE_FILE = "state.json"
+
+# Adjudication verdicts: "ok" = benign (boilerplate outside the body for a drop candidate,
+# legitimately repeated content for a leak candidate); "blocker" = a real finding.
+VERDICTS = ("ok", "blocker")
+
+
+# --- publisher + path resolution ---
+
+
+def resolve_publisher(spec: str) -> Publisher:
+    """`ca.NationalPost` -> the Publisher object on PublisherCollection."""
+    try:
+        cc, name = spec.split(".", 1)
+    except ValueError:
+        raise SystemExit(f"publisher spec must be '<cc>.<Class>', got {spec!r}")
+    region = getattr(PublisherCollection, cc, None)
+    if region is None:
+        raise SystemExit(f"no such country code on PublisherCollection: {cc!r}")
+    publisher = getattr(region, name, None)
+    if not isinstance(publisher, Publisher):
+        raise SystemExit(f"no publisher {name!r} under PublisherCollection.{cc}")
+    return publisher
+
+
+def default_cache_dir(spec: str) -> Path:
+    """Predictable per-publisher temp location, so every subcommand agrees without a path."""
+    return Path(tempfile.gettempdir()) / "fundus-review" / spec
+
+
+def resolve_cache_dir(spec: str, provided: Optional[str]) -> Path:
+    return Path(provided) if provided else default_cache_dir(spec)
+
+
+def prepare_cache_dir(cache_dir: Path) -> None:
+    """Wipe and recreate `cache_dir` for a fresh crawl — refusing anything that isn't a review cache.
+
+    The guard is what makes `--cache-dir` safe: a non-empty directory without a
+    `state.json` (someone's working tree, a typo'd path) is never deleted.
+    """
+    if cache_dir.exists():
+        if any(cache_dir.iterdir()) and not (cache_dir / STATE_FILE).exists():
+            raise SystemExit(
+                f"refusing to wipe {cache_dir}: non-empty and no {STATE_FILE}, so it doesn't look like "
+                f"a review cache. Use an empty or not-yet-existing --cache-dir."
+            )
+        shutil.rmtree(cache_dir)
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+
+# --- state file ---
+
+
+def new_state(spec: str, pool: int) -> Dict[str, Any]:
+    return {
+        "publisher": spec,
+        "crawl": {"pool": pool, "started": time.time(), "finished": None, "completed": False},
+        "articles": [],
+        "sweep": None,
+        "adjudications": {},
+    }
+
+
+def write_state(cache_dir: Path, state: Dict[str, Any]) -> None:
+    """Atomically (write-then-replace) persist the state, so a crash never corrupts it."""
+    tmp = cache_dir / (STATE_FILE + ".tmp")
+    tmp.write_text(json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8")
+    tmp.replace(cache_dir / STATE_FILE)
+
+
+def load_state(cache_dir: Path) -> Optional[Dict[str, Any]]:
+    state_file = cache_dir / STATE_FILE
+    if not state_file.exists():
+        return None
+    state: Dict[str, Any] = json.loads(state_file.read_text(encoding="utf-8"))
+    return state
+
+
+# --- article records ---
+
+
+def html_filename(index: int) -> str:
+    return f"{index:02d}.html"
+
+
+def save_article(cache_dir: Path, index: int, article: Article) -> Dict[str, Any]:
+    """Write one article's raw bytes and return its state record.
+
+    `write_bytes` keeps the cached file the *exact* crawled bytes — text mode would
+    newline-translate on Windows (\\r\\n -> \\r\\r\\n on disk).
+    """
+    (cache_dir / html_filename(index)).write_bytes(article.html.content.encode("utf-8"))
+    body = article.body
+    return {
+        "index": index,
+        "url": article.html.requested_url,
+        "crawl_date": article.html.crawl_date.isoformat(),
+        "title": article.title,
+        "authors": list(article.authors),
+        "topics": list(article.topics),
+        "images": len(article.images),
+        "body": body.serialize() if body is not None else None,
+        "html_file": html_filename(index),
+    }
+
+
+def read_html(cache_dir: Path, record: Dict[str, Any]) -> str:
+    return (cache_dir / str(record["html_file"])).read_bytes().decode("utf-8")
+
+
+def record_crawl_date(record: Dict[str, Any]) -> datetime:
+    return datetime.fromisoformat(str(record["crawl_date"]))
+
+
+def body_units(serialized_body: Optional[Dict[str, Any]]) -> List[str]:
+    """Flatten a serialized ArticleBody into its text units (summary, headlines, paragraphs)."""
+    if not serialized_body:
+        return []
+    units: List[str] = list(serialized_body.get("summary") or [])
+    for section in serialized_body.get("sections") or []:
+        units.extend(section.get("headline") or [])
+        units.extend(section.get("paragraphs") or [])
+    return units
+
+
+# --- candidates + adjudication gate ---
+
+
+def candidate_id(kind: str, *key_parts: str) -> str:
+    """Stable short id from the candidate's content, so re-sweeps keep existing adjudications."""
+    digest = hashlib.sha1("\x1f".join(key_parts).encode("utf-8")).hexdigest()[:6]
+    return f"{kind[0].upper()}{digest}"
+
+
+def candidates(state: Dict[str, Any]) -> List[Dict[str, Any]]:
+    sweep = state.get("sweep") or {}
+    result: List[Dict[str, Any]] = sweep.get("candidates") or []
+    return result
+
+
+def pending_candidates(state: Dict[str, Any]) -> List[Dict[str, Any]]:
+    adjudicated = state.get("adjudications") or {}
+    return [c for c in candidates(state) if c["id"] not in adjudicated]
+
+
+def blocker_candidates(state: Dict[str, Any]) -> List[Dict[str, Any]]:
+    adjudications = state.get("adjudications") or {}
+    return [c for c in candidates(state) if adjudications.get(c["id"], {}).get("verdict") == "blocker"]
+
+
+def payload_gaps(state: Dict[str, Any]) -> List[str]:
+    """Every reason the review is not ready to be written up; empty means the gate is open."""
+    gaps: List[str] = []
+    crawl = state.get("crawl") or {}
+    if not crawl.get("completed"):
+        gaps.append("the crawl did not complete (interrupted?) — re-run `crawl`")
+    if not state.get("articles"):
+        gaps.append("no articles in the cache — 0 crawled is itself a blocker-level finding (see PLAYBOOK §2)")
+    sweep = state.get("sweep")
+    if not sweep:
+        gaps.append("no sweep recorded — run `sweep`")
+    else:
+        if crawl.get("finished") and sweep.get("swept_at", 0) < crawl["finished"]:
+            gaps.append("the sweep predates the last crawl — re-run `sweep`")
+        pending = pending_candidates(state)
+        if pending:
+            ids = ", ".join(c["id"] for c in pending[:15])
+            gaps.append(f"{len(pending)} candidate(s) un-adjudicated: {ids}")
+    return gaps