Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
307 changes: 307 additions & 0 deletions skills/review-publisher/PLAYBOOK.md

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions skills/review-publisher/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
name: review-publisher
description: >-
Review a Fundus publisher PR — one that adds a new publisher or adds/changes a parser version.
Crawls live articles to verify the extracted ArticleBody mirrors the real article (no missing or
leaked content), checks VALID_UNTIL / version bumps / validate=False attributes / free_access, and
drafts a single GitHub review. Use when asked to review a publisher or parser PR.
---

# Review a Publisher PR

**This skill's directory is `${CLAUDE_SKILL_DIR}`** — that placeholder is substituted *only here in
SKILL.md*, so note the literal path above now. The procedure lives in the sibling
[`PLAYBOOK.md`](PLAYBOOK.md); wherever its commands write `<skill>`, substitute the literal path.
The bundled driver — the only tool you need — is:

python "<skill>/scripts/review.py" {crawl,sweep,show,adjudicate,status,payload} <cc>.<Class>

Open the PLAYBOOK and work through §1–§5; it's the source of truth. These are the guardrails to
hold onto even before you open it:

- **No PR named? Resolve it first** — read the current branch's PR (`gh pr view`), confirm it, or ask.
Never guess or default to the latest.
- **Don't run `pytest` / `mypy` / `ruff`** — CI covers them; your value-add is live-article correctness.
- **The body must mirror the article** — no dropped paragraphs, no leaked boilerplate. The driver's
crawl-once-then-sweep flow (§2) is a hard gate on **every** publisher: every candidate it surfaces
must be explicitly adjudicated (`adjudicate <id> ok|blocker --note ...`), and `status` must report
READY before any verdict. The judgment calls stay yours; skipping them silently does not.
- **Each publisher is its own review** — its own crawl, sweep, and `status READY`; a blocker on one
does not discharge the checks on the rest.
- **Verdict:** any blocker → `REQUEST_CHANGES`, else `COMMENT`. Never `APPROVE`; your own PR → `COMMENT`.
- **Keep the review tight, and confirm before posting** — one skimmable review, no evidence stated
twice (§5); show `review.json` to the user before the `gh api` POST.
5 changes: 5 additions & 0 deletions skills/review-publisher/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Sampler dependencies beyond Fundus itself. Not part of the shipped package's runtime deps.
# Pinned for Python 3.8 (the repo's CI target).
numpy==1.24.4
scikit-learn==1.3.2
# lxml is already a Fundus dependency.
196 changes: 196 additions & 0 deletions skills/review-publisher/scripts/_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
"""Persistence for the publisher-review driver (`review.py`): one cache dir per review.

The cache dir holds the raw crawled bytes (`NN.html`) plus a single `state.json` that is
the source of truth for the whole review: crawl parameters, the per-article records, the
sweep's candidates, and the agent's adjudications. Everything `review.py` knows it knows
from here, which is what makes the workflow crash-safe (state is rewritten after every
article) and gateable (`payload_gaps` can name exactly what is still missing).

Layout of a cache dir (default: <tempdir>/fundus-review/<cc>.<Class>/):

state.json # crawl meta + article records + sweep candidates + adjudications
01.html # raw html.content bytes for article 1 (exact crawled bytes)
02.html # ...
"""

import hashlib
import json
import shutil
import tempfile
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

from fundus import Article, PublisherCollection
from fundus.publishers.base_objects import Publisher

STATE_FILE = "state.json"

# Adjudication verdicts: "ok" = benign (boilerplate outside the body for a drop candidate,
# legitimately repeated content for a leak candidate); "blocker" = a real finding.
VERDICTS = ("ok", "blocker")


# --- publisher + path resolution ---


def resolve_publisher(spec: str) -> Publisher:
"""`ca.NationalPost` -> the Publisher object on PublisherCollection."""
try:
cc, name = spec.split(".", 1)
except ValueError:
raise SystemExit(f"publisher spec must be '<cc>.<Class>', got {spec!r}")
region = getattr(PublisherCollection, cc, None)
if region is None:
raise SystemExit(f"no such country code on PublisherCollection: {cc!r}")
publisher = getattr(region, name, None)
if not isinstance(publisher, Publisher):
raise SystemExit(f"no publisher {name!r} under PublisherCollection.{cc}")
return publisher


def default_cache_dir(spec: str) -> Path:
"""Predictable per-publisher temp location, so every subcommand agrees without a path."""
return Path(tempfile.gettempdir()) / "fundus-review" / spec


def resolve_cache_dir(spec: str, provided: Optional[str]) -> Path:
return Path(provided) if provided else default_cache_dir(spec)


def prepare_cache_dir(cache_dir: Path) -> None:
"""Wipe and recreate `cache_dir` for a fresh crawl — refusing anything that isn't a review cache.

The guard is what makes `--cache-dir` safe: a non-empty directory without a
`state.json` (someone's working tree, a typo'd path) is never deleted.
"""
if cache_dir.exists():
if any(cache_dir.iterdir()) and not (cache_dir / STATE_FILE).exists():
raise SystemExit(
f"refusing to wipe {cache_dir}: non-empty and no {STATE_FILE}, so it doesn't look like "
f"a review cache. Use an empty or not-yet-existing --cache-dir."
)
shutil.rmtree(cache_dir)
cache_dir.mkdir(parents=True, exist_ok=True)


# --- state file ---


def new_state(spec: str, pool: int) -> Dict[str, Any]:
return {
"publisher": spec,
"crawl": {"pool": pool, "started": time.time(), "finished": None, "completed": False},
"articles": [],
"sweep": None,
"adjudications": {},
}


def write_state(cache_dir: Path, state: Dict[str, Any]) -> None:
"""Atomically (write-then-replace) persist the state, so a crash never corrupts it."""
tmp = cache_dir / (STATE_FILE + ".tmp")
tmp.write_text(json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8")
tmp.replace(cache_dir / STATE_FILE)


def load_state(cache_dir: Path) -> Optional[Dict[str, Any]]:
state_file = cache_dir / STATE_FILE
if not state_file.exists():
return None
state: Dict[str, Any] = json.loads(state_file.read_text(encoding="utf-8"))
return state


# --- article records ---


def html_filename(index: int) -> str:
return f"{index:02d}.html"


def save_article(cache_dir: Path, index: int, article: Article) -> Dict[str, Any]:
"""Write one article's raw bytes and return its state record.

`write_bytes` keeps the cached file the *exact* crawled bytes — text mode would
newline-translate on Windows (\\r\\n -> \\r\\r\\n on disk).
"""
(cache_dir / html_filename(index)).write_bytes(article.html.content.encode("utf-8"))
body = article.body
return {
"index": index,
"url": article.html.requested_url,
"crawl_date": article.html.crawl_date.isoformat(),
"title": article.title,
"authors": list(article.authors),
"topics": list(article.topics),
"images": len(article.images),
"body": body.serialize() if body is not None else None,
"html_file": html_filename(index),
}


def read_html(cache_dir: Path, record: Dict[str, Any]) -> str:
return (cache_dir / str(record["html_file"])).read_bytes().decode("utf-8")


def record_crawl_date(record: Dict[str, Any]) -> datetime:
return datetime.fromisoformat(str(record["crawl_date"]))


def body_units(serialized_body: Optional[Dict[str, Any]]) -> List[str]:
"""Flatten a serialized ArticleBody into its text units (summary, headlines, paragraphs)."""
if not serialized_body:
return []
units: List[str] = list(serialized_body.get("summary") or [])
for section in serialized_body.get("sections") or []:
units.extend(section.get("headline") or [])
units.extend(section.get("paragraphs") or [])
return units


# --- candidates + adjudication gate ---


def candidate_id(kind: str, *key_parts: str) -> str:
"""Stable short id from the candidate's content, so re-sweeps keep existing adjudications."""
digest = hashlib.sha1("\x1f".join(key_parts).encode("utf-8")).hexdigest()[:6]
return f"{kind[0].upper()}{digest}"


def candidates(state: Dict[str, Any]) -> List[Dict[str, Any]]:
sweep = state.get("sweep") or {}
result: List[Dict[str, Any]] = sweep.get("candidates") or []
return result


def pending_candidates(state: Dict[str, Any]) -> List[Dict[str, Any]]:
adjudicated = state.get("adjudications") or {}
return [c for c in candidates(state) if c["id"] not in adjudicated]


def blocker_candidates(state: Dict[str, Any]) -> List[Dict[str, Any]]:
adjudications = state.get("adjudications") or {}
return [c for c in candidates(state) if adjudications.get(c["id"], {}).get("verdict") == "blocker"]


def payload_gaps(state: Dict[str, Any]) -> List[str]:
"""Every reason the review is not ready to be written up; empty means the gate is open."""
gaps: List[str] = []
crawl = state.get("crawl") or {}
if not crawl.get("completed"):
gaps.append("the crawl did not complete (interrupted?) — re-run `crawl`")
if not state.get("articles"):
gaps.append("no articles in the cache — 0 crawled is itself a blocker-level finding (see PLAYBOOK §2)")
sweep = state.get("sweep")
if not sweep:
gaps.append("no sweep recorded — run `sweep`")
else:
if crawl.get("finished") and sweep.get("swept_at", 0) < crawl["finished"]:
gaps.append("the sweep predates the last crawl — re-run `sweep`")
pending = pending_candidates(state)
if pending:
ids = ", ".join(c["id"] for c in pending[:15])
gaps.append(f"{len(pending)} candidate(s) un-adjudicated: {ids}")
return gaps
Loading