Add Stage 2.8 recall, quality gate, retries, and publish idempotency

This commit is contained in:
Mimikko-zeus
2026-06-10 21:31:13 +08:00
parent 07786e3bc0
commit b46cef2c7b
16 changed files with 1253 additions and 6 deletions

View File

@@ -22,8 +22,128 @@ class RunnerTests(unittest.TestCase):
run_dir = Path(result["run_dir"])
self.assertTrue((run_dir / "blog_markdown.md").exists())
self.assertTrue((run_dir / "run_report.json").exists())
for filename in [
"stage0_sources.json",
"stage1_items.json",
"stage2_items.json",
"stage2_5_items.json",
"stage2_8_candidates.json",
"stage3_items.json",
"stage4_items.json",
"quality_gate.json",
]:
self.assertTrue((run_dir / filename).exists(), filename)
self.assertEqual(result["reports"]["stage8"]["status"], "ok")
def test_run_daily_report_passes_pipeline_config_to_stage_functions(self):
    """Run the daily report with a custom pipeline config and check its effect.

    A pipeline config file (low semantic-dedup deletion ratio, rewrite batch
    size of 1, cross-day dedup disabled) and a single-source config are
    written to a temporary directory. The feed is served by a stub
    ``fetch_text`` and the LLM by a fake client. The test then asserts that
    the stage 3 report flags ``skipped_for_deletion_ratio``, that stage 4
    reports three batches, and that a ``quality_gate`` report is present.
    """

    class FakeLlmClient:
        """Fake LLM client that picks its reply from the keys in the JSON prompt."""

        def chat(self, prompt):
            request = json.loads(prompt)
            if "candidates" in request:
                return json.dumps(self._duplicate_reply(request))
            if "allowed_sections" in request:
                return json.dumps(self._rewrite_reply(request))
            return json.dumps(self._default_reply(request))

        @staticmethod
        def _duplicate_reply(request):
            # Declare the first two ids of the first candidate to be duplicates.
            item_ids = request["candidates"][0]["item_ids"]
            group = {
                "keep_id": item_ids[0],
                "remove_ids": [item_ids[1]],
                "confidence": "high",
                "reason": "same event",
            }
            return {
                "duplicate_groups": [group],
                "not_duplicates": [],
                "uncertain": [],
            }

        @staticmethod
        def _rewrite_reply(request):
            # Echo each item's raw title and summary back unchanged.
            rewrites = []
            for entry in request["items"]:
                rewrites.append(
                    {
                        "id": entry["id"],
                        "title": entry["title_raw"],
                        "summary": entry["summary_raw"],
                        "flags": [],
                    }
                )
            return {"rewrites": rewrites}

        @staticmethod
        def _default_reply(request):
            # Any other prompt gets a one-thread narrative citing the first item.
            thread = {
                "title": "Config thread",
                "text": "Config values reached the pipeline.",
                "item_ids": [request["items"][0]["id"]],
                "kind": "thread",
            }
            return {
                "intro": "Daily intro.",
                "theme": "Pipeline config.",
                "threads": [thread],
                "conclusion": "Done.",
            }

    # Three feed items; the first two have near-identical titles.
    rss_feed = """<?xml version="1.0"?><rss><channel>
<item><title>Anthropic launches Claude Code</title><link>https://example.com/a</link><description>Anthropic launches Claude Code for developers.</description></item>
<item><title>Anthropic launch Claude Code</title><link>https://example.com/b</link><description>Anthropic launch Claude Code for coding.</description></item>
<item><title>Gemini CLI update</title><link>https://example.com/c</link><description>Google updates Gemini CLI.</description></item>
</channel></rss>"""

    def fetch_text(url, timeout):
        # Serve the canned feed regardless of the requested URL or timeout.
        return rss_feed

    def make_llm_client(**config):
        # The factory's keyword configuration is accepted but not used.
        return FakeLlmClient()

    pipeline_settings = {
        "semantic_dedup_max_deletion_ratio": 0.1,
        "rewrite_batch_size": 1,
        "cross_day_dedup": {"enabled": False},
    }
    source_settings = [
        {
            "name": "AI HOT",
            "type": "rss",
            "url": "https://feed.example/rss",
            "role": "primary",
            "priority": 10,
            "enabled": True,
        }
    ]
    llm_env = {
        "LLM_API_KEY": "test-key",
        "LLM_BASE_URL": "https://llm.example/v1",
        "LLM_MODEL": "test-model",
    }

    with TemporaryDirectory() as temp_dir:
        workspace = Path(temp_dir)

        pipeline_file = workspace / "pipeline.json"
        pipeline_file.write_text(json.dumps(pipeline_settings), encoding="utf-8")

        sources_file = workspace / "sources.json"
        sources_file.write_text(json.dumps(source_settings), encoding="utf-8")

        result = run_daily_report(
            run_date="2026-06-10",
            mode="dry-run",
            source_mode="live",
            llm_mode="live",
            out_dir=workspace / "out",
            base_url="https://blog.example",
            sources_path=sources_file,
            pipeline_path=pipeline_file,
            fetch_text=fetch_text,
            env=llm_env,
            llm_client_factory=make_llm_client,
        )

    # NOTE(review): presumably the proposed removal exceeds the 0.1 ratio,
    # and batch size 1 over three items gives three batches -- confirm
    # against the stage implementations.
    self.assertTrue(result["reports"]["stage3"]["skipped_for_deletion_ratio"])
    self.assertEqual(result["reports"]["stage4"]["batch_count"], 3)
    self.assertIn("quality_gate", result["reports"])
def test_run_daily_report_live_sources_can_use_config_and_fetch_text(self):
with TemporaryDirectory() as temp_dir:
out_dir = Path(temp_dir) / "out"