Add Stage 2.8 recall, quality gate, retries, and publish idempotency

2026-06-10 21:31:13 +08:00
parent 07786e3bc0
commit b46cef2c7b
16 changed files with 1253 additions and 6 deletions
--- a/tests/test_candidate_recall.py
+++ b/tests/test_candidate_recall.py
@@ -0,0 +1,79 @@
+import unittest
+
+from ai_daily_report.candidate_recall import recall_semantic_candidates
+from ai_daily_report.models import NewsItem
+from ai_daily_report.normalize import normalize_title
+
+
+def item(item_id, title, summary):
+    return NewsItem(
+        id=item_id,
+        source_group="AI HOT",
+        source_label="AI HOT",
+        source_role="primary",
+        source_priority=10,
+        title_raw=title,
+        title_norm=normalize_title(title),
+        summary_raw=summary,
+        url=f"https://example.com/{item_id}",
+        canonical_url=f"https://example.com/{item_id}",
+    )
+
+
+class CandidateRecallTests(unittest.TestCase):
+    def test_recalls_shared_event_entities_when_titles_are_not_stage2_similar(self):
+        items = [
+            item(
+                "a",
+                "Anthropic 被曝开发 Claude Fable",
+                "Anthropic 正在开发名为 Claude Fable 和 Claude Mythos 的新产品。",
+            ),
+            item(
+                "b",
+                "Claude Mythos 进入内部测试",
+                "Anthropic 的 Claude Mythos 与 Claude Fable 面向内容生成场景。",
+            ),
+            item(
+                "c",
+                "Gemini CLI 发布更新",
+                "Google 为 Gemini CLI 增加新的开发者命令。",
+            ),
+        ]
+
+        candidates, report = recall_semantic_candidates(items, existing_candidates=[])
+
+        candidate_sets = [set(candidate["item_ids"]) for candidate in candidates]
+        self.assertIn({"a", "b"}, candidate_sets)
+        self.assertNotIn({"a", "c"}, candidate_sets)
+        self.assertEqual(report["candidate_group_count"], 1)
+        self.assertEqual(candidates[0]["reason"], "strong_entity_overlap")
+
+    def test_does_not_group_same_company_different_products_without_event_overlap(self):
+        items = [
+            item("gemini", "Google 发布 Gemini CLI", "Google 发布面向开发者的 Gemini CLI 工具。"),
+            item("gemma", "Google 开源 Gemma 3n", "Google 开源 Gemma 3n 模型，面向端侧部署。"),
+        ]
+
+        candidates, report = recall_semantic_candidates(items, existing_candidates=[])
+
+        self.assertEqual(candidates, [])
+        self.assertEqual(report["candidate_group_count"], 0)
+
+    def test_preserves_existing_candidates_and_adds_new_ones_without_duplicates(self):
+        items = [
+            item("a", "Anthropic 发布 Claude Fable", "Claude Fable 与 Claude Mythos 同时曝光。"),
+            item("b", "Claude Mythos 新功能曝光", "Claude Mythos 和 Claude Fable 是 Anthropic 新项目。"),
+        ]
+
+        candidates, report = recall_semantic_candidates(
+            items,
+            existing_candidates=[{"item_ids": ["a", "b"], "reason": "title_similarity"}],
+        )
+
+        self.assertEqual(len(candidates), 1)
+        self.assertEqual(candidates[0]["reason"], "title_similarity")
+        self.assertEqual(report["existing_candidate_group_count"], 1)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_clients.py
+++ b/tests/test_clients.py
@@ -1,8 +1,9 @@
 import json
 import unittest
+from urllib.error import HTTPError
 from unittest.mock import patch

-from ai_daily_report.clients import BlogApiClient, OpenAICompatibleClient, fetch_text
+from ai_daily_report.clients import FetchTextError, BlogApiClient, OpenAICompatibleClient, fetch_text


 class FakeResponse:
@@ -26,6 +27,28 @@ class ClientTests(unittest.TestCase):
        with patch("urllib.request.urlopen", return_value=FakeResponse("ok".encode("utf-8"))):
            self.assertEqual(fetch_text("https://example.com", 1), "ok")

+    def test_fetch_text_retries_transient_http_errors(self):
+        responses = [
+            HTTPError("https://example.com", 503, "Service Unavailable", {}, None),
+            FakeResponse("ok".encode("utf-8")),
+        ]
+        with patch("urllib.request.urlopen", side_effect=responses) as urlopen:
+            self.assertEqual(fetch_text("https://example.com", 1, retries=1, backoff_seconds=0), "ok")
+
+        self.assertEqual(urlopen.call_count, 2)
+
+    def test_fetch_text_does_not_retry_404_and_classifies_error(self):
+        with patch(
+            "urllib.request.urlopen",
+            side_effect=HTTPError("https://example.com", 404, "Not Found", {}, None),
+        ) as urlopen:
+            with self.assertRaises(FetchTextError) as context:
+                fetch_text("https://example.com", 1, retries=2, backoff_seconds=0)
+
+        self.assertEqual(urlopen.call_count, 1)
+        self.assertEqual(context.exception.error_type, "http_404")
+        self.assertEqual(context.exception.http_status, 404)
+
    def test_openai_compatible_client_returns_message_content(self):
        body = json.dumps({"choices": [{"message": {"content": "hello"}}]}).encode("utf-8")
        with patch("urllib.request.urlopen", return_value=FakeResponse(body)):
--- a/tests/test_quality_gate.py
+++ b/tests/test_quality_gate.py
@@ -0,0 +1,78 @@
+import unittest
+
+from ai_daily_report.models import NewsItem, SourceResult
+from ai_daily_report.quality_gate import evaluate_quality_gate
+
+
+def news_item(item_id, title="Story"):
+    return NewsItem(
+        id=item_id,
+        source_group="AI HOT",
+        source_label="AI HOT",
+        source_role="primary",
+        source_priority=10,
+        title_raw=f"{title} {item_id}",
+        title_norm=f"{title} {item_id}".lower(),
+        summary_raw="summary",
+        url=f"https://example.com/{item_id}",
+        canonical_url=f"https://example.com/{item_id}",
+    )
+
+
+class QualityGateTests(unittest.TestCase):
+    def test_warns_when_stage3_candidates_zero_for_large_item_set(self):
+        items = [news_item(str(index)) for index in range(31)]
+        report = evaluate_quality_gate(
+            items,
+            source_results=[],
+            reports={"stage3": {"candidate_group_count": 0}},
+            config={"warn_when_stage3_candidates_zero_min_items": 30},
+        )
+
+        self.assertIn("stage3_candidates_zero", report["warnings"])
+        self.assertEqual(report["blocking_errors"], [])
+
+    def test_warns_on_enabled_source_failure(self):
+        report = evaluate_quality_gate(
+            [news_item("a")],
+            source_results=[
+                SourceResult(
+                    source="橘鸦AI早报",
+                    role="supplement",
+                    ok=False,
+                    status="error",
+                    error="HTTPError: 404",
+                )
+            ],
+            reports={"stage3": {"candidate_group_count": 1}},
+            config={"warn_on_enabled_source_failure": True},
+        )
+
+        self.assertIn("enabled_source_failed:橘鸦AI早报:error", report["warnings"])
+        self.assertEqual(report["source_failures"][0]["source"], "橘鸦AI早报")
+
+    def test_blocks_required_source_failure_when_configured(self):
+        report = evaluate_quality_gate(
+            [news_item("a")],
+            source_results=[
+                SourceResult(
+                    source="AI HOT",
+                    role="primary",
+                    ok=False,
+                    status="timeout",
+                    error="TimeoutError",
+                )
+            ],
+            reports={"stage3": {"candidate_group_count": 1}},
+            config={
+                "block_on_required_source_failure": True,
+                "required_sources": ["AI HOT"],
+            },
+        )
+
+        self.assertIn("required_source_failed:AI HOT:timeout", report["blocking_errors"])
+        self.assertTrue(report["quality_gate_failed"])
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -22,8 +22,128 @@ class RunnerTests(unittest.TestCase):
            run_dir = Path(result["run_dir"])
            self.assertTrue((run_dir / "blog_markdown.md").exists())
            self.assertTrue((run_dir / "run_report.json").exists())
+            for filename in [
+                "stage0_sources.json",
+                "stage1_items.json",
+                "stage2_items.json",
+                "stage2_5_items.json",
+                "stage2_8_candidates.json",
+                "stage3_items.json",
+                "stage4_items.json",
+                "quality_gate.json",
+            ]:
+                self.assertTrue((run_dir / filename).exists(), filename)
            self.assertEqual(result["reports"]["stage8"]["status"], "ok")

+    def test_run_daily_report_passes_pipeline_config_to_stage_functions(self):
+        class FakeLlmClient:
+            def chat(self, prompt):
+                payload = json.loads(prompt)
+                if "candidates" in payload:
+                    first_candidate = payload["candidates"][0]["item_ids"]
+                    return json.dumps(
+                        {
+                            "duplicate_groups": [
+                                {
+                                    "keep_id": first_candidate[0],
+                                    "remove_ids": [first_candidate[1]],
+                                    "confidence": "high",
+                                    "reason": "same event",
+                                }
+                            ],
+                            "not_duplicates": [],
+                            "uncertain": [],
+                        }
+                    )
+                if "allowed_sections" in payload:
+                    return json.dumps(
+                        {
+                            "rewrites": [
+                                {
+                                    "id": item["id"],
+                                    "title": item["title_raw"],
+                                    "summary": item["summary_raw"],
+                                    "flags": [],
+                                }
+                                for item in payload["items"]
+                            ]
+                        }
+                    )
+                return json.dumps(
+                    {
+                        "intro": "Daily intro.",
+                        "theme": "Pipeline config.",
+                        "threads": [
+                            {
+                                "title": "Config thread",
+                                "text": "Config values reached the pipeline.",
+                                "item_ids": [payload["items"][0]["id"]],
+                                "kind": "thread",
+                            }
+                        ],
+                        "conclusion": "Done.",
+                    }
+                )
+
+        with TemporaryDirectory() as temp_dir:
+            temp_path = Path(temp_dir)
+            pipeline_config = temp_path / "pipeline.json"
+            pipeline_config.write_text(
+                json.dumps(
+                    {
+                        "semantic_dedup_max_deletion_ratio": 0.1,
+                        "rewrite_batch_size": 1,
+                        "cross_day_dedup": {"enabled": False},
+                    }
+                ),
+                encoding="utf-8",
+            )
+            source_config = temp_path / "sources.json"
+            source_config.write_text(
+                json.dumps(
+                    [
+                        {
+                            "name": "AI HOT",
+                            "type": "rss",
+                            "url": "https://feed.example/rss",
+                            "role": "primary",
+                            "priority": 10,
+                            "enabled": True,
+                        }
+                    ]
+                ),
+                encoding="utf-8",
+            )
+
+            def fetch_text(url, timeout):
+                return """<?xml version="1.0"?><rss><channel>
+<item><title>Anthropic launches Claude Code</title><link>https://example.com/a</link><description>Anthropic launches Claude Code for developers.</description></item>
+<item><title>Anthropic launch Claude Code</title><link>https://example.com/b</link><description>Anthropic launch Claude Code for coding.</description></item>
+<item><title>Gemini CLI update</title><link>https://example.com/c</link><description>Google updates Gemini CLI.</description></item>
+</channel></rss>"""
+
+            result = run_daily_report(
+                run_date="2026-06-10",
+                mode="dry-run",
+                source_mode="live",
+                llm_mode="live",
+                out_dir=temp_path / "out",
+                base_url="https://blog.example",
+                sources_path=source_config,
+                pipeline_path=pipeline_config,
+                fetch_text=fetch_text,
+                env={
+                    "LLM_API_KEY": "test-key",
+                    "LLM_BASE_URL": "https://llm.example/v1",
+                    "LLM_MODEL": "test-model",
+                },
+                llm_client_factory=lambda **config: FakeLlmClient(),
+            )
+
+        self.assertTrue(result["reports"]["stage3"]["skipped_for_deletion_ratio"])
+        self.assertEqual(result["reports"]["stage4"]["batch_count"], 3)
+        self.assertIn("quality_gate", result["reports"])
+
    def test_run_daily_report_live_sources_can_use_config_and_fetch_text(self):
        with TemporaryDirectory() as temp_dir:
            out_dir = Path(temp_dir) / "out"
--- a/tests/test_stage0_collect.py
+++ b/tests/test_stage0_collect.py
@@ -1,5 +1,6 @@
 import unittest

+from ai_daily_report.clients import FetchTextError
 from ai_daily_report.collect import collect_sources
 from ai_daily_report.models import SourceConfig

@@ -44,6 +45,18 @@ class Stage0CollectTests(unittest.TestCase):
        self.assertEqual(report["failed_source_count"], 1)
        self.assertEqual(report["raw_item_count"], 1)

+    def test_collect_sources_records_fetch_text_error_metadata(self):
+        configs = [SourceConfig(name="RSS", type="rss", retries=2)]
+
+        def fetcher(config, run_date):
+            raise FetchTextError("http_404", "HTTPError: 404", http_status=404, attempts=1)
+
+        results, report = collect_sources(configs, "2026-06-10", fetcher=fetcher)
+
+        self.assertEqual(results[0].status, "http_404")
+        self.assertEqual(results[0].retry_count, 0)
+        self.assertIn("http_404", report["error_types"]["RSS"])
+

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_stage0_to_4_pipeline.py
+++ b/tests/test_stage0_to_4_pipeline.py
@@ -6,6 +6,81 @@ from ai_daily_report.models import PublishedUrlEntry, PublishedUrls


 class Stage0To4PipelineTests(unittest.TestCase):
+    def test_run_stage0_to_stage4_passes_semantic_and_rewrite_config(self):
+        configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
+        seen = {}
+
+        def fetcher(config, run_date):
+            return [
+                {
+                    "title_raw": "Anthropic launches Claude Code",
+                    "summary_raw": "Anthropic launches Claude Code for developers.",
+                    "url": "https://example.com/a",
+                    "source_label": config.name,
+                },
+                {
+                    "title_raw": "Anthropic launch Claude Code",
+                    "summary_raw": "Anthropic launch Claude Code for coding.",
+                    "url": "https://example.com/b",
+                    "source_label": config.name,
+                },
+                {
+                    "title_raw": "Gemini CLI update",
+                    "summary_raw": "Google updates Gemini CLI.",
+                    "url": "https://example.com/c",
+                    "source_label": config.name,
+                },
+            ]
+
+        def semantic_llm_call(prompt):
+            payload = json.loads(prompt)
+            seen["semantic_prompt"] = payload
+            first_candidate = payload["candidates"][0]["item_ids"]
+            return json.dumps(
+                {
+                    "duplicate_groups": [
+                        {
+                            "keep_id": first_candidate[0],
+                            "remove_ids": [first_candidate[1]],
+                            "confidence": "high",
+                            "reason": "same event",
+                        }
+                    ],
+                    "not_duplicates": [],
+                    "uncertain": [],
+                }
+            )
+
+        def rewrite_llm_call(prompt):
+            payload = json.loads(prompt)
+            seen.setdefault("rewrite_batches", []).append(len(payload["items"]))
+            return json.dumps(
+                {
+                    "rewrites": [
+                        {
+                            "id": item["id"],
+                            "title": item["title_raw"],
+                            "summary": item["summary_raw"],
+                            "flags": [],
+                        }
+                        for item in payload["items"]
+                    ]
+                }
+            )
+
+        result = run_stage0_to_stage4(
+            configs,
+            "2026-06-10",
+            fetcher=fetcher,
+            semantic_llm_call=semantic_llm_call,
+            rewrite_llm_call=rewrite_llm_call,
+            semantic_dedup_max_deletion_ratio=0.1,
+            rewrite_batch_size=1,
+        )
+
+        self.assertTrue(result["reports"]["stage3"]["skipped_for_deletion_ratio"])
+        self.assertEqual(seen["rewrite_batches"], [1, 1, 1])
+
    def test_run_stage0_to_stage4_semantic_dedupes_and_rewrites(self):
        configs = [
            {"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10},
@@ -127,6 +202,67 @@ class Stage0To4PipelineTests(unittest.TestCase):
        self.assertEqual(result["reports"]["stage2_5"]["removed_count"], 1)
        self.assertEqual([entry["title_raw"] for entry in seen_rewrite_payloads[0]["items"]], ["Fresh story"])

+    def test_run_stage0_to_stage4_uses_stage2_8_recalled_candidates(self):
+        configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
+        seen = {}
+
+        def fetcher(config, run_date):
+            return [
+                {
+                    "title_raw": "Anthropic 被曝开发 Claude Fable",
+                    "summary_raw": "Anthropic 正在开发名为 Claude Fable 和 Claude Mythos 的新产品。",
+                    "url": "https://example.com/fable",
+                    "source_label": config.name,
+                },
+                {
+                    "title_raw": "Claude Mythos 进入内部测试",
+                    "summary_raw": "Anthropic 的 Claude Mythos 与 Claude Fable 面向内容生成场景。",
+                    "url": "https://example.com/mythos",
+                    "source_label": config.name,
+                },
+                {
+                    "title_raw": "Google 开源 Gemma 3n",
+                    "summary_raw": "Google 开源 Gemma 3n 模型，面向端侧部署。",
+                    "url": "https://example.com/gemma",
+                    "source_label": config.name,
+                },
+            ]
+
+        def semantic_llm_call(prompt):
+            payload = json.loads(prompt)
+            seen["candidate_count"] = len(payload["candidates"])
+            seen["candidate_reasons"] = [candidate["reason"] for candidate in payload["candidates"]]
+            return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+
+        def rewrite_llm_call(prompt):
+            payload = json.loads(prompt)
+            return json.dumps(
+                {
+                    "rewrites": [
+                        {
+                            "id": entry["id"],
+                            "title": entry["title_raw"],
+                            "summary": entry["summary_raw"],
+                            "flags": [],
+                        }
+                        for entry in payload["items"]
+                    ]
+                },
+                ensure_ascii=False,
+            )
+
+        result = run_stage0_to_stage4(
+            configs,
+            "2026-06-10",
+            fetcher=fetcher,
+            semantic_llm_call=semantic_llm_call,
+            rewrite_llm_call=rewrite_llm_call,
+        )
+
+        self.assertEqual(seen["candidate_count"], 1)
+        self.assertIn("strong_entity_overlap", seen["candidate_reasons"])
+        self.assertEqual(result["reports"]["stage2_8"]["added_candidate_group_count"], 1)
+

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_stage8_publish.py
+++ b/tests/test_stage8_publish.py
@@ -7,9 +7,10 @@ from ai_daily_report.publish import load_published_urls, publish_markdown, updat


 class FakeBlogClient:
-    def __init__(self):
+    def __init__(self, existing_post=None):
        self.created_payload = None
        self.published_slug = None
+        self.existing_post = existing_post

    def create_post(self, payload):
        self.created_payload = payload
@@ -18,6 +19,9 @@ class FakeBlogClient:
    def publish_post(self, slug):
        self.published_slug = slug

+    def get_post_by_slug(self, slug):
+        return self.existing_post
+

 class Stage8PublishTests(unittest.TestCase):
    def test_publish_markdown_dry_run_does_not_call_client(self):
@@ -74,6 +78,45 @@ class Stage8PublishTests(unittest.TestCase):
        self.assertEqual(client.published_slug, "ai-2026-06-04")
        self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04")

+    def test_publish_markdown_returns_already_published_for_same_slug_and_content(self):
+        markdown = "## 导览\n\n> ok"
+        client = FakeBlogClient(existing_post={"slug": "ai-2026-06-04", "content": markdown})
+
+        result = publish_markdown(
+            title="AI日报 · 2026-06-04",
+            markdown=markdown,
+            tags=["AI日报"],
+            slug="ai-2026-06-04",
+            base_url="https://blog.example",
+            mode="publish",
+            markdown_report={"blocking_errors": []},
+            client=client,
+            idempotency_config={"enabled": True},
+        )
+
+        self.assertEqual(result.status, "already_published")
+        self.assertIsNone(client.created_payload)
+        self.assertIsNone(client.published_slug)
+
+    def test_publish_markdown_blocks_existing_slug_with_different_content(self):
+        client = FakeBlogClient(existing_post={"slug": "ai-2026-06-04", "content": "old"})
+
+        result = publish_markdown(
+            title="AI日报 · 2026-06-04",
+            markdown="new",
+            tags=["AI日报"],
+            slug="ai-2026-06-04",
+            base_url="https://blog.example",
+            mode="publish",
+            markdown_report={"blocking_errors": []},
+            client=client,
+            idempotency_config={"enabled": True},
+        )
+
+        self.assertEqual(result.status, "blocked")
+        self.assertIn("slug_already_exists", result.error)
+        self.assertIsNone(client.created_payload)
+
    def test_update_published_urls_writes_canonical_urls_for_final_items(self):
        with TemporaryDirectory() as temp_dir:
            history_path = Path(temp_dir) / "published_urls.json"