Refactor AI daily report pipeline

2026-06-04 15:21:56 +08:00
parent 94e18ce22d
commit 5a98696255
64 changed files with 4778 additions and 1316 deletions
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -0,0 +1,132 @@
+import unittest
+import json
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from ai_daily_report.runner import run_daily_report
+
+
+class RunnerTests(unittest.TestCase):  # Tests run_daily_report with mocked or injected sources and LLM client.
+    def test_run_daily_report_mock_mode_writes_markdown_and_reports(self):  # Mock sources + mock LLM, dry-run.
+        with TemporaryDirectory() as temp_dir:  # All output goes to a throwaway directory.
+            result = run_daily_report(
+                run_date="2026-06-04",
+                mode="dry-run",
+                source_mode="mock",
+                llm_mode="mock",
+                out_dir=Path(temp_dir),
+                base_url="https://blog.example",
+            )
+
+            run_dir = Path(result["run_dir"])  # Run directory as reported in the result dict.
+            self.assertTrue((run_dir / "blog_markdown.md").exists())  # Checked inside the with-block, before cleanup.
+            self.assertTrue((run_dir / "run_report.json").exists())
+            self.assertEqual(result["reports"]["stage8"]["status"], "ok")  # The "stage8" report must be "ok".
+
+    def test_run_daily_report_live_sources_can_use_config_and_fetch_text(self):  # Live sources via config file + stub fetcher.
+        with TemporaryDirectory() as temp_dir:
+            out_dir = Path(temp_dir) / "out"  # Not created here; the last assertion expects the run to create it.
+            source_config = Path(temp_dir) / "sources.json"
+            source_config.write_text(  # Source list containing a single enabled RSS entry.
+                json.dumps(
+                    [
+                        {
+                            "name": "InfoQ AI",
+                            "type": "rss",
+                            "url": "https://feed.example/rss",
+                            "role": "supplement",
+                            "priority": 40,
+                            "enabled": True,
+                        }
+                    ]
+                ),
+                encoding="utf-8",
+            )
+
+            def fetch_text(url, timeout):  # Stub fetcher: ignores both arguments and returns a fixed one-item RSS feed.
+                return """<?xml version="1.0"?><rss><channel><item><title>GPT-5 API 发布</title><link>https://example.com/gpt5</link><description>OpenAI 发布 GPT-5 API。</description></item></channel></rss>"""
+
+            result = run_daily_report(
+                run_date="2026-06-04",
+                mode="dry-run",
+                source_mode="live",
+                llm_mode="mock",
+                out_dir=out_dir,
+                base_url="https://blog.example",
+                sources_path=source_config,  # Points the run at the JSON config written above.
+                fetch_text=fetch_text,  # Injected stub fetcher.
+            )
+
+            self.assertEqual(result["reports"]["stage0"]["raw_item_count"], 1)  # One feed item in, one raw item reported.
+            self.assertTrue((out_dir / "2026-06-04" / "blog_markdown.md").exists())  # Output expected under <out_dir>/<run_date>/.
+
+    def test_run_daily_report_live_llm_uses_env_config_in_dry_run(self):  # llm_mode="live" with env-supplied config and an injected factory.
+        class FakeLlmClient:  # Stand-in client: records every prompt and answers with canned JSON.
+            def __init__(self):
+                self.prompts = []  # Prompts received by chat(), in call order.
+
+            def chat(self, prompt):  # Picks the reply by substring match on the prompt; the order of checks matters.
+                self.prompts.append(prompt)
+                if "duplicate_groups" in prompt:  # Reply with empty duplicate / not-duplicate / uncertain lists.
+                    return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+                if "rewrites" in prompt:  # Echo each item's raw title and summary back unchanged.
+                    payload = json.loads(prompt)  # Requires the whole prompt to be JSON with an "items" list.
+                    return json.dumps(
+                        {
+                            "rewrites": [
+                                {
+                                    "id": item["id"],
+                                    "title": item["title_raw"],
+                                    "summary": item["summary_raw"],
+                                    "flags": [],
+                                }
+                                for item in payload["items"]
+                            ]
+                        }
+                    )
+                return json.dumps(  # Fallback reply: one theme plus one thread citing the first item's id.
+                    {
+                        "theme": "模型能力继续进入产品入口。",
+                        "threads": [
+                            {
+                                "title": "模型 API 更新",
+                                "text": "GPT-5 API 发布，说明模型能力继续进入产品入口。",
+                                "item_ids": [json.loads(prompt)["items"][0]["id"]],
+                                "kind": "thread",
+                            }
+                        ],
+                    }
+                )
+
+        fake_client = FakeLlmClient()
+        captured_config = {}  # Filled with the keyword arguments the factory is called with.
+
+        def llm_client_factory(**config):  # Records its keyword arguments and hands back the shared fake client.
+            captured_config.update(config)
+            return fake_client
+
+        with TemporaryDirectory() as temp_dir:
+            result = run_daily_report(
+                run_date="2026-06-04",
+                mode="dry-run",
+                source_mode="mock",
+                llm_mode="live",
+                out_dir=Path(temp_dir),
+                base_url="https://blog.example",
+                env={  # Passed explicitly; these values are compared with the factory kwargs below.
+                    "LLM_API_KEY": "test-key",
+                    "LLM_BASE_URL": "https://llm.example/v1",
+                    "LLM_MODEL": "test-model",
+                },
+                llm_client_factory=llm_client_factory,
+            )
+
+        self.assertEqual(captured_config["api_key"], "test-key")  # LLM_API_KEY is expected to arrive as api_key.
+        self.assertEqual(captured_config["base_url"], "https://llm.example/v1")  # LLM_BASE_URL -> base_url.
+        self.assertEqual(captured_config["model"], "test-model")  # LLM_MODEL -> model.
+        self.assertGreaterEqual(len(fake_client.prompts), 2)  # At least two chat() calls must have been made.
+        self.assertEqual(result["reports"]["stage8"]["status"], "ok")
+
+
+if __name__ == "__main__":  # Allow running this test module directly as a script.
+    unittest.main()