Refactor AI daily report pipeline

2026-06-04 15:21:56 +08:00
parent 94e18ce22d
commit 5a98696255
64 changed files with 4778 additions and 1316 deletions
--- a/tests/fixtures/.gitkeep
+++ b/tests/fixtures/.gitkeep
@@ -0,0 +1 @@
+
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -0,0 +1,47 @@
+import unittest
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from ai_daily_report.cli import build_parser, main
+
+
+class CliTests(unittest.TestCase):  # exercises ai_daily_report.cli: argument parsing via build_parser() and exit codes from main()
+    def test_run_command_parses_date_and_mode(self):  # every flag given to "run" must be readable from the parsed namespace
+        parser = build_parser()
+
+        args = parser.parse_args(["run", "--date", "2026-06-04", "--mode", "dry-run", "--source-mode", "live", "--llm-mode", "live", "--sources-path", "config/sources.json"])
+
+        self.assertEqual(args.command, "run")  # the subcommand name is exposed as args.command
+        self.assertEqual(args.date, "2026-06-04")
+        self.assertEqual(args.mode, "dry-run")
+        self.assertEqual(args.source_mode, "live")  # --source-mode is read back as source_mode
+        self.assertEqual(args.llm_mode, "live")  # --llm-mode is read back as llm_mode
+        self.assertEqual(args.sources_path, "config/sources.json")  # the path is compared as the raw string that was passed
+
+    def test_main_returns_zero_for_parseable_command(self):  # NOTE(review): no --source-mode/--llm-mode/--out-dir here, so main() runs with its defaults — confirm those need no network, credentials or writable working directory
+        self.assertEqual(main(["run", "--date", "2026-06-04", "--mode", "dry-run"]), 0)
+
+    def test_main_mock_run_writes_outputs(self):  # mock sources and mock LLM, with output redirected to a temporary directory
+        with TemporaryDirectory() as temp_dir:
+            exit_code = main(
+                [
+                    "run",
+                    "--date",
+                    "2026-06-04",
+                    "--mode",
+                    "dry-run",
+                    "--source-mode",
+                    "mock",
+                    "--llm-mode",
+                    "mock",
+                    "--out-dir",
+                    temp_dir,
+                ]
+            )
+
+            self.assertEqual(exit_code, 0)
+            self.assertTrue((Path(temp_dir) / "2026-06-04" / "blog_markdown.md").exists())  # the markdown is expected under <out-dir>/<date>/; checked before the temp dir is removed
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_clients.py
+++ b/tests/test_clients.py
@@ -0,0 +1,47 @@
+import json
+import unittest
+from unittest.mock import patch
+
+from ai_daily_report.clients import BlogApiClient, OpenAICompatibleClient, fetch_text
+
+
+class FakeResponse:  # stand-in for the object urllib.request.urlopen is patched to return in the tests below
+    status = 200  # class-level attribute shared by every instance
+
+    def __init__(self, body):  # body is handed back unchanged by read(); the tests in this file pass bytes
+        self.body = body
+
+    def __enter__(self):  # lets the fake be used in a with-statement
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        return False  # falsy return: an exception raised inside the with-block is not suppressed
+
+    def read(self):  # returns the stored body as-is on every call
+        return self.body
+
+
+class ClientTests(unittest.TestCase):  # client tests with urllib.request.urlopen replaced by canned FakeResponse objects
+    def test_fetch_text_decodes_response(self):  # a UTF-8 bytes body should come back as the str "ok"
+        with patch("urllib.request.urlopen", return_value=FakeResponse("ok".encode("utf-8"))):
+            self.assertEqual(fetch_text("https://example.com", 1), "ok")  # second positional argument is 1 — presumably the timeout; confirm against fetch_text
+
+    def test_openai_compatible_client_returns_message_content(self):  # chat() should return choices[0].message.content from the JSON body
+        body = json.dumps({"choices": [{"message": {"content": "hello"}}]}).encode("utf-8")
+        with patch("urllib.request.urlopen", return_value=FakeResponse(body)):
+            client = OpenAICompatibleClient(api_key="key", base_url="https://llm.example/v1", model="model")
+            self.assertEqual(client.chat("prompt"), "hello")
+
+    def test_blog_api_client_create_and_publish(self):  # create_post's JSON reply must expose the slug; publish_post must run against the second reply
+        responses = [
+            FakeResponse(json.dumps({"slug": "ai-2026-06-04"}).encode("utf-8")),
+            FakeResponse(json.dumps({"ok": True}).encode("utf-8")),
+        ]
+        with patch("urllib.request.urlopen", side_effect=responses):  # side_effect list: each urlopen call returns the next response, a third call would fail
+            client = BlogApiClient(base_url="https://blog.example", token="token")
+            self.assertEqual(client.create_post({"title": "t"})["slug"], "ai-2026-06-04")
+            client.publish_post("ai-2026-06-04")  # no assertion on the result or the request — this only checks that the call does not raise
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_config_loading.py
+++ b/tests/test_config_loading.py
@@ -0,0 +1,27 @@
+import unittest
+from pathlib import Path
+
+from ai_daily_report.config import load_source_configs
+from ai_daily_report.sources.registry import get_source_fetcher
+
+
+ROOT = Path(__file__).resolve().parents[1]
+
+
+class ConfigLoadingTests(unittest.TestCase):  # checks the repository's own config/sources.json through load_source_configs
+    def test_load_source_configs_from_json(self):  # pins a minimum entry count and the first entry's name and type
+        configs = load_source_configs(ROOT / "config" / "sources.json")
+
+        self.assertGreaterEqual(len(configs), 5)  # lower bound only, so adding sources does not break the test
+        self.assertEqual(configs[0].name, "AI HOT")  # relies on "AI HOT" being the first entry returned
+        self.assertEqual(configs[0].type, "aihot")
+
+    def test_all_configured_source_types_are_registered(self):  # every configured type must resolve to a callable fetcher
+        configs = load_source_configs(ROOT / "config" / "sources.json")
+
+        for config in configs:  # the first failing type aborts the loop; later types are not checked
+            self.assertTrue(callable(get_source_fetcher(config.type)))
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_dry_run_config.py
+++ b/tests/test_dry_run_config.py
@@ -0,0 +1,33 @@
+import importlib.util
+import unittest
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py"
+
+
+def load_pipeline_module():  # imports script/ai_daily_blog_pipeline.py by file path and returns the freshly executed module
+    spec = importlib.util.spec_from_file_location("ai_daily_blog_pipeline", SCRIPT)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # runs the script's top-level code; the module is not registered in sys.modules, so each call yields a new object
+    return module
+
+
+class DryRunConfigTests(unittest.TestCase):  # is_dry_run / requires_blog_token on the legacy script, each called with a plain env dict
+    def test_dry_run_does_not_require_blog_token(self):  # AI_DAILY_DRY_RUN="1" -> dry run, and no blog token is required
+        module = load_pipeline_module()
+
+        self.assertTrue(module.is_dry_run({"AI_DAILY_DRY_RUN": "1"}))
+        self.assertFalse(module.requires_blog_token({"AI_DAILY_DRY_RUN": "1"}))
+
+    def test_publish_mode_requires_blog_token(self):  # empty env -> not a dry run, so a blog token is required
+        module = load_pipeline_module()
+
+        self.assertFalse(module.is_dry_run({}))
+        self.assertTrue(module.requires_blog_token({}))
+
+
+if __name__ == "__main__":
+    unittest.main()
+
--- a/tests/test_env_config.py
+++ b/tests/test_env_config.py
@@ -0,0 +1,87 @@
+import unittest
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from ai_daily_report.env import resolve_blog_token, resolve_llm_config
+
+
+class EnvConfigTests(unittest.TestCase):  # resolve_llm_config / resolve_blog_token from ai_daily_report.env, driven by explicit env dicts
+    def test_resolve_llm_config_prefers_generic_values(self):  # with both LLM_* and SUB2API_* present, the LLM_* values must be returned
+        config = resolve_llm_config(
+            {
+                "LLM_API_KEY": "generic-key",
+                "LLM_BASE_URL": "https://generic.example/v1",
+                "LLM_MODEL": "generic-model",
+                "SUB2API_API_KEY": "sub-key",
+                "SUB2API_BASE_URL": "https://sub.example/v1",
+                "SUB2API_MODEL": "sub-model",
+            }
+        )
+
+        self.assertEqual(  # whole-dict equality: the result must contain exactly these three keys
+            config,
+            {
+                "api_key": "generic-key",
+                "base_url": "https://generic.example/v1",
+                "model": "generic-model",
+            },
+        )
+
+    def test_resolve_llm_config_reports_missing_fields(self):  # only LLM_API_KEY is supplied; the error must name the two absent variables
+        with self.assertRaisesRegex(ValueError, "missing_llm_config: LLM_BASE_URL,LLM_MODEL"):  # pattern is searched in str(exception), so extra surrounding text is tolerated
+            resolve_llm_config({"LLM_API_KEY": "key"})
+
+    def test_resolve_llm_config_follows_hermes_provider_config(self):  # empty env: model/base_url come from config.yaml, the key from .env in hermes_dir
+        with TemporaryDirectory() as temp_dir:
+            hermes_dir = Path(temp_dir)
+            (hermes_dir / "config.yaml").write_text(
+                """
+model:
+  provider: sub2api
+  default: findmini/gpt-5.5
+  base_url: http://sub2api.example/v1
+""".strip(),
+                encoding="utf-8",
+            )
+            (hermes_dir / ".env").write_text("SUB2API_API_KEY=hermes-key\n", encoding="utf-8")  # the expected api_key below is the value stored in this file
+
+            config = resolve_llm_config({}, hermes_dir=hermes_dir)  # nothing in the env mapping, so every field must come from hermes_dir
+
+        self.assertEqual(  # "default" in config.yaml is expected to surface as "model"
+            config,
+            {
+                "api_key": "hermes-key",
+                "base_url": "http://sub2api.example/v1",
+                "model": "findmini/gpt-5.5",
+            },
+        )
+
+    def test_resolve_llm_config_uses_hermes_auth_json_env_source(self):  # auth.json names an env variable; its value is taken from the env mapping passed in
+        with TemporaryDirectory() as temp_dir:
+            hermes_dir = Path(temp_dir)
+            (hermes_dir / "config.yaml").write_text(
+                """
+model:
+  provider: sub2api
+  default: findmini/gpt-5.5
+  base_url: http://sub2api.example/v1
+""".strip(),
+                encoding="utf-8",
+            )
+            (hermes_dir / "auth.json").write_text(  # no .env file is written in this test, unlike the previous one
+                '{"credential_pool": {"sub2api": [{"source": "env:SUB2API_API_KEY"}]}}',
+                encoding="utf-8",
+            )
+
+            config = resolve_llm_config({"SUB2API_API_KEY": "auth-env-key"}, hermes_dir=hermes_dir)
+
+        self.assertEqual(config["api_key"], "auth-env-key")
+        self.assertEqual(config["base_url"], "http://sub2api.example/v1")
+        self.assertEqual(config["model"], "findmini/gpt-5.5")
+
+    def test_resolve_blog_token_uses_supported_names(self):  # only EPHRON_SERVICE_TOKEN is exercised here, although the test name says "names"
+        self.assertEqual(resolve_blog_token({"EPHRON_SERVICE_TOKEN": "token"}), "token")
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_env_loading.py
+++ b/tests/test_env_loading.py
@@ -0,0 +1,39 @@
+import importlib.util
+import os
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+
+
+ROOT = Path(__file__).resolve().parents[1]
+SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py"
+
+
+def load_pipeline_module():  # loads the legacy script from SCRIPT by file path and returns the executed module object
+    spec = importlib.util.spec_from_file_location("ai_daily_blog_pipeline", SCRIPT)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # executes the script's top-level statements; nothing is inserted into sys.modules
+    return module
+
+
+class EnvLoadingTests(unittest.TestCase):  # load_env() on the legacy script: values from the project env file merged with os.environ
+    def test_project_env_is_loaded_and_process_env_wins(self):  # the file supplies LLM_BASE_URL; os.environ overrides the file's LLM_MODEL
+        module = load_pipeline_module()
+        env_text = "LLM_MODEL=file-model\nLLM_BASE_URL=https://file.example/v1\n"
+
+        with patch.object(module.Path, "home", return_value=ROOT / "missing-home"):  # home points at a missing directory — presumably so no user-level env file is picked up
+            with patch.dict(os.environ, {"LLM_MODEL": "process-model"}, clear=True):  # clear=True: an ambient LLM_BASE_URL on the developer/CI machine must not override the file value asserted below
+                with patch.object(module, "PROJECT_ENV_PATH", ROOT / ".env.test"):
+                    (ROOT / ".env.test").write_text(env_text, encoding="utf-8")  # scratch env file lives in the repository root for the duration of the call
+                    try:
+                        env = module.load_env()
+                    finally:
+                        (ROOT / ".env.test").unlink(missing_ok=True)  # removed even when load_env() raises
+
+        self.assertEqual(env["LLM_BASE_URL"], "https://file.example/v1")  # only the file defines this key
+        self.assertEqual(env["LLM_MODEL"], "process-model")  # defined in both places; the process value must win
+
+
+if __name__ == "__main__":
+    unittest.main()
+
--- a/tests/test_legacy_script_delegation.py
+++ b/tests/test_legacy_script_delegation.py
@@ -0,0 +1,57 @@
+import importlib.util
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+
+
+ROOT = Path(__file__).resolve().parents[1]
+SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py"
+
+
+def load_pipeline_module():  # builds a module object for script/ai_daily_blog_pipeline.py from its path and executes it
+    spec = importlib.util.spec_from_file_location("ai_daily_blog_pipeline", SCRIPT)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # top-level code of the script runs here; the result is not cached in sys.modules
+    return module
+
+
+class LegacyScriptDelegationTests(unittest.TestCase):  # the legacy script's main() must forward to ai_daily_report.runner.run_daily_report
+    def test_main_delegates_to_new_pipeline_by_default(self):  # with only AI_DAILY_DRY_RUN set, both source and LLM modes are expected to be "live"
+        module = load_pipeline_module()
+        calls = []  # keyword arguments of every run_daily_report invocation
+
+        def fake_run_daily_report(**kwargs):  # records the call and returns a minimal successful result
+            calls.append(kwargs)
+            return {"reports": {"stage8": {"status": "ok"}}}
+
+        with patch.object(module, "load_env", return_value={"AI_DAILY_DRY_RUN": "1"}):
+            with patch("ai_daily_report.runner.run_daily_report", side_effect=fake_run_daily_report):  # patched after the script was loaded, so main() must look the function up on the runner module at call time
+                module.main()
+
+        self.assertEqual(len(calls), 1)
+        self.assertEqual(calls[0]["mode"], "dry-run")  # AI_DAILY_DRY_RUN="1" is expected to become mode="dry-run"
+        self.assertEqual(calls[0]["source_mode"], "live")
+        self.assertEqual(calls[0]["llm_mode"], "live")
+
+    def test_main_allows_mock_modes_for_local_test(self):  # AI_DAILY_SOURCE_MODE / AI_DAILY_LLM_MODE must be passed through to the runner
+        module = load_pipeline_module()
+        calls = []
+
+        def fake_run_daily_report(**kwargs):
+            calls.append(kwargs)
+            return {"reports": {"stage8": {"status": "ok"}}}
+
+        with patch.object(
+            module,
+            "load_env",
+            return_value={"AI_DAILY_DRY_RUN": "1", "AI_DAILY_SOURCE_MODE": "mock", "AI_DAILY_LLM_MODE": "mock"},
+        ):
+            with patch("ai_daily_report.runner.run_daily_report", side_effect=fake_run_daily_report):
+                module.main()
+
+        self.assertEqual(calls[0]["source_mode"], "mock")  # no length check first: if main() never delegates this fails with IndexError
+        self.assertEqual(calls[0]["llm_mode"], "mock")
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_llm_utils.py
+++ b/tests/test_llm_utils.py
@@ -0,0 +1,17 @@
+import unittest
+
+from ai_daily_report.llm import parse_json_object
+
+
+class LlmUtilsTests(unittest.TestCase):  # parse_json_object from ai_daily_report.llm
+    def test_parse_json_object_strips_markdown_fence(self):  # a ```json fenced reply must parse to the enclosed object
+        self.assertEqual(parse_json_object('```json\n{"ok": true}\n```'), {"ok": True})
+
+    def test_parse_json_object_raises_without_json(self):  # text with no JSON object must raise ValueError
+        with self.assertRaises(ValueError):
+            parse_json_object("not json")
+
+
+if __name__ == "__main__":
+    unittest.main()
+
--- a/tests/test_markdown_rendering.py
+++ b/tests/test_markdown_rendering.py
@@ -0,0 +1,39 @@
+import unittest
+
+from ai_daily_report.assemble import assemble_markdown
+from ai_daily_report.models import NewsItem
+
+
+class MarkdownRenderingTests(unittest.TestCase):  # assemble_markdown output for one item plus a guide whose theme carries markup to be removed
+    def test_blog_markdown_strips_double_blockquote_and_reference_markers(self):
+        items = [  # a single fully populated NewsItem in the "模型与能力" section
+            NewsItem(
+                id="a",
+                source_group="AI HOT",
+                source_label="OpenAI：Blog",
+                source_role="primary",
+                source_priority=10,
+                title_raw="测试模型发布",
+                title_norm="测试模型发布",
+                summary_raw="测试摘要",
+                title="测试模型发布",
+                summary="测试摘要",
+                url="https://openai.com/blog/test",
+                canonical_url="https://openai.com/blog/test",
+                section="模型与能力",
+            )
+        ]
+        guide = {"theme": "> 主线判断：测试主线[1]", "threads": []}  # theme deliberately starts with "> ", carries the "主线判断" label and ends with a "[1]" marker
+
+        md, _ = assemble_markdown(items, guide)  # the second element of the returned pair is not inspected
+
+        self.assertIn("## 导览", md)  # guide heading
+        self.assertIn("## 模型与能力", md)  # the item's section becomes a heading
+        self.assertIn("[OpenAI：Blog ↗](https://openai.com/blog/test)", md)  # source label rendered as a link to the item URL
+        self.assertNotIn("> >", md)  # the theme's own "> " must not produce a nested blockquote
+        self.assertNotIn("[1]", md)  # reference marker removed
+        self.assertNotIn("主线判断", md)  # label prefix removed
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_project_structure.py
+++ b/tests/test_project_structure.py
@@ -0,0 +1,33 @@
+import unittest
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+
+
+class ProjectStructureTests(unittest.TestCase):  # guards the repository layout: a fixed list of files must exist under ROOT
+    def test_pipeline_plan_structure_exists(self):
+        expected_paths = [  # paths are relative to the repository root
+            "ai_daily_report/sources/__init__.py",
+            "ai_daily_report/sources/aihot.py",
+            "ai_daily_report/sources/rss.py",
+            "ai_daily_report/sources/juya.py",
+            "ai_daily_report/sources/registry.py",
+            "ai_daily_report/llm.py",
+            "ai_daily_report/validate.py",
+            "ai_daily_report/publish.py",
+            "ai_daily_report/cli.py",
+            "config/sources.json",
+            "config/pipeline.json",
+            "tests/fixtures/.gitkeep",
+            "skill/scripts/.gitkeep",
+            "skill/scripts/run_daily_report.py",
+        ]
+
+        missing = [path for path in expected_paths if not (ROOT / path).exists()]  # collect all absent paths so one failure lists every missing file
+
+        self.assertEqual(missing, [])
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -0,0 +1,132 @@
+import unittest
+import json
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from ai_daily_report.runner import run_daily_report
+
+
+class RunnerTests(unittest.TestCase):  # end-to-end calls of run_daily_report in dry-run mode with mocked or injected collaborators
+    def test_run_daily_report_mock_mode_writes_markdown_and_reports(self):  # fully mocked run must write the markdown and the run report
+        with TemporaryDirectory() as temp_dir:
+            result = run_daily_report(
+                run_date="2026-06-04",
+                mode="dry-run",
+                source_mode="mock",
+                llm_mode="mock",
+                out_dir=Path(temp_dir),
+                base_url="https://blog.example",
+            )
+
+            run_dir = Path(result["run_dir"])  # the runner reports the directory it wrote into
+            self.assertTrue((run_dir / "blog_markdown.md").exists())
+            self.assertTrue((run_dir / "run_report.json").exists())
+            self.assertEqual(result["reports"]["stage8"]["status"], "ok")
+
+    def test_run_daily_report_live_sources_can_use_config_and_fetch_text(self):  # "live" sources read a custom sources.json and fetch through the injected fetch_text
+        with TemporaryDirectory() as temp_dir:
+            out_dir = Path(temp_dir) / "out"  # not created here; the assertion below expects files under it after the run
+            source_config = Path(temp_dir) / "sources.json"
+            source_config.write_text(  # a single enabled RSS source
+                json.dumps(
+                    [
+                        {
+                            "name": "InfoQ AI",
+                            "type": "rss",
+                            "url": "https://feed.example/rss",
+                            "role": "supplement",
+                            "priority": 40,
+                            "enabled": True,
+                        }
+                    ]
+                ),
+                encoding="utf-8",
+            )
+
+            def fetch_text(url, timeout):  # stub: ignores both arguments and returns a feed with exactly one item
+                return """<?xml version="1.0"?><rss><channel><item><title>GPT-5 API 发布</title><link>https://example.com/gpt5</link><description>OpenAI 发布 GPT-5 API。</description></item></channel></rss>"""
+
+            result = run_daily_report(
+                run_date="2026-06-04",
+                mode="dry-run",
+                source_mode="live",
+                llm_mode="mock",
+                out_dir=out_dir,
+                base_url="https://blog.example",
+                sources_path=source_config,
+                fetch_text=fetch_text,
+            )
+
+            self.assertEqual(result["reports"]["stage0"]["raw_item_count"], 1)  # one feed item -> one raw item
+            self.assertTrue((out_dir / "2026-06-04" / "blog_markdown.md").exists())
+
+    def test_run_daily_report_live_llm_uses_env_config_in_dry_run(self):  # "live" LLM mode must build its client from the LLM_* values in env
+        class FakeLlmClient:  # answers each prompt by guessing the stage from a keyword contained in the prompt text
+            def __init__(self):
+                self.prompts = []  # every prompt received, in call order
+
+            def chat(self, prompt):
+                self.prompts.append(prompt)
+                if "duplicate_groups" in prompt:  # checked first: reply with "no duplicates found"
+                    return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+                if "rewrites" in prompt:  # echo each item's raw title and summary back unchanged
+                    payload = json.loads(prompt)  # requires the whole prompt to be a JSON document
+                    return json.dumps(
+                        {
+                            "rewrites": [
+                                {
+                                    "id": item["id"],
+                                    "title": item["title_raw"],
+                                    "summary": item["summary_raw"],
+                                    "flags": [],
+                                }
+                                for item in payload["items"]
+                            ]
+                        }
+                    )
+                return json.dumps(  # any other prompt gets a guide with a single thread
+                    {
+                        "theme": "模型能力继续进入产品入口。",
+                        "threads": [
+                            {
+                                "title": "模型 API 更新",
+                                "text": "GPT-5 API 发布，说明模型能力继续进入产品入口。",
+                                "item_ids": [json.loads(prompt)["items"][0]["id"]],
+                                "kind": "thread",
+                            }
+                        ],
+                    }
+                )
+
+        fake_client = FakeLlmClient()
+        captured_config = {}  # keyword arguments the runner passes to the client factory
+
+        def llm_client_factory(**config):  # records the config and always hands back the same fake client
+            captured_config.update(config)
+            return fake_client
+
+        with TemporaryDirectory() as temp_dir:
+            result = run_daily_report(
+                run_date="2026-06-04",
+                mode="dry-run",
+                source_mode="mock",
+                llm_mode="live",
+                out_dir=Path(temp_dir),
+                base_url="https://blog.example",
+                env={
+                    "LLM_API_KEY": "test-key",
+                    "LLM_BASE_URL": "https://llm.example/v1",
+                    "LLM_MODEL": "test-model",
+                },
+                llm_client_factory=llm_client_factory,
+            )
+
+        self.assertEqual(captured_config["api_key"], "test-key")
+        self.assertEqual(captured_config["base_url"], "https://llm.example/v1")
+        self.assertEqual(captured_config["model"], "test-model")
+        self.assertGreaterEqual(len(fake_client.prompts), 2)  # lower bound only: at least two prompts must have reached the client
+        self.assertEqual(result["reports"]["stage8"]["status"], "ok")
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_source_labels.py
+++ b/tests/test_source_labels.py
@@ -0,0 +1,55 @@
+import unittest
+
+from ai_daily_report.models import SourceConfig
+from ai_daily_report.sources.juya import parse_juya_rss
+from ai_daily_report.sources.labels import source_label_from_url
+
+
+class SourceLabelTests(unittest.TestCase):  # source_label_from_url and its use by parse_juya_rss
+    def test_source_label_from_x_url_includes_handle(self):  # an x.com status URL yields a label containing the @handle from the path
+        self.assertEqual(
+            source_label_from_url("https://x.com/MiniMax_AI/status/123", fallback="橘鸦AI早报"),
+            "X：MiniMax (@MiniMax_AI)",
+        )
+
+    def test_source_label_from_blog_url_marks_blog(self):  # an openai.com /blog/ URL is labelled "OpenAI：Blog"
+        self.assertEqual(
+            source_label_from_url("https://openai.com/blog/example", fallback="橘鸦AI早报"),
+            "OpenAI：Blog",
+        )
+
+    def test_source_label_from_known_non_blog_domains(self):  # two further hosts with fixed expected labels
+        self.assertEqual(
+            source_label_from_url("https://mp.weixin.qq.com/s/example", fallback="橘鸦AI早报"),
+            "微信公众号",
+        )
+        self.assertEqual(  # the expected label marks this /docs/ URL as "Docs"
+            source_label_from_url("https://platform.minimaxi.com/docs/token-plan/migration", fallback="橘鸦AI早报"),
+            "MiniMax：Docs",
+        )
+
+    def test_parse_juya_rss_uses_item_url_as_source_label(self):  # the label must come from the entry's link, not from the feed's configured name
+        config = SourceConfig(name="橘鸦AI早报", type="juya_rss", url="https://juya.example/rss")
+        xml = """<?xml version="1.0"?>
+<rss xmlns:content="http://purl.org/rss/1.0/modules/content/">
+  <channel>
+    <item>
+      <title>2026-06-04</title>
+      <content:encoded><![CDATA[
+        <h2><a href="https://x.com/MiniMax_AI/status/123">MiniMax M3 加速</a> <code>#1</code></h2>
+        <p>MiniMax M3 加速。</p>
+        <p><a href="https://x.com/MiniMax_AI/status/123">来源</a></p>
+        <hr/>
+      ]]></content:encoded>
+    </item>
+  </channel>
+</rss>"""
+
+        items = parse_juya_rss(config, xml, "2026-06-04")  # parsed entries are indexed like dicts below
+
+        self.assertEqual(items[0]["source_label"], "X：MiniMax (@MiniMax_AI)")
+        self.assertNotEqual(items[0]["source_label"], "橘鸦AI早报")  # already implied by the equality above; kept as an explicit statement of intent
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_stage0_collect.py
+++ b/tests/test_stage0_collect.py
@@ -0,0 +1,49 @@
+import unittest
+
+from ai_daily_report.collect import collect_sources
+from ai_daily_report.models import SourceConfig
+
+
+class Stage0CollectTests(unittest.TestCase):  # collect_sources with an injected fetcher: per-source results plus an aggregate report
+    def test_collect_sources_returns_structured_results_for_each_source(self):  # two healthy sources, one item each
+        configs = [
+            SourceConfig(name="Primary", type="fake", role="primary", priority=10),
+            SourceConfig(name="Supplement", type="fake", role="supplement", priority=20),
+        ]
+
+        def fetcher(config, run_date):  # returns a single item derived from the source name; run_date is unused
+            return [{"title_raw": f"{config.name} item", "url": f"https://example.com/{config.name}"}]
+
+        results, report = collect_sources(configs, "2026-06-04", fetcher=fetcher)
+
+        self.assertEqual([r.source for r in results], ["Primary", "Supplement"])  # results are expected in the same order as configs
+        self.assertTrue(all(r.ok for r in results))
+        self.assertEqual(sum(len(r.items) for r in results), 2)
+        self.assertEqual(report["input_source_count"], 2)
+        self.assertEqual(report["ok_source_count"], 2)
+        self.assertEqual(report["raw_item_count"], 2)
+
+    def test_collect_sources_records_failed_source_without_blocking_others(self):  # one fetcher failure must not prevent the other source from being collected
+        configs = [
+            SourceConfig(name="Broken", type="fake", role="supplement", priority=20),
+            SourceConfig(name="Healthy", type="fake", role="supplement", priority=30),
+        ]
+
+        def fetcher(config, run_date):  # raises for "Broken", returns one item otherwise
+            if config.name == "Broken":
+                raise TimeoutError("timed out")
+            return [{"title_raw": "healthy item", "url": "https://example.com/healthy"}]
+
+        results, report = collect_sources(configs, "2026-06-04", fetcher=fetcher)
+
+        by_source = {r.source: r for r in results}  # look results up by source name rather than by position
+        self.assertFalse(by_source["Broken"].ok)
+        self.assertEqual(by_source["Broken"].status, "timeout")  # a TimeoutError is expected to be reported as status "timeout"
+        self.assertIn("TimeoutError", by_source["Broken"].error)  # the error text must include the exception class name
+        self.assertTrue(by_source["Healthy"].ok)
+        self.assertEqual(report["failed_source_count"], 1)
+        self.assertEqual(report["raw_item_count"], 1)  # only the healthy source contributes an item
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_stage0_to_2_pipeline.py
+++ b/tests/test_stage0_to_2_pipeline.py
@@ -0,0 +1,32 @@
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage2
+
+
+class Stage0To2PipelineTests(unittest.TestCase):  # run_stage0_to_stage2 with an injected fetcher and plain-dict source configs
+    def test_run_stage0_to_stage2_returns_deduped_items_and_reports(self):  # two sources report the same story; one item must survive
+        configs = [
+            {"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10},
+            {"name": "RSS", "type": "fake", "role": "supplement", "priority": 50},
+        ]
+
+        def fetcher(config, run_date):  # same title and URL for every source; only summary and source_label differ
+            return [
+                {
+                    "title_raw": "OpenAI 发布 GPT-5",
+                    "summary_raw": f"{config.name} summary",
+                    "url": "https://openai.com/blog/gpt-5?utm_source=test",
+                    "source_label": config.name,
+                }
+            ]
+
+        result = run_stage0_to_stage2(configs, "2026-06-04", fetcher=fetcher)
+
+        self.assertEqual(len(result["items"]), 1)  # the duplicate was removed
+        self.assertEqual(result["reports"]["stage0"]["raw_item_count"], 2)  # both raw items were collected
+        self.assertEqual(result["reports"]["stage1"]["output_count"], 2)  # stage 1 passes both through
+        self.assertEqual(result["reports"]["stage2"]["removed_count"], 1)  # stage 2 drops exactly one
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_stage0_to_4_pipeline.py
+++ b/tests/test_stage0_to_4_pipeline.py
@@ -0,0 +1,66 @@
+import json
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage4
+
+
+class Stage0To4PipelineTests(unittest.TestCase):  # run_stage0_to_stage4 with injected fetcher and stubbed semantic-dedupe / rewrite LLM calls
+    def test_run_stage0_to_stage4_semantic_dedupes_and_rewrites(self):  # two distinct items go in, both are kept and both get rewritten
+        configs = [
+            {"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10},
+            {"name": "RSS", "type": "fake", "role": "supplement", "priority": 50},
+        ]
+
+        def fetcher(config, run_date):  # title, summary and URL all embed the source name, so the two items differ
+            return [
+                {
+                    "title_raw": f"{config.name} Anthropic IPO",
+                    "summary_raw": f"{config.name} reports Anthropic IPO filing.",
+                    "url": f"https://example.com/{config.name}",
+                    "source_label": config.name,
+                }
+            ]
+
+        def semantic_llm_call(prompt):  # stub: ignores the prompt and reports no duplicate groups
+            return json.dumps(
+                {
+                    "duplicate_groups": [],
+                    "not_duplicates": [],
+                    "uncertain": [],
+                }
+            )
+
+        def rewrite_llm_call(prompt):  # stub: gives every item in the prompt the same fixed title and summary
+            payload = json.loads(prompt)  # the rewrite prompt is expected to be a JSON document with an "items" list
+            return json.dumps(
+                {
+                    "rewrites": [
+                        {
+                            "id": entry["id"],
+                            "title": "Anthropic 提交 IPO 文件",
+                            "summary": "Anthropic 被报道提交 IPO 文件。",
+                            "flags": [],
+                        }
+                        for entry in payload["items"]
+                    ]
+                },
+                ensure_ascii=False,
+            )
+
+        result = run_stage0_to_stage4(
+            configs,
+            "2026-06-04",
+            fetcher=fetcher,
+            semantic_llm_call=semantic_llm_call,
+            rewrite_llm_call=rewrite_llm_call,
+        )
+
+        self.assertEqual(len(result["items"]), 2)  # nothing was merged because the stub reported no duplicates
+        self.assertEqual(result["items"][0].title, "Anthropic 提交 IPO 文件")  # the rewritten title is exposed as .title on the item
+        self.assertIn("stage3", result["reports"])
+        self.assertIn("stage4", result["reports"])
+        self.assertEqual(result["reports"]["stage4"]["rewritten_count"], 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_stage0_to_5_pipeline.py
+++ b/tests/test_stage0_to_5_pipeline.py
@@ -0,0 +1,62 @@
+import json
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage5
+
+
+class Stage0To5PipelineTests(unittest.TestCase):  # run_stage0_to_stage5: classification into sections and ordering of the result
+    def test_run_stage0_to_stage5_classifies_and_orders_items(self):  # one source, two items that must land in different sections
+        configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
+
+        def fetcher(config, run_date):  # the IPO item is returned first and has no section_hint; the GPT-5 item carries one
+            return [
+                {
+                    "title_raw": "Anthropic 提交 IPO 文件",
+                    "summary_raw": "Anthropic 被报道提交 IPO 文件。",
+                    "url": "https://example.com/ipo",
+                    "source_label": config.name,
+                },
+                {
+                    "title_raw": "GPT-5 API 发布，延迟降低 30%",
+                    "summary_raw": "OpenAI 发布 GPT-5 API。",
+                    "url": "https://example.com/gpt5",
+                    "source_label": config.name,
+                    "section_hint": "模型发布/更新",
+                },
+            ]
+
+        def semantic_llm_call(prompt):  # stub: no duplicates
+            return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+
+        def rewrite_llm_call(prompt):  # stub: echoes each item's raw title and summary back unchanged
+            payload = json.loads(prompt)
+            return json.dumps(
+                {
+                    "rewrites": [
+                        {
+                            "id": entry["id"],
+                            "title": entry["title_raw"],
+                            "summary": entry["summary_raw"],
+                            "flags": [],
+                        }
+                        for entry in payload["items"]
+                    ]
+                },
+                ensure_ascii=False,
+            )
+
+        result = run_stage0_to_stage5(
+            configs,
+            "2026-06-04",
+            fetcher=fetcher,
+            semantic_llm_call=semantic_llm_call,
+            rewrite_llm_call=rewrite_llm_call,
+        )
+
+        self.assertEqual([item.section for item in result["items"]], ["模型与能力", "公司与资本"])  # output order differs from fetch order: the model item comes first
+        self.assertEqual(result["reports"]["stage5"]["section_counts"]["模型与能力"], 1)
+        self.assertEqual(result["reports"]["stage5"]["section_counts"]["公司与资本"], 1)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_stage0_to_6_pipeline.py
+++ b/tests/test_stage0_to_6_pipeline.py
@@ -0,0 +1,75 @@
+import json
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage6
+
+
+class Stage0To6PipelineTests(unittest.TestCase):  # Drives run_stage0_to_stage6 with a stub fetcher and stub LLM callables.
+    def test_run_stage0_to_stage6_generates_guide(self):
+        configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
+
+        def fetcher(config, run_date):  # stub source: always returns one GPT-5 item, ignoring run_date
+            return [
+                {
+                    "title_raw": "GPT-5 API 发布",
+                    "summary_raw": "OpenAI 发布 GPT-5 API。",
+                    "url": "https://example.com/gpt5",
+                    "source_label": config.name,  # attribute access: presumably the pipeline hands over a config object, not the raw dict above — TODO confirm
+                    "section_hint": "模型发布/更新",
+                }
+            ]
+
+        def semantic_llm_call(prompt):  # stub semantic-dedupe reply: no duplicates reported
+            return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+
+        def rewrite_llm_call(prompt):  # stub rewrite reply: echoes each item's raw title/summary unchanged
+            payload = json.loads(prompt)  # the stub relies on the prompt being JSON with an "items" list
+            return json.dumps(
+                {
+                    "rewrites": [
+                        {
+                            "id": entry["id"],
+                            "title": entry["title_raw"],
+                            "summary": entry["summary_raw"],
+                            "flags": [],
+                        }
+                        for entry in payload["items"]
+                    ]
+                },
+                ensure_ascii=False,
+            )
+
+        def guide_llm_call(prompt):  # stub guide reply: one theme plus one thread citing the first item
+            payload = json.loads(prompt)
+            item_id = payload["items"][0]["id"]  # reuse a real id taken from the prompt's first item
+            return json.dumps(
+                {
+                    "theme": "模型 API 能力继续更新。",
+                    "threads": [
+                        {
+                            "title": "模型能力更新",
+                            "text": "GPT-5 API 发布，体现模型能力继续产品化。",
+                            "item_ids": [item_id],
+                            "kind": "thread",
+                        }
+                    ],
+                },
+                ensure_ascii=False,
+            )
+
+        result = run_stage0_to_stage6(
+            configs,
+            "2026-06-04",
+            fetcher=fetcher,
+            semantic_llm_call=semantic_llm_call,
+            rewrite_llm_call=rewrite_llm_call,
+            guide_llm_call=guide_llm_call,
+        )
+
+        self.assertEqual(result["guide"]["theme"], "模型 API 能力继续更新。")  # theme from the stub must pass through unchanged
+        self.assertEqual(len(result["guide"]["threads"]), 1)  # the single stub thread must be kept
+        self.assertTrue(result["reports"]["stage6"]["theme_present"])
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()
--- a/tests/test_stage0_to_7_pipeline.py
+++ b/tests/test_stage0_to_7_pipeline.py
@@ -0,0 +1,76 @@
+import json
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage7
+
+
+class Stage0To7PipelineTests(unittest.TestCase):  # Drives run_stage0_to_stage7 with stubs and checks the assembled markdown.
+    def test_run_stage0_to_stage7_assembles_markdown(self):
+        configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
+
+        def fetcher(config, run_date):  # stub source: always returns one GPT-5 item, ignoring both arguments
+            return [
+                {
+                    "title_raw": "GPT-5 API 发布",
+                    "summary_raw": "OpenAI 发布 GPT-5 API。",
+                    "url": "https://example.com/gpt5",
+                    "source_label": "OpenAI：Blog",
+                    "section_hint": "模型发布/更新",
+                }
+            ]
+
+        def semantic_llm_call(prompt):  # stub semantic-dedupe reply: no duplicates reported
+            return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+
+        def rewrite_llm_call(prompt):  # stub rewrite reply: echoes each item's raw title/summary unchanged
+            payload = json.loads(prompt)  # the stub relies on the prompt being JSON with an "items" list
+            return json.dumps(
+                {
+                    "rewrites": [
+                        {
+                            "id": entry["id"],
+                            "title": entry["title_raw"],
+                            "summary": entry["summary_raw"],
+                            "flags": [],
+                        }
+                        for entry in payload["items"]
+                    ]
+                },
+                ensure_ascii=False,
+            )
+
+        def guide_llm_call(prompt):  # stub guide reply: one theme plus one thread citing the first item
+            payload = json.loads(prompt)
+            item_id = payload["items"][0]["id"]  # reuse a real id taken from the prompt's first item
+            return json.dumps(
+                {
+                    "theme": "模型 API 能力继续更新。",
+                    "threads": [
+                        {
+                            "title": "模型能力产品化",
+                            "text": "GPT-5 API 发布，说明模型能力继续进入产品入口。",
+                            "item_ids": [item_id],
+                            "kind": "thread",
+                        }
+                    ],
+                },
+                ensure_ascii=False,
+            )
+
+        result = run_stage0_to_stage7(
+            configs,
+            "2026-06-04",
+            fetcher=fetcher,
+            semantic_llm_call=semantic_llm_call,
+            rewrite_llm_call=rewrite_llm_call,
+            guide_llm_call=guide_llm_call,
+        )
+
+        self.assertIn("## 导览", result["markdown"])  # guide heading
+        self.assertIn("## 模型与能力", result["markdown"])  # section heading expected for the "模型发布/更新" hint
+        self.assertIn("## 今日脉络", result["markdown"])  # threads heading
+        self.assertEqual(result["reports"]["stage7"]["blocking_errors"], [])  # markdown validation must report no blockers
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()
--- a/tests/test_stage0_to_8_pipeline.py
+++ b/tests/test_stage0_to_8_pipeline.py
@@ -0,0 +1,79 @@
+import json
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage8
+
+
+class Stage0To8PipelineTests(unittest.TestCase):  # Drives run_stage0_to_stage8 in dry-run mode with stubs and no blog client.
+    def test_run_stage0_to_stage8_dry_run_publishes_report(self):
+        configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
+
+        def fetcher(config, run_date):  # stub source: always returns one GPT-5 item, ignoring both arguments
+            return [
+                {
+                    "title_raw": "GPT-5 API 发布",
+                    "summary_raw": "OpenAI 发布 GPT-5 API。",
+                    "url": "https://example.com/gpt5",
+                    "source_label": "OpenAI：Blog",
+                    "section_hint": "模型发布/更新",
+                }
+            ]
+
+        def semantic_llm_call(prompt):  # stub semantic-dedupe reply: no duplicates reported
+            return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+
+        def rewrite_llm_call(prompt):  # stub rewrite reply: echoes each item's raw title/summary unchanged
+            payload = json.loads(prompt)  # the stub relies on the prompt being JSON with an "items" list
+            return json.dumps(
+                {
+                    "rewrites": [
+                        {
+                            "id": entry["id"],
+                            "title": entry["title_raw"],
+                            "summary": entry["summary_raw"],
+                            "flags": [],
+                        }
+                        for entry in payload["items"]
+                    ]
+                },
+                ensure_ascii=False,
+            )
+
+        def guide_llm_call(prompt):  # stub guide reply: one theme plus one thread citing the first item
+            payload = json.loads(prompt)
+            item_id = payload["items"][0]["id"]  # reuse a real id taken from the prompt's first item
+            return json.dumps(
+                {
+                    "theme": "模型 API 能力继续更新。",
+                    "threads": [
+                        {
+                            "title": "模型能力产品化",
+                            "text": "GPT-5 API 发布，说明模型能力继续进入产品入口。",
+                            "item_ids": [item_id],
+                            "kind": "thread",
+                        }
+                    ],
+                },
+                ensure_ascii=False,
+            )
+
+        result = run_stage0_to_stage8(
+            configs,
+            "2026-06-04",
+            fetcher=fetcher,
+            semantic_llm_call=semantic_llm_call,
+            rewrite_llm_call=rewrite_llm_call,
+            guide_llm_call=guide_llm_call,
+            mode="dry-run",
+            base_url="https://blog.example",
+            client=None,  # no blog client is supplied for the dry run
+        )
+
+        self.assertEqual(result["publish"].status, "ok")
+        self.assertEqual(result["publish"].blog_url, "https://blog.example/posts/ai-2026-06-04")  # expected URL: base_url + "/posts/" + a slug containing the run date
+        self.assertIn("stage8", result["reports"])
+        self.assertEqual(result["reports"]["stage8"]["status"], "ok")
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()
--- a/tests/test_stage1_normalize.py
+++ b/tests/test_stage1_normalize.py
@@ -0,0 +1,85 @@
+import unittest
+
+from ai_daily_report.models import SourceResult
+from ai_daily_report.normalize import canonicalize_url, normalize_items, normalize_title
+
+
+class Stage1NormalizeTests(unittest.TestCase):  # Covers canonicalize_url and normalize_items from ai_daily_report.normalize.
+    def test_canonicalize_url_removes_tracking_and_normalizes_x_host(self):
+        url = "HTTPS://Twitter.com/OpenAI/status/123/?utm_source=newsletter&fbclid=abc#fragment"
+
+        self.assertEqual(canonicalize_url(url), "https://x.com/OpenAI/status/123")  # expects lowercase scheme, x.com host, path case kept, no trailing slash/query/fragment
+
+    def test_normalize_items_builds_news_items_with_ids_and_norms(self):
+        source_result = SourceResult(
+            source="AI HOT",
+            role="primary",
+            ok=True,
+            status="ok",
+            items=[
+                {
+                    "title_raw": "  GPT-5 发布：速度提升 2x！ ",
+                    "summary_raw": " <p>OpenAI 发布更新。</p> ",
+                    "url": "https://openai.com/blog/gpt-5?utm_campaign=test",
+                    "source_label": "OpenAI：Blog",
+                    "section_hint": "模型发布/更新",
+                }
+            ],
+        )
+
+        items, report = normalize_items([source_result], run_date="2026-06-04")
+
+        self.assertEqual(len(items), 1)
+        self.assertTrue(items[0].id.startswith("item_"))  # only the id prefix is pinned, not the full id
+        self.assertEqual(items[0].canonical_url, "https://openai.com/blog/gpt-5")  # utm_campaign query must be removed
+        self.assertEqual(items[0].title_norm, normalize_title("GPT-5 发布：速度提升 2x！"))  # must equal normalize_title of the whitespace-trimmed title
+        self.assertEqual(items[0].summary_raw, "OpenAI 发布更新。")  # <p> tags and surrounding spaces must be stripped
+        self.assertEqual(items[0].source_role, "primary")  # role is copied from the SourceResult
+        self.assertEqual(report["input_count"], 1)
+        self.assertEqual(report["output_count"], 1)
+
+    def test_normalize_items_marks_quality_flags_without_dropping_item(self):
+        source_result = SourceResult(
+            source="RSS",
+            role="supplement",
+            ok=True,
+            status="ok",
+            items=[{"title_raw": "短", "summary_raw": "", "url": ""}],  # one-character title, empty summary, empty url
+        )
+
+        items, report = normalize_items([source_result], run_date="2026-06-04")
+
+        self.assertEqual(len(items), 1)  # the low-quality item is flagged, not dropped
+        self.assertIn("missing_url", items[0].quality_flags)
+        self.assertIn("missing_summary", items[0].quality_flags)
+        self.assertIn("short_title", items[0].quality_flags)
+        self.assertEqual(report["quality_flag_counts"]["missing_url"], 1)
+
+    def test_normalize_items_keeps_ids_unique_for_same_canonical_url(self):
+        source_result = SourceResult(
+            source="AI HOT",
+            role="primary",
+            ok=True,
+            status="ok",
+            items=[
+                {
+                    "title_raw": "OpenAI 发布 GPT-5",
+                    "summary_raw": "summary a",
+                    "url": "https://example.com/news?utm_source=a",  # differs from the next url only by a tracking parameter
+                },
+                {
+                    "title_raw": "OpenAI 发布 GPT-5",
+                    "summary_raw": "summary b",
+                    "url": "https://example.com/news",
+                },
+            ],
+        )
+
+        items, _ = normalize_items([source_result], run_date="2026-06-04")
+
+        self.assertEqual(len({item.id for item in items}), 2)  # ids stay distinct...
+        self.assertEqual(items[0].canonical_url, items[1].canonical_url)  # ...even though both items share one canonical url
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()
--- a/tests/test_stage2_dedupe.py
+++ b/tests/test_stage2_dedupe.py
@@ -0,0 +1,63 @@
+import unittest
+
+from ai_daily_report.dedupe import hard_dedup_items
+from ai_daily_report.models import NewsItem
+
+
+def item(  # Test factory: builds a NewsItem for the hard-dedupe tests below.
+    item_id,
+    title,
+    title_norm,
+    url,
+    canonical_url,
+    source_group="AI HOT",
+    source_label="AI HOT",
+    source_priority=100,
+    summary="summary",
+):
+    return NewsItem(
+        id=item_id,
+        source_group=source_group,
+        source_label=source_label,
+        source_role="primary" if source_group == "AI HOT" else "supplement",  # only the "AI HOT" group is treated as primary
+        source_priority=source_priority,
+        title_raw=title,
+        title_norm=title_norm,
+        summary_raw=summary,
+        url=url,
+        canonical_url=canonical_url,
+    )
+
+
+class Stage2DedupeTests(unittest.TestCase):  # Covers hard_dedup_items: exact canonical-url merges and similar-title marking.
+    def test_hard_dedup_merges_same_canonical_url_and_keeps_better_item(self):
+        items = [  # both items share canonical_url https://example.com/a
+            item("a", "OpenAI 发布 GPT-5", "openai发布gpt5", "https://example.com/a?utm_source=x", "https://example.com/a", source_group="RSS", source_priority=50, summary="short"),
+            item("b", "OpenAI 发布 GPT-5", "openai发布gpt5", "https://example.com/a", "https://example.com/a", source_group="AI HOT", source_priority=10, summary="longer summary"),
+        ]
+
+        deduped, report = hard_dedup_items(items)
+
+        self.assertEqual([i.id for i in deduped], ["b"])  # "b" must survive; which attribute (role, priority, summary length) decides is not visible here
+        self.assertEqual(report["input_count"], 2)
+        self.assertEqual(report["output_count"], 1)
+        self.assertEqual(report["removed_count"], 1)
+        self.assertEqual(report["groups"][0]["reason"], "same_canonical_url")
+        self.assertEqual(deduped[0].duplicate_sources[0]["source_group"], "RSS")  # the removed item is recorded on the survivor
+
+    def test_hard_dedup_marks_similar_titles_without_removing(self):
+        items = [  # different urls, overlapping titles
+            item("a", "Grok API 上线 Cloudflare Gateway", "grokapi上线cloudflaregateway", "https://x.com/a", "https://x.com/a"),
+            item("b", "Grok 模型登陆 Cloudflare AI Gateway", "grok模型登陆cloudflareaigateway", "https://x.com/b", "https://x.com/b"),
+        ]
+
+        deduped, report = hard_dedup_items(items)
+
+        self.assertEqual(len(deduped), 2)  # nothing is removed at this stage
+        self.assertEqual(report["removed_count"], 0)
+        self.assertEqual(len(report["possible_duplicates"]), 1)  # the pair is only reported as a candidate
+        self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"})  # compared as a set, so id order is not pinned
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()
--- a/tests/test_stage3_semantic_dedupe.py
+++ b/tests/test_stage3_semantic_dedupe.py
@@ -0,0 +1,129 @@
+import json
+import unittest
+
+from ai_daily_report.models import NewsItem
+from ai_daily_report.semantic_dedupe import semantic_dedup_items
+
+
+def news_item(item_id, title, source_group="AI HOT"):  # Test factory: builds a NewsItem whose other fields derive from the arguments.
+    return NewsItem(
+        id=item_id,
+        source_group=source_group,
+        source_label=source_group,  # label simply mirrors the group
+        source_role="primary" if source_group == "AI HOT" else "supplement",  # only "AI HOT" is primary
+        source_priority=10 if source_group == "AI HOT" else 50,
+        title_raw=title,
+        title_norm=title.lower(),  # lowercase only; does not call the project's normalize_title
+        summary_raw=f"{title} summary",
+        url=f"https://example.com/{item_id}",
+        canonical_url=f"https://example.com/{item_id}",
+    )
+
+
+class Stage3SemanticDedupeTests(unittest.TestCase):  # Covers semantic_dedup_items with a stubbed llm_call returning canned JSON.
+    def test_semantic_dedup_removes_only_high_confidence_duplicates(self):
+        items = [
+            news_item("a", "Anthropic 提交 IPO 招股书", "AI HOT"),
+            news_item("b", "刚刚，Anthropic 提交了招股书", "量子位"),
+            news_item("c", "Grok 上线 Cloudflare Gateway", "AI HOT"),
+        ]
+        candidates = [{"item_ids": ["a", "b"], "reason": "title_similarity"}]  # only a/b are offered for comparison
+
+        def llm_call(prompt):  # stub reply: keep "a", remove "b", with high confidence
+            return json.dumps(
+                {
+                    "duplicate_groups": [
+                        {
+                            "keep_id": "a",
+                            "remove_ids": ["b"],
+                            "confidence": "high",
+                            "reason": "same IPO filing event",
+                        }
+                    ],
+                    "not_duplicates": [],
+                    "uncertain": [],
+                }
+            )
+
+        deduped, report = semantic_dedup_items(items, candidates, llm_call=llm_call)
+
+        self.assertEqual([item.id for item in deduped], ["a", "c"])  # "b" removed, original order kept
+        self.assertEqual(report["removed_count"], 1)
+        self.assertEqual(report["duplicate_groups"][0]["reason"], "same IPO filing event")  # the LLM's reason is carried into the report
+        self.assertEqual(deduped[0].duplicate_sources[0]["id"], "b")  # the removed item is recorded on the kept one
+
+    def test_semantic_dedup_skips_deletion_when_ratio_exceeds_limit(self):
+        items = [
+            news_item("a", "A"),
+            news_item("b", "B"),
+            news_item("c", "C"),
+        ]
+        candidates = [{"item_ids": ["a", "b", "c"], "reason": "llm_candidate"}]
+
+        def llm_call(prompt):  # stub reply: asks to remove two of the three items
+            return json.dumps(
+                {
+                    "duplicate_groups": [
+                        {
+                            "keep_id": "a",
+                            "remove_ids": ["b", "c"],
+                            "confidence": "high",
+                            "reason": "too broad",
+                        }
+                    ],
+                    "not_duplicates": [],
+                    "uncertain": [],
+                }
+            )
+
+        deduped, report = semantic_dedup_items(
+            items,
+            candidates,
+            llm_call=llm_call,
+            max_deletion_ratio=0.5,  # presumably 2 removals out of 3 items exceeds this limit — confirm against semantic_dedup_items
+        )
+
+        self.assertEqual(len(deduped), 3)  # no item is removed when the limit is exceeded
+        self.assertEqual(report["removed_count"], 0)
+        self.assertTrue(report["skipped_for_deletion_ratio"])
+
+    def test_semantic_dedup_ignores_groups_outside_candidate_sets(self):
+        items = [
+            news_item("a", "Suno 完成融资"),
+            news_item("b", "Suno 完成 D 轮融资"),
+            news_item("c", "Ideogram 发布 v4"),
+            news_item("d", "OpenClaw 发布新版"),
+        ]
+        candidates = [{"item_ids": ["a", "b"], "reason": "title_similarity"}]  # c/d are never offered as candidates
+
+        def llm_call(prompt):  # stub reply: one group inside the candidates, one outside
+            return json.dumps(
+                {
+                    "duplicate_groups": [
+                        {
+                            "keep_id": "a",
+                            "remove_ids": ["b"],
+                            "confidence": "high",
+                            "reason": "same Suno event",
+                        },
+                        {
+                            "keep_id": "c",
+                            "remove_ids": ["d"],
+                            "confidence": "high",
+                            "reason": "not part of candidates",
+                        },
+                    ],
+                    "not_duplicates": [],
+                    "uncertain": [],
+                }
+            )
+
+        deduped, report = semantic_dedup_items(items, candidates, llm_call=llm_call)
+
+        self.assertEqual([item.id for item in deduped], ["a", "c", "d"])  # only "b" is removed; the c/d group is ignored
+        self.assertEqual(report["removed_count"], 1)
+        self.assertIn("group_outside_candidates", report["errors"][0])  # the rejected group is reported as an error
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()
--- a/tests/test_stage4_rewrite.py
+++ b/tests/test_stage4_rewrite.py
@@ -0,0 +1,96 @@
+import json
+import unittest
+
+from ai_daily_report.models import NewsItem
+from ai_daily_report.rewrite import rewrite_items
+
+
+def news_item(item_id="a"):  # Test factory: a fixed English-language NewsItem; only the id varies.
+    return NewsItem(
+        id=item_id,
+        source_group="AI HOT",
+        source_label="AI HOT",
+        source_role="primary",
+        source_priority=10,
+        title_raw="OpenAI launches GPT-5 API",
+        title_norm="openailaunchesgpt5api",
+        summary_raw="OpenAI launched the GPT-5 API with better latency.",
+        url="https://example.com/a",  # url stays ".../a" regardless of item_id
+        canonical_url="https://example.com/a",
+    )
+
+
+class Stage4RewriteTests(unittest.TestCase):  # Covers rewrite_items: success, LLM failure fallback, and per-item retry.
+    def test_rewrite_items_writes_display_fields_without_overwriting_raw(self):
+        items = [news_item("a")]
+
+        def llm_call(prompt):  # stub reply: a Chinese title/summary for item "a"
+            return json.dumps(
+                {
+                    "rewrites": [
+                        {
+                            "id": "a",
+                            "title": "OpenAI 发布 GPT-5 API",
+                            "summary": "OpenAI 发布 GPT-5 API，延迟表现更好。",
+                            "flags": [],
+                        }
+                    ]
+                },
+                ensure_ascii=False,
+            )
+
+        rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=10)
+
+        self.assertEqual(rewritten[0].title, "OpenAI 发布 GPT-5 API")  # display fields take the LLM output
+        self.assertEqual(rewritten[0].summary, "OpenAI 发布 GPT-5 API，延迟表现更好。")
+        self.assertEqual(rewritten[0].title_raw, "OpenAI launches GPT-5 API")  # the raw title must stay untouched
+        self.assertEqual(report["rewritten_count"], 1)
+        self.assertEqual(report["fallback_count"], 0)
+
+    def test_rewrite_items_falls_back_when_llm_fails(self):
+        items = [news_item("a")]
+
+        def llm_call(prompt):  # stub that always fails
+            raise TimeoutError("slow")
+
+        rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=10)
+
+        self.assertEqual(rewritten[0].title, "OpenAI launches GPT-5 API")  # fallback: display fields equal the raw fields
+        self.assertEqual(rewritten[0].summary, "OpenAI launched the GPT-5 API with better latency.")
+        self.assertEqual(report["rewritten_count"], 0)
+        self.assertEqual(report["fallback_count"], 1)
+        self.assertIn("TimeoutError", report["errors"][0])  # the exception type name must appear in the error entry
+
+    def test_rewrite_items_retries_failed_batch_as_single_items(self):
+        items = [news_item("a"), news_item("b")]
+        calls = []  # records the item ids seen by each llm_call invocation
+
+        def llm_call(prompt):  # stub: invalid output for multi-item batches, valid output for single items
+            payload = json.loads(prompt)
+            ids = [item["id"] for item in payload["items"]]
+            calls.append(ids)
+            if len(ids) > 1:
+                return "not json"  # forces the batch-level parse to fail
+            return json.dumps(
+                {
+                    "rewrites": [
+                        {
+                            "id": ids[0],
+                            "title": f"title {ids[0]}",
+                            "summary": f"summary {ids[0]}",
+                            "flags": [],
+                        }
+                    ]
+                }
+            )
+
+        rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=2)
+
+        self.assertEqual([item.title for item in rewritten], ["title a", "title b"])
+        self.assertEqual(report["rewritten_count"], 2)
+        self.assertEqual(report["fallback_count"], 0)  # the retry succeeded, so no fallback is counted
+        self.assertEqual(calls, [["a", "b"], ["a"], ["b"]])  # one failed batch call, then one call per item in order
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()
--- a/tests/test_stage5_classify.py
+++ b/tests/test_stage5_classify.py
@@ -0,0 +1,61 @@
+import unittest
+
+from ai_daily_report.classify import SECTION_ORDER, classify_and_order_items
+from ai_daily_report.models import NewsItem
+
+
+def news_item(item_id, title, summary="", section_hint="", source_priority=50):  # Test factory: NewsItem with raw and display fields both filled in.
+    return NewsItem(
+        id=item_id,
+        source_group="AI HOT",
+        source_label="AI HOT",
+        source_role="primary",
+        source_priority=source_priority,
+        title_raw=title,
+        title_norm=title.lower(),  # lowercase only; does not call the project's normalize_title
+        summary_raw=summary or f"{title} summary",  # an empty summary falls back to "<title> summary"
+        title=title,
+        summary=summary or f"{title} summary",
+        url=f"https://example.com/{item_id}",
+        canonical_url=f"https://example.com/{item_id}",
+        section_hint=section_hint,
+    )
+
+
+class Stage5ClassifyTests(unittest.TestCase):  # Covers classify_and_order_items: hint mapping, rule fallback, in-section ordering.
+    def test_classify_maps_legacy_section_hints_to_new_sections(self):
+        items = [news_item("a", "GPT-5 发布", section_hint="模型发布/更新")]
+
+        classified, report = classify_and_order_items(items)
+
+        self.assertEqual(classified[0].section, "模型与能力")  # the hint "模型发布/更新" must map to this section
+        self.assertEqual(report["hint_classified"], 1)
+        self.assertIn("模型与能力", SECTION_ORDER)
+
+    def test_classify_uses_rules_when_hint_is_missing(self):
+        items = [  # no section_hint, so classification must come from title/summary content
+            news_item("a", "Anthropic 提交 IPO 文件", summary="Anthropic 计划上市并提交文件。"),
+            news_item("b", "MCP SDK 发布新版", summary="开发者可用新版 SDK 构建工具。"),
+        ]
+
+        classified, report = classify_and_order_items(items)
+        by_id = {item.id: item for item in classified}  # look up by id so output order does not matter here
+
+        self.assertEqual(by_id["a"].section, "公司与资本")
+        self.assertEqual(by_id["b"].section, "开发与基础设施")
+        self.assertEqual(report["rule_classified"], 2)
+
+    def test_classify_orders_items_by_local_rank_score_within_sections(self):
+        items = [  # deliberately listed in the opposite of the expected output order
+            news_item("low", "普通模型更新", section_hint="模型发布/更新", source_priority=80),
+            news_item("high", "GPT-5 API 发布，延迟降低 30%", section_hint="模型发布/更新", source_priority=10),
+        ]
+
+        classified, report = classify_and_order_items(items)
+
+        self.assertEqual([item.id for item in classified], ["high", "low"])  # "high" must rank first; the scoring inputs are not visible here
+        self.assertEqual(report["section_counts"]["模型与能力"], 2)
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()
--- a/tests/test_stage6_guide.py
+++ b/tests/test_stage6_guide.py
@@ -0,0 +1,77 @@
+import json
+import unittest
+
+from ai_daily_report.guide import generate_guide
+from ai_daily_report.models import NewsItem
+
+
+def news_item(item_id, title, section="模型与能力"):  # Test factory: an already-classified NewsItem for the guide tests.
+    return NewsItem(
+        id=item_id,
+        source_group="AI HOT",
+        source_label="AI HOT",
+        source_role="primary",
+        source_priority=10,
+        title_raw=title,
+        title_norm=title.lower(),  # lowercase only; does not call the project's normalize_title
+        summary_raw=f"{title} summary",
+        title=title,
+        summary=f"{title} summary",
+        url=f"https://example.com/{item_id}",
+        canonical_url=f"https://example.com/{item_id}",
+        section=section,
+    )
+
+
+class Stage6GuideTests(unittest.TestCase):  # Covers generate_guide: thread validation and fallback on LLM failure.
+    def test_generate_guide_returns_theme_and_valid_threads(self):
+        items = [
+            news_item("a", "GPT-5 API 发布"),
+            news_item("b", "Miso One 开源语音模型"),
+        ]
+
+        def llm_call(prompt):  # stub reply: one valid thread and one citing an unknown item id
+            return json.dumps(
+                {
+                    "theme": "模型能力继续向 API 和实时语音两端推进。",
+                    "threads": [
+                        {
+                            "title": "模型能力继续推进",
+                            "text": "GPT-5 API 和 Miso One 分别代表 API 能力和语音模型更新。",
+                            "item_ids": ["a", "b"],
+                            "kind": "thread",
+                        },
+                        {
+                            "title": "无效脉络",
+                            "text": "这条引用了不存在的条目。",
+                            "item_ids": ["missing"],  # id not present in items, so this thread should be dropped
+                            "kind": "thread",
+                        },
+                    ],
+                },
+                ensure_ascii=False,
+            )
+
+        guide, report = generate_guide(items, llm_call=llm_call)
+
+        self.assertEqual(guide["theme"], "模型能力继续向 API 和实时语音两端推进。")
+        self.assertEqual(len(guide["threads"]), 1)  # only the valid thread survives
+        self.assertEqual(guide["threads"][0]["item_ids"], ["a", "b"])
+        self.assertEqual(report["dropped_thread_count"], 1)
+
+    def test_generate_guide_falls_back_when_llm_fails(self):
+        items = [news_item("a", "GPT-5 API 发布")]
+
+        def llm_call(prompt):  # stub that always fails
+            raise TimeoutError("slow")
+
+        guide, report = generate_guide(items, llm_call=llm_call)
+
+        self.assertEqual(guide["theme"], "")  # fallback guide is empty rather than an exception
+        self.assertEqual(guide["threads"], [])
+        self.assertTrue(report["fallback_used"])
+        self.assertIn("TimeoutError", report["errors"][0])  # the exception type name must appear in the error entry
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()
--- a/tests/test_stage7_assemble.py
+++ b/tests/test_stage7_assemble.py
@@ -0,0 +1,65 @@
+import unittest
+
+from ai_daily_report.assemble import assemble_markdown, validate_markdown
+from ai_daily_report.models import NewsItem
+
+
+def news_item(item_id, title, section):  # Test factory: a classified NewsItem for the markdown assembly tests.
+    return NewsItem(
+        id=item_id,
+        source_group="AI HOT",
+        source_label="OpenAI：Blog",
+        source_role="primary",
+        source_priority=10,
+        title_raw=title,
+        title_norm=title.lower(),  # lowercase only; does not call the project's normalize_title
+        summary_raw=f"{title} summary",
+        title=title,
+        summary=f"{title} summary",
+        url=f"https://example.com/{item_id}",
+        canonical_url=f"https://example.com/{item_id}",
+        section=section,
+    )
+
+
+class Stage7AssembleTests(unittest.TestCase):  # Covers assemble_markdown rendering and validate_markdown blocking errors.
+    def test_assemble_markdown_renders_sections_and_daily_threads(self):
+        items = [
+            news_item("a", "GPT-5 API 发布", "模型与能力"),
+            news_item("b", "Anthropic 提交 IPO 文件", "公司与资本"),
+        ]
+        guide = {
+            "theme": "> 模型和资本两条线都在推进。[1]",  # already carries a "> " prefix and a "[1]" marker that must not leak into the output
+            "threads": [
+                {
+                    "title": "模型能力产品化",
+                    "text": "GPT-5 API 发布，说明模型能力继续进入产品入口。",
+                    "item_ids": ["a"],
+                    "kind": "thread",
+                }
+            ],
+        }
+
+        md, report = assemble_markdown(items, guide)
+
+        self.assertIn("## 导览", md)
+        self.assertIn("> 模型和资本两条线都在推进。", md)
+        self.assertIn("## 模型与能力", md)
+        self.assertIn("**1. GPT-5 API 发布**", md)  # items are numbered...
+        self.assertIn("**2. Anthropic 提交 IPO 文件**", md)  # ...and numbering continues across sections
+        self.assertIn("## 今日脉络", md)
+        self.assertIn("- **模型能力产品化**", md)
+        self.assertNotIn("> >", md)  # the blockquote marker must not be doubled
+        self.assertNotIn("[1]", md)  # the citation-style marker must be stripped
+        self.assertEqual(report["item_count"], 2)
+        self.assertEqual(report["blocking_errors"], [])
+
+    def test_validate_markdown_blocks_empty_report(self):
+        report = validate_markdown("", [])  # empty markdown and no items
+
+        self.assertIn("no_items", report["blocking_errors"])
+        self.assertIn("markdown_too_short", report["blocking_errors"])
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()
--- a/tests/test_stage8_publish.py
+++ b/tests/test_stage8_publish.py
@@ -0,0 +1,76 @@
+import unittest
+
+from ai_daily_report.publish import publish_markdown
+
+
+class FakeBlogClient:  # Test double for the blog client: records what it was asked to create and publish.
+    def __init__(self):
+        self.created_payload = None  # stays None until create_post is called
+        self.published_slug = None  # stays None until publish_post is called
+
+    def create_post(self, payload):
+        self.created_payload = payload
+        return {"slug": "ai-2026-06-04"}  # fixed slug, independent of the payload
+
+    def publish_post(self, slug):
+        self.published_slug = slug
+
+
+class Stage8PublishTests(unittest.TestCase):  # Covers publish_markdown in dry-run, blocked, and publish modes.
+    def test_publish_markdown_dry_run_does_not_call_client(self):
+        result = publish_markdown(
+            title="AI日报 · 2026-06-04",
+            markdown="## 导览\n\n> ok",
+            tags=["AI日报"],
+            slug="ai-2026-06-04",
+            base_url="https://blog.example",
+            mode="dry-run",
+            markdown_report={"blocking_errors": []},
+            client=None,  # with no client at all, a dry run must still succeed
+        )
+
+        self.assertEqual(result.status, "ok")
+        self.assertEqual(result.mode, "dry-run")
+        self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04")  # base_url + "/posts/" + slug
+        self.assertTrue(result.public_ok)
+
+    def test_publish_markdown_blocks_when_markdown_has_errors(self):
+        client = FakeBlogClient()
+
+        result = publish_markdown(
+            title="AI日报 · 2026-06-04",
+            markdown="bad",
+            tags=["AI日报"],
+            slug="ai-2026-06-04",
+            base_url="https://blog.example",
+            mode="publish",
+            markdown_report={"blocking_errors": ["markdown_too_short"]},  # a non-empty list must block publishing
+            client=client,
+        )
+
+        self.assertEqual(result.status, "blocked")
+        self.assertIsNone(client.created_payload)  # create_post was never called
+        self.assertIn("markdown_too_short", result.error)
+
+    def test_publish_markdown_publish_mode_calls_client(self):
+        client = FakeBlogClient()
+
+        result = publish_markdown(
+            title="AI日报 · 2026-06-04",
+            markdown="## 导览\n\n> ok",
+            tags=["AI日报"],
+            slug="ai-2026-06-04",
+            base_url="https://blog.example",
+            mode="publish",
+            markdown_report={"blocking_errors": []},
+            client=client,
+        )
+
+        self.assertEqual(result.status, "ok")
+        self.assertEqual(client.created_payload["title"], "AI日报 · 2026-06-04")  # the payload passed to create_post carries the title
+        self.assertEqual(client.published_slug, "ai-2026-06-04")  # publish_post was called with the slug
+        self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04")
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -0,0 +1,14 @@
+import unittest
+
+from ai_daily_report.validate import validate_report_markdown
+
+
+class ValidateTests(unittest.TestCase):  # Smoke test for validate_report_markdown in ai_daily_report.validate.
+    def test_validate_report_markdown_delegates_markdown_checks(self):
+        report = validate_report_markdown("", [])  # empty markdown and no items
+
+        self.assertIn("no_items", report["blocking_errors"])
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+    unittest.main()