# ai-daily-report/tests/test_runner.py

import unittest
import json
from pathlib import Path
from tempfile import TemporaryDirectory

from ai_daily_report.publish import load_published_urls
from ai_daily_report.runner import run_daily_report


class RunnerTests(unittest.TestCase):
    """Exercise ``run_daily_report`` end to end with mock or fake collaborators.

    Each test runs inside its own temporary directory. Where a test selects a
    "live" source or LLM mode, it supplies a canned ``fetch_text`` callable
    and/or a fake client through the corresponding factory argument.
    """

    @staticmethod
    def _dump_json(target, data):
        # Write ``data`` to ``target`` as UTF-8 encoded JSON (config fixtures).
        target.write_text(json.dumps(data), encoding="utf-8")

    @staticmethod
    def _llm_env():
        # Build a fresh mapping per call so tests never share a mutable dict.
        return {
            "LLM_API_KEY": "test-key",
            "LLM_BASE_URL": "https://llm.example/v1",
            "LLM_MODEL": "test-model",
        }

    def test_run_daily_report_mock_mode_writes_markdown_and_reports(self):
        # Fully mocked dry run: the run directory must hold the markdown, the
        # run report, and each of the intermediate JSON artifacts listed below.
        stage_artifacts = (
            "stage0_sources.json",
            "stage1_items.json",
            "stage2_items.json",
            "stage2_5_items.json",
            "stage2_8_candidates.json",
            "stage3_items.json",
            "stage4_items.json",
            "quality_gate.json",
        )

        with TemporaryDirectory() as workspace:
            outcome = run_daily_report(
                run_date="2026-06-04",
                mode="dry-run",
                source_mode="mock",
                llm_mode="mock",
                out_dir=Path(workspace),
                base_url="https://blog.example",
            )

            # Files are checked while the temporary directory still exists.
            artifacts_dir = Path(outcome["run_dir"])
            self.assertTrue(artifacts_dir.joinpath("blog_markdown.md").exists())
            self.assertTrue(artifacts_dir.joinpath("run_report.json").exists())
            for artifact_name in stage_artifacts:
                artifact_path = artifacts_dir / artifact_name
                self.assertTrue(artifact_path.exists(), artifact_name)
            stage8_report = outcome["reports"]["stage8"]
            self.assertEqual(stage8_report["status"], "ok")

    def test_run_daily_report_passes_pipeline_config_to_stage_functions(self):
        # Values from a custom pipeline.json must be visible in the stage
        # reports (deletion-ratio guard on stage3, batch count on stage4).
        class FakeLlmClient:
            def chat(self, prompt):
                request = json.loads(prompt)

                # Payloads with "candidates": report the first two ids of the
                # first candidate as one high-confidence duplicate group.
                if "candidates" in request:
                    ids = request["candidates"][0]["item_ids"]
                    verdict = {
                        "keep_id": ids[0],
                        "remove_ids": [ids[1]],
                        "confidence": "high",
                        "reason": "same event",
                    }
                    answer = {
                        "duplicate_groups": [verdict],
                        "not_duplicates": [],
                        "uncertain": [],
                    }
                    return json.dumps(answer)

                # Payloads with "allowed_sections": echo each item's raw
                # title and summary back without any flags.
                if "allowed_sections" in request:
                    echoed = []
                    for entry in request["items"]:
                        echoed.append(
                            {
                                "id": entry["id"],
                                "title": entry["title_raw"],
                                "summary": entry["summary_raw"],
                                "flags": [],
                            }
                        )
                    return json.dumps({"rewrites": echoed})

                # Any other payload: a single thread that cites the first item.
                lead_id = request["items"][0]["id"]
                thread = {
                    "title": "Config thread",
                    "text": "Config values reached the pipeline.",
                    "item_ids": [lead_id],
                    "kind": "thread",
                }
                return json.dumps(
                    {
                        "intro": "Daily intro.",
                        "theme": "Pipeline config.",
                        "threads": [thread],
                        "conclusion": "Done.",
                    }
                )

        def make_llm_client(**config):
            return FakeLlmClient()

        with TemporaryDirectory() as workspace:
            root = Path(workspace)

            pipeline_config = root / "pipeline.json"
            pipeline_settings = {
                "semantic_dedup_max_deletion_ratio": 0.1,
                "rewrite_batch_size": 1,
                "cross_day_dedup": {"enabled": False},
            }
            self._dump_json(pipeline_config, pipeline_settings)

            source_config = root / "sources.json"
            primary_source = {
                "name": "AI HOT",
                "type": "rss",
                "url": "https://feed.example/rss",
                "role": "primary",
                "priority": 10,
                "enabled": True,
            }
            self._dump_json(source_config, [primary_source])

            # Two near-identical Anthropic entries plus one unrelated entry.
            rss_feed = """<?xml version="1.0"?><rss><channel>
<item><title>Anthropic launches Claude Code</title><link>https://example.com/a</link><description>Anthropic launches Claude Code for developers.</description></item>
<item><title>Anthropic launch Claude Code</title><link>https://example.com/b</link><description>Anthropic launch Claude Code for coding.</description></item>
<item><title>Gemini CLI update</title><link>https://example.com/c</link><description>Google updates Gemini CLI.</description></item>
</channel></rss>"""

            def fetch_text(url, timeout):
                return rss_feed

            outcome = run_daily_report(
                run_date="2026-06-10",
                mode="dry-run",
                source_mode="live",
                llm_mode="live",
                out_dir=root / "out",
                base_url="https://blog.example",
                sources_path=source_config,
                pipeline_path=pipeline_config,
                fetch_text=fetch_text,
                env=self._llm_env(),
                llm_client_factory=make_llm_client,
            )

        # Only the in-memory result is inspected, so this can run after cleanup.
        reports = outcome["reports"]
        self.assertTrue(reports["stage3"]["skipped_for_deletion_ratio"])
        self.assertEqual(reports["stage4"]["batch_count"], 3)
        self.assertIn("quality_gate", reports)

    def test_run_daily_report_live_sources_can_use_config_and_fetch_text(self):
        # A sources.json entry combined with an injected fetch_text must yield
        # exactly one raw item and a markdown file under <out>/<run_date>/.
        single_item_feed = """<?xml version="1.0"?><rss><channel><item><title>GPT-5 API 发布</title><link>https://example.com/gpt5</link><description>OpenAI 发布 GPT-5 API。</description></item></channel></rss>"""

        def fetch_text(url, timeout):
            return single_item_feed

        with TemporaryDirectory() as workspace:
            root = Path(workspace)
            out_dir = root / "out"
            source_config = root / "sources.json"
            supplement_source = {
                "name": "InfoQ AI",
                "type": "rss",
                "url": "https://feed.example/rss",
                "role": "supplement",
                "priority": 40,
                "enabled": True,
            }
            self._dump_json(source_config, [supplement_source])

            outcome = run_daily_report(
                run_date="2026-06-04",
                mode="dry-run",
                source_mode="live",
                llm_mode="mock",
                out_dir=out_dir,
                base_url="https://blog.example",
                sources_path=source_config,
                fetch_text=fetch_text,
            )

            stage0_report = outcome["reports"]["stage0"]
            self.assertEqual(stage0_report["raw_item_count"], 1)
            markdown_path = out_dir / "2026-06-04" / "blog_markdown.md"
            self.assertTrue(markdown_path.exists())

    def test_run_daily_report_live_llm_uses_env_config_in_dry_run(self):
        # The LLM client factory must receive api_key / base_url / model taken
        # from the env mapping, and the fake client must be prompted >= 2 times.
        class FakeLlmClient:
            def __init__(self):
                self.prompts = []

            def chat(self, prompt):
                self.prompts.append(prompt)

                # Routing is by substring on the raw prompt; the prompt is only
                # parsed as JSON in the branches that need its items.
                if "duplicate_groups" in prompt:
                    nothing_to_merge = {
                        "duplicate_groups": [],
                        "not_duplicates": [],
                        "uncertain": [],
                    }
                    return json.dumps(nothing_to_merge)

                if "rewrites" in prompt:
                    request = json.loads(prompt)
                    echoed = [
                        {
                            "id": entry["id"],
                            "title": entry["title_raw"],
                            "summary": entry["summary_raw"],
                            "flags": [],
                        }
                        for entry in request["items"]
                    ]
                    return json.dumps({"rewrites": echoed})

                lead_id = json.loads(prompt)["items"][0]["id"]
                thread = {
                    "title": "模型 API 更新",
                    "text": "GPT-5 API 发布，说明模型能力继续进入产品入口。",
                    "item_ids": [lead_id],
                    "kind": "thread",
                }
                return json.dumps(
                    {
                        "theme": "模型能力继续进入产品入口。",
                        "threads": [thread],
                    }
                )

        llm = FakeLlmClient()
        seen_kwargs = {}

        def llm_client_factory(**config):
            # Record what the runner passes so it can be asserted afterwards.
            seen_kwargs.update(config)
            return llm

        with TemporaryDirectory() as workspace:
            outcome = run_daily_report(
                run_date="2026-06-04",
                mode="dry-run",
                source_mode="mock",
                llm_mode="live",
                out_dir=Path(workspace),
                base_url="https://blog.example",
                env=self._llm_env(),
                llm_client_factory=llm_client_factory,
            )

        expected_kwargs = (
            ("api_key", "test-key"),
            ("base_url", "https://llm.example/v1"),
            ("model", "test-model"),
        )
        for option, expected in expected_kwargs:
            self.assertEqual(seen_kwargs[option], expected)
        self.assertGreaterEqual(len(llm.prompts), 2)
        self.assertEqual(outcome["reports"]["stage8"]["status"], "ok")

    def test_run_daily_report_publish_updates_published_url_history(self):
        # Publish mode with a fake blog client must record the published URL,
        # stamped with the run date, in the history file at history_path.
        class FakeBlogClient:
            def __init__(self, **kwargs):
                self.kwargs = kwargs

            def create_post(self, payload):
                slug = payload["slug"]
                return {"slug": slug}

            def publish_post(self, slug):
                self.slug = slug

        with TemporaryDirectory() as workspace:
            root = Path(workspace)
            history_path = root / "published_urls.json"
            outcome = run_daily_report(
                run_date="2026-06-08",
                mode="publish",
                source_mode="mock",
                llm_mode="mock",
                out_dir=root / "out",
                base_url="https://blog.example",
                env={"BLOG_SERVICE_TOKEN": "token"},
                blog_client_factory=FakeBlogClient,
                history_path=history_path,
            )
            # Load the history before the temporary directory is removed.
            history = load_published_urls(history_path)

        published_url = "https://example.com/gpt5"
        self.assertEqual(outcome["reports"]["stage8"]["status"], "ok")
        self.assertIn(published_url, history.urls)
        self.assertEqual(history.urls[published_url].last_published, "2026-06-08")


# Run the tests through unittest's command-line entry point when this module
# is executed directly as a script.
if __name__ == "__main__":
    unittest.main()