# ai-daily-report/tests/test_candidate_recall.py

import unittest

from ai_daily_report.candidate_recall import recall_semantic_candidates
from ai_daily_report.models import NewsItem
from ai_daily_report.normalize import normalize_title


def item(item_id, title, summary):
    """Build a ``NewsItem`` fixture for the tests in this module.

    The source fields are fixed ("AI HOT", role "primary", priority 10).
    ``title`` is stored raw and also passed through ``normalize_title``;
    ``summary`` is stored raw. Both URL fields are set to
    ``https://example.com/<item_id>``.
    """
    normalized = normalize_title(title)
    link = f"https://example.com/{item_id}"
    fields = {
        "id": item_id,
        "source_group": "AI HOT",
        "source_label": "AI HOT",
        "source_role": "primary",
        "source_priority": 10,
        "title_raw": title,
        "title_norm": normalized,
        "summary_raw": summary,
        "url": link,
        "canonical_url": link,
    }
    return NewsItem(**fields)


class CandidateRecallTests(unittest.TestCase):
    """Exercise ``recall_semantic_candidates`` with small hand-built item lists."""

    def test_recalls_shared_event_entities_when_titles_are_not_stage2_similar(self):
        # Items "a" and "b" both mention Anthropic, Claude Fable and Claude
        # Mythos in their summaries; item "c" is about Gemini CLI only.
        fixtures = (
            (
                "a",
                "Anthropic 被曝开发 Claude Fable",
                "Anthropic 正在开发名为 Claude Fable 和 Claude Mythos 的新产品。",
            ),
            (
                "b",
                "Claude Mythos 进入内部测试",
                "Anthropic 的 Claude Mythos 与 Claude Fable 面向内容生成场景。",
            ),
            (
                "c",
                "Gemini CLI 发布更新",
                "Google 为 Gemini CLI 增加新的开发者命令。",
            ),
        )
        news = [item(*fields) for fields in fixtures]

        recalled, summary = recall_semantic_candidates(news, existing_candidates=[])

        # Compare groups as sets so member order inside a group is irrelevant.
        grouped_ids = [set(group["item_ids"]) for group in recalled]
        expected_pair = {"a", "b"}
        unrelated_pair = {"a", "c"}
        self.assertIn(expected_pair, grouped_ids)
        self.assertNotIn(unrelated_pair, grouped_ids)
        self.assertEqual(summary["candidate_group_count"], 1)
        first_group = recalled[0]
        self.assertEqual(first_group["reason"], "strong_entity_overlap")

    def test_does_not_group_same_company_different_products_without_event_overlap(self):
        # Both items mention Google, but one covers Gemini CLI and the other
        # Gemma 3n; no candidate group is expected.
        gemini_item = item(
            "gemini",
            "Google 发布 Gemini CLI",
            "Google 发布面向开发者的 Gemini CLI 工具。",
        )
        gemma_item = item(
            "gemma",
            "Google 开源 Gemma 3n",
            "Google 开源 Gemma 3n 模型，面向端侧部署。",
        )
        news = [gemini_item, gemma_item]

        recalled, summary = recall_semantic_candidates(news, existing_candidates=[])

        self.assertEqual(recalled, [])
        self.assertEqual(summary["candidate_group_count"], 0)

    def test_preserves_existing_candidates_and_adds_new_ones_without_duplicates(self):
        # The pair ("a", "b") is already supplied as an existing candidate, so
        # the result should hold exactly that one group with its original reason.
        news = [
            item(
                "a",
                "Anthropic 发布 Claude Fable",
                "Claude Fable 与 Claude Mythos 同时曝光。",
            ),
            item(
                "b",
                "Claude Mythos 新功能曝光",
                "Claude Mythos 和 Claude Fable 是 Anthropic 新项目。",
            ),
        ]
        already_grouped = [{"item_ids": ["a", "b"], "reason": "title_similarity"}]

        recalled, summary = recall_semantic_candidates(
            news,
            existing_candidates=already_grouped,
        )

        self.assertEqual(len(recalled), 1)
        only_group = recalled[0]
        self.assertEqual(only_group["reason"], "title_similarity")
        self.assertEqual(summary["existing_candidate_group_count"], 1)


# Run the tests via unittest's CLI entry point when executed as a script.
if __name__ == "__main__":
    unittest.main()