Files
ai-daily-report/tests/test_candidate_recall.py

80 lines
2.9 KiB
Python

import unittest
from ai_daily_report.candidate_recall import recall_semantic_candidates
from ai_daily_report.models import NewsItem
from ai_daily_report.normalize import normalize_title
def item(item_id, title, summary):
    """Build a ``NewsItem`` fixture for the candidate-recall tests.

    The source fields are fixed ("AI HOT", role "primary", priority 10).
    ``title`` is stored raw and also passed through ``normalize_title``;
    ``summary`` is stored raw. Both URL fields are set to
    ``https://example.com/<item_id>``.
    """
    normalized = normalize_title(title)
    # The plain and canonical URL are the same string for these fixtures.
    link = f"https://example.com/{item_id}"
    return NewsItem(
        id=item_id,
        source_group="AI HOT",
        source_label="AI HOT",
        source_role="primary",
        source_priority=10,
        title_raw=title,
        title_norm=normalized,
        summary_raw=summary,
        url=link,
        canonical_url=link,
    )
class CandidateRecallTests(unittest.TestCase):
    """Tests for ``recall_semantic_candidates`` using small ``item`` fixtures."""

    def test_recalls_shared_event_entities_when_titles_are_not_stage2_similar(self):
        # "a" and "b" both mention Claude Fable / Claude Mythos in their
        # summaries; "c" is about an unrelated product.
        fable_story = item(
            "a",
            "Anthropic 被曝开发 Claude Fable",
            "Anthropic 正在开发名为 Claude Fable 和 Claude Mythos 的新产品。",
        )
        mythos_story = item(
            "b",
            "Claude Mythos 进入内部测试",
            "Anthropic 的 Claude Mythos 与 Claude Fable 面向内容生成场景。",
        )
        gemini_story = item(
            "c",
            "Gemini CLI 发布更新",
            "Google 为 Gemini CLI 增加新的开发者命令。",
        )

        candidates, report = recall_semantic_candidates(
            [fable_story, mythos_story, gemini_story],
            existing_candidates=[],
        )

        # Compare as sets so the order of ids inside a group does not matter.
        grouped_ids = [set(group["item_ids"]) for group in candidates]
        self.assertIn({"a", "b"}, grouped_ids)
        self.assertNotIn({"a", "c"}, grouped_ids)
        self.assertEqual(report["candidate_group_count"], 1)
        self.assertEqual(candidates[0]["reason"], "strong_entity_overlap")

    def test_does_not_group_same_company_different_products_without_event_overlap(self):
        # Both items mention Google but describe different products.
        gemini_cli = item(
            "gemini",
            "Google 发布 Gemini CLI",
            "Google 发布面向开发者的 Gemini CLI 工具。",
        )
        gemma_model = item(
            "gemma",
            "Google 开源 Gemma 3n",
            "Google 开源 Gemma 3n 模型,面向端侧部署。",
        )

        candidates, report = recall_semantic_candidates(
            [gemini_cli, gemma_model],
            existing_candidates=[],
        )

        self.assertEqual(candidates, [])
        self.assertEqual(report["candidate_group_count"], 0)

    def test_preserves_existing_candidates_and_adds_new_ones_without_duplicates(self):
        first = item(
            "a",
            "Anthropic 发布 Claude Fable",
            "Claude Fable 与 Claude Mythos 同时曝光。",
        )
        second = item(
            "b",
            "Claude Mythos 新功能曝光",
            "Claude Mythos 和 Claude Fable 是 Anthropic 新项目。",
        )
        # A group covering the same two ids is supplied up front.
        already_grouped = [{"item_ids": ["a", "b"], "reason": "title_similarity"}]

        candidates, report = recall_semantic_candidates(
            [first, second],
            existing_candidates=already_grouped,
        )

        # Exactly one group is expected, still carrying the supplied reason.
        self.assertEqual(len(candidates), 1)
        self.assertEqual(candidates[0]["reason"], "title_similarity")
        self.assertEqual(report["existing_candidate_group_count"], 1)
# Run the tests only when this file is executed directly, not when imported.
if __name__ == "__main__":
    unittest.main()