80 lines
2.9 KiB
Python
80 lines
2.9 KiB
Python
import unittest
|
|
|
|
from ai_daily_report.candidate_recall import recall_semantic_candidates
|
|
from ai_daily_report.models import NewsItem
|
|
from ai_daily_report.normalize import normalize_title
|
|
|
|
|
|
def item(item_id, title, summary):
    """Build a ``NewsItem`` fixture for the candidate-recall tests.

    The source fields are fixed ("AI HOT", role ``primary``, priority 10).
    ``title_norm`` is ``normalize_title(title)``, and both ``url`` and
    ``canonical_url`` are ``https://example.com/<item_id>``.
    """
    normalized = normalize_title(title)
    link = f"https://example.com/{item_id}"
    fields = {
        "id": item_id,
        "source_group": "AI HOT",
        "source_label": "AI HOT",
        "source_role": "primary",
        "source_priority": 10,
        "title_raw": title,
        "title_norm": normalized,
        "summary_raw": summary,
        "url": link,
        "canonical_url": link,
    }
    return NewsItem(**fields)
|
|
|
|
|
|
class CandidateRecallTests(unittest.TestCase):
    """Checks for ``recall_semantic_candidates`` on small hand-built item lists."""

    def test_recalls_shared_event_entities_when_titles_are_not_stage2_similar(self):
        # "a" and "b" have different titles, but each summary names both
        # Claude Fable and Claude Mythos; "c" is about Gemini CLI only.
        fable_story = item(
            "a",
            "Anthropic 被曝开发 Claude Fable",
            "Anthropic 正在开发名为 Claude Fable 和 Claude Mythos 的新产品。",
        )
        mythos_story = item(
            "b",
            "Claude Mythos 进入内部测试",
            "Anthropic 的 Claude Mythos 与 Claude Fable 面向内容生成场景。",
        )
        gemini_story = item(
            "c",
            "Gemini CLI 发布更新",
            "Google 为 Gemini CLI 增加新的开发者命令。",
        )

        recalled, summary = recall_semantic_candidates(
            [fable_story, mythos_story, gemini_story],
            existing_candidates=[],
        )

        # Compare as sets so the order of ids inside a group does not matter.
        grouped_ids = [set(group["item_ids"]) for group in recalled]
        self.assertIn({"a", "b"}, grouped_ids)
        self.assertNotIn({"a", "c"}, grouped_ids)
        self.assertEqual(summary["candidate_group_count"], 1)
        self.assertEqual(recalled[0]["reason"], "strong_entity_overlap")

    def test_does_not_group_same_company_different_products_without_event_overlap(self):
        # Both stories mention Google, but one is about Gemini CLI and the
        # other about Gemma 3n.
        gemini_story = item(
            "gemini",
            "Google 发布 Gemini CLI",
            "Google 发布面向开发者的 Gemini CLI 工具。",
        )
        gemma_story = item(
            "gemma",
            "Google 开源 Gemma 3n",
            "Google 开源 Gemma 3n 模型,面向端侧部署。",
        )

        recalled, summary = recall_semantic_candidates(
            [gemini_story, gemma_story],
            existing_candidates=[],
        )

        self.assertEqual(recalled, [])
        self.assertEqual(summary["candidate_group_count"], 0)

    def test_preserves_existing_candidates_and_adds_new_ones_without_duplicates(self):
        # The pair (a, b) is already supplied as an existing candidate, so the
        # result must still hold exactly one group, carrying the original reason.
        stories = [
            item("a", "Anthropic 发布 Claude Fable", "Claude Fable 与 Claude Mythos 同时曝光。"),
            item("b", "Claude Mythos 新功能曝光", "Claude Mythos 和 Claude Fable 是 Anthropic 新项目。"),
        ]
        prior_group = {"item_ids": ["a", "b"], "reason": "title_similarity"}

        recalled, summary = recall_semantic_candidates(
            stories,
            existing_candidates=[prior_group],
        )

        self.assertEqual(len(recalled), 1)
        self.assertEqual(recalled[0]["reason"], "title_similarity")
        self.assertEqual(summary["existing_candidate_group_count"], 1)
|
|
|
|
|
|
# Run the test suite only when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|