fix: add cross-day dedupe
This commit is contained in:
@@ -16,6 +16,12 @@ class ConfigLoadingTests(unittest.TestCase):
|
||||
self.assertEqual(configs[0].name, "AI HOT")
|
||||
self.assertEqual(configs[0].type, "aihot")
|
||||
|
||||
def test_rss_configs_can_set_max_item_age_days(self):
    """The "InfoQ AI" entry loaded from config/sources.json has a 3-day age cap."""
    sources_path = ROOT / "config" / "sources.json"

    # Index the loaded configs by name; a later duplicate name overrides an earlier one.
    configs_by_name = {}
    for source_config in load_source_configs(sources_path):
        configs_by_name[source_config.name] = source_config

    infoq_config = configs_by_name["InfoQ AI"]
    self.assertEqual(infoq_config.max_item_age_days, 3)
|
||||
|
||||
def test_all_configured_source_types_are_registered(self):
|
||||
configs = load_source_configs(ROOT / "config" / "sources.json")
|
||||
|
||||
|
||||
58
tests/test_rss.py
Normal file
58
tests/test_rss.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import unittest
|
||||
|
||||
from ai_daily_report.models import SourceConfig
|
||||
from ai_daily_report.sources.rss import parse_rss_items
|
||||
|
||||
|
||||
class RssSourceTests(unittest.TestCase):
    """Checks how ``parse_rss_items`` applies a source's ``max_item_age_days``."""

    @staticmethod
    def _infoq_config():
        """Return the RSS source config used by every test (age cap of 3 days)."""
        return SourceConfig(
            name="InfoQ AI",
            type="rss",
            url="https://feed.example/rss",
            max_item_age_days=3,
        )

    def _parsed_titles(self, feed_xml):
        """Parse ``feed_xml`` for run date 2026-06-08 and return the raw titles kept."""
        parsed = parse_rss_items(self._infoq_config(), feed_xml, run_date="2026-06-08")
        return [entry["title_raw"] for entry in parsed]

    def test_parse_rss_items_filters_entries_older_than_configured_age(self):
        """An entry dated 7 days before the run date is dropped; one dated 1 day before is kept."""
        feed_xml = """<?xml version="1.0"?>
        <rss><channel>
          <item>
            <title>Fresh item</title>
            <link>https://example.com/fresh</link>
            <description>Fresh summary</description>
            <pubDate>Sun, 07 Jun 2026 06:25:00 GMT</pubDate>
          </item>
          <item>
            <title>Old item</title>
            <link>https://example.com/old</link>
            <description>Old summary</description>
            <pubDate>Mon, 01 Jun 2026 06:25:00 GMT</pubDate>
          </item>
        </channel></rss>"""

        self.assertEqual(self._parsed_titles(feed_xml), ["Fresh item"])

    def test_parse_rss_items_keeps_unparseable_dates_to_avoid_false_drops(self):
        """An entry whose pubDate is not a valid date is kept rather than filtered out."""
        feed_xml = """<?xml version="1.0"?>
        <rss><channel>
          <item>
            <title>No date item</title>
            <link>https://example.com/no-date</link>
            <description>No date summary</description>
            <pubDate>not a date</pubDate>
          </item>
        </channel></rss>"""

        self.assertEqual(self._parsed_titles(feed_xml), ["No date item"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -3,6 +3,7 @@ import json
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from ai_daily_report.publish import load_published_urls
|
||||
from ai_daily_report.runner import run_daily_report
|
||||
|
||||
|
||||
@@ -127,6 +128,36 @@ class RunnerTests(unittest.TestCase):
|
||||
self.assertGreaterEqual(len(fake_client.prompts), 2)
|
||||
self.assertEqual(result["reports"]["stage8"]["status"], "ok")
|
||||
|
||||
def test_run_daily_report_publish_updates_published_url_history(self):
    """A publish-mode run writes the published URLs into the history file."""

    class FakeBlogClient:
        """In-memory stand-in for the blog client; it only records what it is given."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

        def create_post(self, payload):
            # Echo the requested slug back as the created post.
            return {"slug": payload["slug"]}

        def publish_post(self, slug):
            self.slug = slug

    with TemporaryDirectory() as workdir:
        workdir_path = Path(workdir)
        history_file = workdir_path / "published_urls.json"
        outcome = run_daily_report(
            run_date="2026-06-08",
            mode="publish",
            source_mode="mock",
            llm_mode="mock",
            out_dir=workdir_path / "out",
            base_url="https://blog.example",
            env={"BLOG_SERVICE_TOKEN": "token"},
            blog_client_factory=FakeBlogClient,
            history_path=history_file,
        )
        # Read the history back while the temporary directory still exists.
        recorded = load_published_urls(history_file)

    self.assertEqual(outcome["reports"]["stage8"]["status"], "ok")
    # NOTE(review): this URL presumably comes from the mock sources — confirm.
    self.assertIn("https://example.com/gpt5", recorded.urls)
    gpt5_entry = recorded.urls["https://example.com/gpt5"]
    self.assertEqual(gpt5_entry.last_published, "2026-06-08")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -2,6 +2,7 @@ import json
|
||||
import unittest
|
||||
|
||||
from ai_daily_report.pipeline import run_stage0_to_stage4
|
||||
from ai_daily_report.models import PublishedUrlEntry, PublishedUrls
|
||||
|
||||
|
||||
class Stage0To4PipelineTests(unittest.TestCase):
|
||||
@@ -61,6 +62,71 @@ class Stage0To4PipelineTests(unittest.TestCase):
|
||||
self.assertIn("stage4", result["reports"])
|
||||
self.assertEqual(result["reports"]["stage4"]["rewritten_count"], 2)
|
||||
|
||||
def test_run_stage0_to_stage4_filters_published_urls_before_semantic_dedupe(self):
    """A URL already in the published history is removed before the LLM stages run.

    The fake fetcher returns one already-published item and one fresh item.
    The test checks that only the fresh item survives, that the ``stage2_5``
    report counts one removal, and that neither fake LLM call is handed the
    already-published item.
    """
    configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
    seen_semantic_payloads = []
    seen_rewrite_payloads = []

    def fetcher(config, run_date):
        # Always returns the same two raw items, labelled with the source name.
        return [
            {
                "title_raw": "Already published",
                "summary_raw": "Old summary",
                "url": "https://example.com/already",
                "source_label": config.name,
            },
            {
                "title_raw": "Fresh story",
                "summary_raw": "Fresh summary",
                "url": "https://example.com/fresh",
                "source_label": config.name,
            },
        ]

    def semantic_llm_call(prompt):
        # Record what semantic dedupe was asked about and report no duplicates.
        seen_semantic_payloads.append(json.loads(prompt))
        return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})

    def rewrite_llm_call(prompt):
        # Record the rewrite request and echo each item's raw title and summary back.
        payload = json.loads(prompt)
        seen_rewrite_payloads.append(payload)
        return json.dumps(
            {
                "rewrites": [
                    {
                        "id": entry["id"],
                        "title": entry["title_raw"],
                        "summary": entry["summary_raw"],
                        "flags": [],
                    }
                    for entry in payload["items"]
                ]
            }
        )

    published_urls = PublishedUrls(
        urls={
            "https://example.com/already": PublishedUrlEntry(
                first_seen="2026-06-07",
                last_published="2026-06-07",
                titles=["Already published"],
            )
        }
    )

    result = run_stage0_to_stage4(
        configs,
        "2026-06-08",
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        published_urls=published_urls,
    )

    self.assertEqual([entry.title_raw for entry in result["items"]], ["Fresh story"])
    self.assertEqual(result["reports"]["stage2_5"]["removed_count"], 1)

    # The test name promises filtering happens *before* semantic dedupe, so the
    # semantic LLM must never be shown the already-published item. The check is
    # shape-agnostic (and trivially true if the call was never made).
    semantic_dump = json.dumps(seen_semantic_payloads, ensure_ascii=False)
    self.assertNotIn("https://example.com/already", semantic_dump)
    self.assertNotIn("Already published", semantic_dump)

    # Fail with a clear message instead of an IndexError if rewrite never ran.
    self.assertTrue(seen_rewrite_payloads, "rewrite_llm_call was never invoked")
    self.assertEqual(
        [entry["title_raw"] for entry in seen_rewrite_payloads[0]["items"]],
        ["Fresh story"],
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import unittest
|
||||
|
||||
from ai_daily_report.dedupe import hard_dedup_items
|
||||
from ai_daily_report.models import NewsItem
|
||||
from ai_daily_report.dedupe import cross_day_dedup_items, hard_dedup_items
|
||||
from ai_daily_report.models import NewsItem, PublishedUrlEntry, PublishedUrls
|
||||
|
||||
|
||||
def item(
|
||||
@@ -58,6 +58,72 @@ class Stage2DedupeTests(unittest.TestCase):
|
||||
self.assertEqual(len(report["possible_duplicates"]), 1)
|
||||
self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"})
|
||||
|
||||
def test_hard_dedup_marks_lower_similarity_mixed_language_titles_as_candidates(self):
    """Two similar mixed-language titles are both kept but reported as possible duplicates."""
    first = item(
        "a",
        "OpenAI custom chip lead Clive Chan joins Anthropic",
        "openai定制芯片核心成员clivechan跳槽至anthropic",
        "https://example.com/a",
        "https://example.com/a",
    )
    second = item(
        "b",
        "OpenAI chip core member defects to Anthropic before mass production",
        "openai芯片核心叛逃anthropic就在量产前夜",
        "https://example.com/b",
        "https://example.com/b",
    )

    kept, report = hard_dedup_items([first, second])

    # Nothing is removed outright ...
    self.assertEqual(len(kept), 2)
    self.assertEqual(report["removed_count"], 0)
    # ... but the pair is flagged as a single candidate group.
    candidates = report["possible_duplicates"]
    self.assertEqual(len(candidates), 1)
    self.assertEqual(set(candidates[0]["item_ids"]), {"a", "b"})
|
||||
|
||||
def test_cross_day_dedup_filters_recently_published_canonical_urls_only(self):
    """Only the item whose URL was published the day before is removed.

    The unseen URL and the item with an empty URL both pass through.
    """
    previously_published = item(
        "old", "Old URL", "oldurl", "https://example.com/old", "https://example.com/old"
    )
    unseen = item(
        "new", "New URL", "newurl", "https://example.com/new", "https://example.com/new"
    )
    without_url = item("missing", "Missing URL", "missingurl", "", "")

    old_entry = PublishedUrlEntry(
        first_seen="2026-06-07",
        last_published="2026-06-07",
        titles=["Old URL"],
    )
    history = PublishedUrls(urls={"https://example.com/old": old_entry})

    kept, report = cross_day_dedup_items(
        [previously_published, unseen, without_url],
        history,
        run_date="2026-06-08",
        max_age_days=7,
    )

    self.assertEqual([entry.id for entry in kept], ["new", "missing"])
    self.assertEqual(report["input_count"], 3)
    self.assertEqual(report["output_count"], 2)
    self.assertEqual(report["removed_count"], 1)
    first_removed = report["removed"][0]
    self.assertEqual(first_removed["item_id"], "old")
|
||||
|
||||
def test_cross_day_dedup_ignores_urls_outside_history_window(self):
    """A URL last published on 2026-05-01 is not filtered with a 7-day window on 2026-06-08."""
    stale_item = item(
        "stale",
        "Stale URL",
        "staleurl",
        "https://example.com/stale",
        "https://example.com/stale",
    )
    stale_entry = PublishedUrlEntry(
        first_seen="2026-05-01",
        last_published="2026-05-01",
        titles=["Stale URL"],
    )
    history = PublishedUrls(urls={"https://example.com/stale": stale_entry})

    kept, report = cross_day_dedup_items(
        [stale_item],
        history,
        run_date="2026-06-08",
        max_age_days=7,
    )

    kept_ids = [entry.id for entry in kept]
    self.assertEqual(kept_ids, ["stale"])
    self.assertEqual(report["removed_count"], 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from ai_daily_report.publish import publish_markdown
|
||||
from ai_daily_report.models import NewsItem
|
||||
from ai_daily_report.publish import load_published_urls, publish_markdown, update_published_urls
|
||||
|
||||
|
||||
class FakeBlogClient:
|
||||
@@ -71,6 +74,46 @@ class Stage8PublishTests(unittest.TestCase):
|
||||
self.assertEqual(client.published_slug, "ai-2026-06-04")
|
||||
self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04")
|
||||
|
||||
def test_update_published_urls_writes_canonical_urls_for_final_items(self):
    """History is keyed by canonical URL, and items with an empty URL are not recorded."""
    # Fields that are identical for both items.
    shared_fields = {
        "source_group": "AI HOT",
        "source_label": "AI HOT",
        "source_role": "primary",
        "source_priority": 10,
        "summary_raw": "summary",
    }
    with_url = NewsItem(
        id="a",
        title_raw="Fresh story",
        title_norm="freshstory",
        url="https://example.com/fresh?utm_source=x",
        canonical_url="https://example.com/fresh",
        title="Fresh story",
        **shared_fields,
    )
    without_url = NewsItem(
        id="missing",
        title_raw="Missing URL",
        title_norm="missingurl",
        url="",
        canonical_url="",
        **shared_fields,
    )

    with TemporaryDirectory() as workdir:
        history_file = Path(workdir) / "published_urls.json"
        update_published_urls(
            history_file,
            [with_url, without_url],
            run_date="2026-06-08",
            max_age_days=7,
        )
        # Reload from disk while the temporary directory still exists.
        reloaded = load_published_urls(history_file)

    self.assertIn("https://example.com/fresh", reloaded.urls)
    self.assertNotIn("", reloaded.urls)
    fresh_entry = reloaded.urls["https://example.com/fresh"]
    self.assertEqual(fresh_entry.first_seen, "2026-06-08")
    self.assertEqual(fresh_entry.last_published, "2026-06-08")
    self.assertEqual(fresh_entry.titles, ["Fresh story"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user