def load_pipeline_config(path: Path) -> dict[str, Any]:
    """Load the optional pipeline configuration from ``path``.

    Args:
        path: Location of the JSON pipeline config file.

    Returns:
        The parsed top-level object, or an empty dict when ``path`` does not
        exist (the file is optional).

    Raises:
        ValueError: If the file parses to anything other than an object.
    """
    # A missing file is not an error; an empty mapping is returned instead.
    if not path.exists():
        return {}
    raw = load_json(path)
    if not isinstance(raw, dict):
        raise ValueError("pipeline config must be an object")
    return raw
# Compiled once at import time: a run of ASCII letters/digits is one token,
# and each character in U+4E00-U+9FFF is a token of its own.
_TITLE_TOKEN_RE = re.compile(r"[a-z0-9]+|[\u4e00-\u9fff]")


def _title_tokens(value: str) -> set[str]:
    """Return the set of comparison tokens found in ``value``.

    The text is lowercased first; anything the token pattern does not match
    (spaces, punctuation, other scripts) only separates tokens.
    """
    if not value:
        return set()
    return set(_TITLE_TOKEN_RE.findall(value.lower()))


def _jaccard_similarity(left: set[str], right: set[str]) -> float:
    """Return intersection size over union size, or 0.0 if either set is empty."""
    if not left or not right:
        return 0.0
    return len(left & right) / len(left | right)


def _parse_date(value: str | None) -> date | None:
    """Parse an ISO date or datetime string into a ``date``.

    Returns ``None`` for empty or unparseable input instead of raising.
    """
    if not value:
        return None
    text = value.strip()
    try:
        # Fast path: the first ten characters are "YYYY-MM-DD".
        return date.fromisoformat(text[:10])
    except ValueError:
        try:
            return datetime.fromisoformat(text).date()
        except ValueError:
            return None


def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
    """Tell whether a history entry is recent enough to count as a duplicate.

    A negative ``max_age_days`` disables the age limit. When either the run
    date or the entry's dates cannot be parsed the entry is treated as inside
    the window.
    """
    if max_age_days < 0:
        return True
    current = _parse_date(run_date)
    previous = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
    if current is None or previous is None:
        return True
    # NOTE(review): an entry whose last_published equals run_date is 0 days
    # old and therefore inside the window, so re-running a date that was
    # already published drops every item from that run. Confirm this is
    # intended.
    return (current - previous).days <= max_age_days


def cross_day_dedup_items(
    items: list[NewsItem],
    published_urls: PublishedUrls | None,
    *,
    run_date: str,
    max_age_days: int = 7,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Drop items whose canonical URL appears in recent publish history.

    Args:
        items: Candidate items, in the order they should be kept.
        published_urls: Publish history; ``None`` means no history.
        run_date: Date of the current run as an ISO string.
        max_age_days: Only history entries at most this many days old remove
            an item; a negative value disables the limit.

    Returns:
        ``(kept_items, report)``. Items without a canonical URL are always
        kept. The report lists every removed item together with the history
        dates that caused the removal.
    """
    history = published_urls or PublishedUrls()
    deduped: list[NewsItem] = []
    removed: list[dict[str, Any]] = []

    for item in items:
        entry = history.urls.get(item.canonical_url) if item.canonical_url else None
        if entry is not None and _entry_within_window(
            entry, run_date=run_date, max_age_days=max_age_days
        ):
            removed.append(
                {
                    "item_id": item.id,
                    "canonical_url": item.canonical_url,
                    "title": item.title or item.title_raw,
                    "first_seen": entry.first_seen,
                    "last_published": entry.last_published,
                }
            )
            continue
        deduped.append(item)

    report = {
        "input_count": len(items),
        "output_count": len(deduped),
        "removed_count": len(removed),
        "removed": removed,
        "max_age_days": max_age_days,
    }
    return deduped, report


@dataclass
class PublishedUrlEntry:
    """Publish history for a single canonical URL."""

    # Date strings; the dedupe code parses them as ISO dates.
    first_seen: str
    last_published: str
    # Titles under which this URL has been published.
    titles: list[str] = field(default_factory=list)


@dataclass
class PublishedUrls:
    """Publish history keyed by canonical URL."""

    version: int = 1
    urls: dict[str, PublishedUrlEntry] = field(default_factory=dict)
    updated_at: str = ""
def run_stage0_to_stage2_5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
    """Run stages 0-2 and then the cross-day URL dedupe (stage 2.5).

    Args:
        source_configs: Source definitions forwarded to stages 0-2.
        run_date: Date of the run, forwarded to every stage.
        fetcher: Callable used by the collection stage.
        published_urls: Publish history used by the cross-day dedupe.
        cross_day_dedup_enabled: When false, stage 2.5 passes items through
            unchanged and reports itself as disabled.
        cross_day_dedup_max_age_days: Age window handed to the dedupe.

    Returns:
        A dict with ``source_results``, the surviving ``items`` and
        ``reports`` (the stage 0-2 reports plus a ``stage2_5`` entry).
    """
    stage2_result = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
    if cross_day_dedup_enabled:
        items, stage2_5_report = cross_day_dedup_items(
            stage2_result["items"],
            published_urls,
            run_date=run_date,
            max_age_days=cross_day_dedup_max_age_days,
        )
    else:
        # Same report shape as the enabled path, so consumers need no
        # special case for a disabled stage.
        items = stage2_result["items"]
        stage2_5_report = {
            "input_count": len(items),
            "output_count": len(items),
            "removed_count": 0,
            "removed": [],
            "enabled": False,
            "max_age_days": cross_day_dedup_max_age_days,
        }
    # Copy before adding a key so the stage 0-2 result is not mutated.
    reports = dict(stage2_result["reports"])
    # The enabled path's report carries no "enabled" key; add it here.
    stage2_5_report.setdefault("enabled", cross_day_dedup_enabled)
    reports["stage2_5"] = stage2_5_report
    return {
        "source_results": stage2_result["source_results"],
        "items": items,
        "reports": reports,
    }
run_stage0_to_stage4( source_configs, @@ -102,6 +159,9 @@ def run_stage0_to_stage5( fetcher=fetcher, semantic_llm_call=semantic_llm_call, rewrite_llm_call=rewrite_llm_call, + published_urls=published_urls, + cross_day_dedup_enabled=cross_day_dedup_enabled, + cross_day_dedup_max_age_days=cross_day_dedup_max_age_days, ) classified_items, stage5_report = classify_and_order_items(stage4_result["items"]) reports = dict(stage4_result["reports"]) @@ -121,6 +181,9 @@ def run_stage0_to_stage6( semantic_llm_call: SemanticLlmCall, rewrite_llm_call: RewriteLlmCall, guide_llm_call: GuideLlmCall, + published_urls: PublishedUrls | None = None, + cross_day_dedup_enabled: bool = True, + cross_day_dedup_max_age_days: int = 7, ) -> dict[str, Any]: stage5_result = run_stage0_to_stage5( source_configs, @@ -128,6 +191,9 @@ def run_stage0_to_stage6( fetcher=fetcher, semantic_llm_call=semantic_llm_call, rewrite_llm_call=rewrite_llm_call, + published_urls=published_urls, + cross_day_dedup_enabled=cross_day_dedup_enabled, + cross_day_dedup_max_age_days=cross_day_dedup_max_age_days, ) guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call) reports = dict(stage5_result["reports"]) @@ -148,6 +214,9 @@ def run_stage0_to_stage7( semantic_llm_call: SemanticLlmCall, rewrite_llm_call: RewriteLlmCall, guide_llm_call: GuideLlmCall, + published_urls: PublishedUrls | None = None, + cross_day_dedup_enabled: bool = True, + cross_day_dedup_max_age_days: int = 7, ) -> dict[str, Any]: stage6_result = run_stage0_to_stage6( source_configs, @@ -156,6 +225,9 @@ def run_stage0_to_stage7( semantic_llm_call=semantic_llm_call, rewrite_llm_call=rewrite_llm_call, guide_llm_call=guide_llm_call, + published_urls=published_urls, + cross_day_dedup_enabled=cross_day_dedup_enabled, + cross_day_dedup_max_age_days=cross_day_dedup_max_age_days, ) markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"]) upstream_blocking_errors: list[str] = [] @@ -187,6 +259,9 
@@ def run_stage0_to_stage8( mode: str, base_url: str, client: BlogClient | None, + published_urls: PublishedUrls | None = None, + cross_day_dedup_enabled: bool = True, + cross_day_dedup_max_age_days: int = 7, ) -> dict[str, Any]: stage7_result = run_stage0_to_stage7( source_configs, @@ -195,6 +270,9 @@ def run_stage0_to_stage8( semantic_llm_call=semantic_llm_call, rewrite_llm_call=rewrite_llm_call, guide_llm_call=guide_llm_call, + published_urls=published_urls, + cross_day_dedup_enabled=cross_day_dedup_enabled, + cross_day_dedup_max_age_days=cross_day_dedup_max_age_days, ) slug = f"ai-{run_date}" publish_result = publish_markdown( diff --git a/ai_daily_report/publish.py b/ai_daily_report/publish.py index 7cf3ccd..39b84e6 100644 --- a/ai_daily_report/publish.py +++ b/ai_daily_report/publish.py @@ -1,8 +1,13 @@ from __future__ import annotations +import json from dataclasses import dataclass +from datetime import date, datetime, timezone +from pathlib import Path from typing import Any, Protocol +from .models import NewsItem, PublishedUrlEntry, PublishedUrls + @dataclass class PublishResult: @@ -22,6 +27,122 @@ class BlogClient(Protocol): ... 
def _parse_date(value: str | None) -> date | None:
    """Parse an ISO date or datetime string into a ``date``.

    Returns ``None`` for empty or unparseable input instead of raising.
    """
    if not value:
        return None
    text = value.strip()
    try:
        # Fast path: the first ten characters are "YYYY-MM-DD".
        return date.fromisoformat(text[:10])
    except ValueError:
        try:
            return datetime.fromisoformat(text).date()
        except ValueError:
            return None


def _published_entry_from_dict(value: Any) -> PublishedUrlEntry | None:
    """Build a history entry from one decoded JSON value.

    Returns ``None`` when ``value`` is not an object or carries neither a
    ``first_seen`` nor a ``last_published`` date. A missing date is filled in
    from the other one.
    """
    if not isinstance(value, dict):
        return None
    first_seen = str(value.get("first_seen") or "")
    last_published = str(value.get("last_published") or first_seen)
    if not first_seen and not last_published:
        return None
    # Only a real sequence is accepted: iterating a bare string would yield
    # one "title" per character and a number would raise TypeError.
    raw_titles = value.get("titles")
    if not isinstance(raw_titles, (list, tuple)):
        raw_titles = []
    titles = [str(title) for title in raw_titles if str(title)]
    return PublishedUrlEntry(
        first_seen=first_seen or last_published,
        last_published=last_published or first_seen,
        titles=titles,
    )


def load_published_urls(path: Path) -> PublishedUrls:
    """Load the publish history stored at ``path``.

    Loading is best-effort: a missing, unreadable or malformed file yields an
    empty history rather than an exception, and malformed entries inside an
    otherwise valid file are skipped.
    """
    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return PublishedUrls()
    if not isinstance(raw, dict):
        return PublishedUrls()

    raw_urls = raw.get("urls")
    if not isinstance(raw_urls, dict):
        raw_urls = {}
    urls: dict[str, PublishedUrlEntry] = {}
    for canonical_url, value in raw_urls.items():
        if not canonical_url:
            continue
        entry = _published_entry_from_dict(value)
        if entry is not None:
            urls[str(canonical_url)] = entry

    try:
        version = int(raw.get("version") or 1)
    except (TypeError, ValueError):
        # A non-numeric version must not discard the URLs parsed above.
        version = 1
    return PublishedUrls(
        version=version,
        urls=urls,
        updated_at=str(raw.get("updated_at") or ""),
    )


def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
    """Tell whether a history entry is at most ``max_age_days`` days old.

    A negative ``max_age_days`` disables the limit. When either date cannot
    be parsed the entry is treated as inside the window.
    """
    if max_age_days < 0:
        return True
    current = _parse_date(run_date)
    previous = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
    if current is None or previous is None:
        return True
    return (current - previous).days <= max_age_days


def _published_urls_to_dict(history: PublishedUrls) -> dict[str, Any]:
    """Convert the history into a JSON-serialisable dict, URLs sorted."""
    return {
        "version": history.version,
        "urls": {
            canonical_url: {
                "first_seen": entry.first_seen,
                "last_published": entry.last_published,
                "titles": entry.titles,
            }
            for canonical_url, entry in sorted(history.urls.items())
        },
        "updated_at": history.updated_at,
    }


def update_published_urls(
    path: Path,
    items: list[NewsItem],
    *,
    run_date: str,
    max_age_days: int = 7,
) -> PublishedUrls:
    """Record ``items`` as published on ``run_date`` and persist the history.

    Entries older than ``max_age_days`` are pruned first. Every item with a
    canonical URL then gets its entry created or refreshed and its title
    appended when not already recorded.

    Args:
        path: History file; parent directories are created as needed.
        items: Items that were published in this run.
        run_date: Date to store as ``last_published``.
        max_age_days: Retention window; a negative value keeps everything.

    Returns:
        The updated history, as written to disk.
    """
    history = load_published_urls(path)
    history.urls = {
        canonical_url: entry
        for canonical_url, entry in history.urls.items()
        if _entry_within_window(entry, run_date=run_date, max_age_days=max_age_days)
    }

    for item in items:
        if not item.canonical_url:
            continue
        title = item.title or item.title_raw
        entry = history.urls.get(item.canonical_url)
        if entry is None:
            entry = PublishedUrlEntry(
                first_seen=run_date,
                last_published=run_date,
                titles=[],
            )
            history.urls[item.canonical_url] = entry
        entry.last_published = run_date
        if title and title not in entry.titles:
            entry.titles.append(title)

    history.updated_at = datetime.now(timezone.utc).isoformat()
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(_published_urls_to_dict(history), ensure_ascii=False, indent=2)
    # Write to a sibling temp file and rename over the target, so an
    # interrupted write cannot leave a truncated history file behind (which
    # the best-effort loader would silently read as empty).
    tmp_path = path.with_name(f"{path.name}.tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    tmp_path.replace(path)
    return history
out_dir: Path, base_url: str, sources_path: Path | None = None, + pipeline_path: Path | None = None, + history_path: Path | None = None, fetch_text=None, env: dict[str, str] | None = None, llm_client_factory=OpenAICompatibleClient, @@ -96,6 +99,15 @@ def run_daily_report( ) -> dict[str, Any]: fetch_text = fetch_text or default_fetch_text env = env if env is not None else load_env() + pipeline_config_path = pipeline_path or Path("config") / "pipeline.json" + pipeline_config = load_pipeline_config(pipeline_config_path) + cross_day_config = pipeline_config.get("cross_day_dedup", {}) or {} + cross_day_enabled = bool(cross_day_config.get("enabled", True)) + cross_day_max_age_days = int(cross_day_config.get("max_age_days", 7)) + configured_history_path = history_path or Path( + str(cross_day_config.get("history_path") or "~/.hermes/scripts/ai_morning_out/published_urls.json") + ).expanduser() + published_urls = load_published_urls(configured_history_path) if cross_day_enabled else None if source_mode == "mock": source_configs = _mock_source_configs() @@ -141,8 +153,19 @@ def run_daily_report( mode=mode, base_url=base_url, client=blog_client, + published_urls=published_urls, + cross_day_dedup_enabled=cross_day_enabled, + cross_day_dedup_max_age_days=cross_day_max_age_days, ) + if cross_day_enabled and result["publish"].mode == "publish" and result["publish"].status == "ok": + update_published_urls( + configured_history_path, + result["items"], + run_date=run_date, + max_age_days=cross_day_max_age_days, + ) + run_dir = out_dir / run_date run_dir.mkdir(parents=True, exist_ok=True) (run_dir / "blog_markdown.md").write_text(result["markdown"], encoding="utf-8") diff --git a/ai_daily_report/sources/rss.py b/ai_daily_report/sources/rss.py index 1a705f6..67ca496 100644 --- a/ai_daily_report/sources/rss.py +++ b/ai_daily_report/sources/rss.py @@ -1,6 +1,7 @@ from __future__ import annotations import xml.etree.ElementTree as ET +from datetime import date, datetime from email.utils 
def _parse_run_date(value: str | None) -> date | None:
    """Parse the leading ``YYYY-MM-DD`` of ``value``; ``None`` if impossible."""
    if not value:
        return None
    try:
        return date.fromisoformat(value[:10])
    except ValueError:
        return None


def _parse_iso_date(value: str | None) -> date | None:
    """Parse an ISO datetime string and return its date; ``None`` if impossible."""
    if not value:
        return None
    text = value.strip()
    # datetime.fromisoformat only accepts a trailing "Z" from Python 3.11 on;
    # rewrite it as an explicit UTC offset so older interpreters parse it too.
    if text.endswith(("Z", "z")):
        text = f"{text[:-1]}+00:00"
    try:
        return datetime.fromisoformat(text).date()
    except ValueError:
        return None


def _within_max_item_age(published_at: str | None, *, run_date: str | None, max_item_age_days: int | None) -> bool:
    """Tell whether an entry is recent enough to keep.

    Returns ``True`` when no limit is configured or when either date is
    missing or unparseable, so that an entry is never dropped on a guess.
    """
    if max_item_age_days is None:
        return True
    published_date = _parse_iso_date(published_at)
    current_date = _parse_run_date(run_date)
    if published_date is None or current_date is None:
        return True
    return (current_date - published_date).days <= max_item_age_days


def parse_rss_items(
    config: SourceConfig,
    xml_text: str,
    *,
    limit: int = 20,
    run_date: str | None = None,
) -> list[dict[str, Any]]:
    """Parse an RSS document into raw item dicts.

    Entries without a title, and entries older than
    ``config.max_item_age_days`` relative to ``run_date``, are skipped and do
    not count towards ``limit``.

    Args:
        config: Source the feed belongs to; supplies the group name and the
            optional age limit.
        xml_text: The RSS document.
        limit: Maximum number of items returned; zero or less returns none.
        run_date: Date of the run, used for the age filter.

    Returns:
        At most ``limit`` item dicts in feed order.
    """
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    raw_items = channel.findall("item") if channel is not None else []
    items: list[dict[str, Any]] = []
    for raw in raw_items:
        # Checked before any work so that limit <= 0 yields no items;
        # checking after the append would always let one item through.
        if len(items) >= limit:
            break
        title = clean_text(raw.findtext("title") or "")
        if not title:
            continue
        summary = clean_text(raw.findtext("description") or "")
        published_at = _parse_pubdate(raw.findtext("pubDate") or "")
        if not _within_max_item_age(
            published_at,
            run_date=run_date,
            max_item_age_days=config.max_item_age_days,
        ):
            continue
        items.append(
            {
                "source_group": config.name,
                "source_label": config.name,
                "title_raw": title,
                "summary_raw": summary,
                "url": (raw.findtext("link") or "").strip(),
                "published_at": published_at,
                "origin_type": "rss",
                "section_hint": "",
                "language_hint": "en" if title.isascii() else "zh",
            }
        )
    return items


def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the feed at ``config.url`` and parse it for ``run_date``."""
    return parse_rss_items(config, fetch_text(config.url, config.timeout_seconds), run_date=run_date)
import unittest

from ai_daily_report.models import SourceConfig
from ai_daily_report.sources.rss import parse_rss_items


class RssSourceTests(unittest.TestCase):
    """Tests for the per-source age filter applied while parsing RSS."""

    def test_parse_rss_items_filters_entries_older_than_configured_age(self):
        config = SourceConfig(
            name="InfoQ AI",
            type="rss",
            url="https://feed.example/rss",
            max_item_age_days=3,
        )
        # One entry a day before the run date and one a week before it; only
        # the first is inside the three-day window.
        xml = """<rss version="2.0">
  <channel>
    <item>
      <title>Fresh item</title>
      <link>https://example.com/fresh</link>
      <description>Fresh summary</description>
      <pubDate>Sun, 07 Jun 2026 06:25:00 GMT</pubDate>
    </item>
    <item>
      <title>Old item</title>
      <link>https://example.com/old</link>
      <description>Old summary</description>
      <pubDate>Mon, 01 Jun 2026 06:25:00 GMT</pubDate>
    </item>
  </channel>
</rss>
"""

        items = parse_rss_items(config, xml, run_date="2026-06-08")

        self.assertEqual([item["title_raw"] for item in items], ["Fresh item"])

    def test_parse_rss_items_keeps_unparseable_dates_to_avoid_false_drops(self):
        config = SourceConfig(
            name="InfoQ AI",
            type="rss",
            url="https://feed.example/rss",
            max_item_age_days=3,
        )
        # The publication date cannot be parsed, so the age is unknown and
        # the entry must be kept.
        xml = """<rss version="2.0">
  <channel>
    <item>
      <title>No date item</title>
      <link>https://example.com/no-date</link>
      <description>No date summary</description>
      <pubDate>not a date</pubDate>
    </item>
  </channel>
</rss>
"""

        items = parse_rss_items(config, xml, run_date="2026-06-08")

        self.assertEqual([item["title_raw"] for item in items], ["No date item"])


if __name__ == "__main__":
    unittest.main()
def test_run_stage0_to_stage4_filters_published_urls_before_semantic_dedupe(self):
    """An already-published URL is dropped before any LLM stage sees it."""
    configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
    seen_semantic_payloads = []
    seen_rewrite_payloads = []

    def fetcher(config, run_date):
        return [
            {
                "title_raw": "Already published",
                "summary_raw": "Old summary",
                "url": "https://example.com/already",
                "source_label": config.name,
            },
            {
                "title_raw": "Fresh story",
                "summary_raw": "Fresh summary",
                "url": "https://example.com/fresh",
                "source_label": config.name,
            },
        ]

    def semantic_llm_call(prompt):
        seen_semantic_payloads.append(json.loads(prompt))
        return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})

    def rewrite_llm_call(prompt):
        payload = json.loads(prompt)
        seen_rewrite_payloads.append(payload)
        return json.dumps(
            {
                "rewrites": [
                    {
                        "id": entry["id"],
                        "title": entry["title_raw"],
                        "summary": entry["summary_raw"],
                        "flags": [],
                    }
                    for entry in payload["items"]
                ]
            }
        )

    published_urls = PublishedUrls(
        urls={
            "https://example.com/already": PublishedUrlEntry(
                first_seen="2026-06-07",
                last_published="2026-06-07",
                titles=["Already published"],
            )
        }
    )

    result = run_stage0_to_stage4(
        configs,
        "2026-06-08",
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        published_urls=published_urls,
    )

    self.assertEqual([entry.title_raw for entry in result["items"]], ["Fresh story"])
    self.assertEqual(result["reports"]["stage2_5"]["removed_count"], 1)
    self.assertEqual(
        result["reports"]["stage2_5"]["removed"][0]["canonical_url"],
        "https://example.com/already",
    )
    # The point of the test: whatever the semantic stage was sent (possibly
    # nothing), the filtered item must not appear in it.
    for payload in seen_semantic_payloads:
        self.assertNotIn("Already published", json.dumps(payload, ensure_ascii=False))
    self.assertEqual([entry["title_raw"] for entry in seen_rewrite_payloads[0]["items"]], ["Fresh story"])
Anthropic", "openai定制芯片核心成员clivechan跳槽至anthropic", "https://example.com/a", "https://example.com/a"), + item("b", "OpenAI chip core member defects to Anthropic before mass production", "openai芯片核心叛逃anthropic就在量产前夜", "https://example.com/b", "https://example.com/b"), + ] + + deduped, report = hard_dedup_items(items) + + self.assertEqual(len(deduped), 2) + self.assertEqual(report["removed_count"], 0) + self.assertEqual(len(report["possible_duplicates"]), 1) + self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"}) + + def test_cross_day_dedup_filters_recently_published_canonical_urls_only(self): + items = [ + item("old", "Old URL", "oldurl", "https://example.com/old", "https://example.com/old"), + item("new", "New URL", "newurl", "https://example.com/new", "https://example.com/new"), + item("missing", "Missing URL", "missingurl", "", ""), + ] + published_urls = PublishedUrls( + urls={ + "https://example.com/old": PublishedUrlEntry( + first_seen="2026-06-07", + last_published="2026-06-07", + titles=["Old URL"], + ) + } + ) + + deduped, report = cross_day_dedup_items( + items, + published_urls, + run_date="2026-06-08", + max_age_days=7, + ) + + self.assertEqual([entry.id for entry in deduped], ["new", "missing"]) + self.assertEqual(report["input_count"], 3) + self.assertEqual(report["output_count"], 2) + self.assertEqual(report["removed_count"], 1) + self.assertEqual(report["removed"][0]["item_id"], "old") + + def test_cross_day_dedup_ignores_urls_outside_history_window(self): + items = [ + item("stale", "Stale URL", "staleurl", "https://example.com/stale", "https://example.com/stale"), + ] + published_urls = PublishedUrls( + urls={ + "https://example.com/stale": PublishedUrlEntry( + first_seen="2026-05-01", + last_published="2026-05-01", + titles=["Stale URL"], + ) + } + ) + + deduped, report = cross_day_dedup_items( + items, + published_urls, + run_date="2026-06-08", + max_age_days=7, + ) + + self.assertEqual([entry.id for entry in deduped], 
["stale"]) + self.assertEqual(report["removed_count"], 0) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_stage8_publish.py b/tests/test_stage8_publish.py index 0f7e342..169c6ae 100644 --- a/tests/test_stage8_publish.py +++ b/tests/test_stage8_publish.py @@ -1,6 +1,9 @@ import unittest +from pathlib import Path +from tempfile import TemporaryDirectory -from ai_daily_report.publish import publish_markdown +from ai_daily_report.models import NewsItem +from ai_daily_report.publish import load_published_urls, publish_markdown, update_published_urls class FakeBlogClient: @@ -71,6 +74,46 @@ class Stage8PublishTests(unittest.TestCase): self.assertEqual(client.published_slug, "ai-2026-06-04") self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04") + def test_update_published_urls_writes_canonical_urls_for_final_items(self): + with TemporaryDirectory() as temp_dir: + history_path = Path(temp_dir) / "published_urls.json" + items = [ + NewsItem( + id="a", + source_group="AI HOT", + source_label="AI HOT", + source_role="primary", + source_priority=10, + title_raw="Fresh story", + title_norm="freshstory", + summary_raw="summary", + url="https://example.com/fresh?utm_source=x", + canonical_url="https://example.com/fresh", + title="Fresh story", + ), + NewsItem( + id="missing", + source_group="AI HOT", + source_label="AI HOT", + source_role="primary", + source_priority=10, + title_raw="Missing URL", + title_norm="missingurl", + summary_raw="summary", + url="", + canonical_url="", + ), + ] + + update_published_urls(history_path, items, run_date="2026-06-08", max_age_days=7) + loaded = load_published_urls(history_path) + + self.assertIn("https://example.com/fresh", loaded.urls) + self.assertNotIn("", loaded.urls) + self.assertEqual(loaded.urls["https://example.com/fresh"].first_seen, "2026-06-08") + self.assertEqual(loaded.urls["https://example.com/fresh"].last_published, "2026-06-08") + 
self.assertEqual(loaded.urls["https://example.com/fresh"].titles, ["Fresh story"]) + if __name__ == "__main__": unittest.main()