fix: add cross-day dedupe

2026-06-08 12:05:45 +08:00
parent 2671aee850
commit 07786e3bc0
16 changed files with 671 additions and 21 deletions
--- a/ai_daily_report/cli.py
+++ b/ai_daily_report/cli.py
@@ -17,6 +17,8 @@ def build_parser() -> argparse.ArgumentParser:
    run.add_argument("--out-dir", default="runs")
    run.add_argument("--base-url", default="https://blog.ephron.ren")
    run.add_argument("--sources-path", default=None)
+    run.add_argument("--pipeline-path", default=None)
+    run.add_argument("--history-path", default=None)
    return parser


@@ -32,6 +34,8 @@ def main(argv: list[str] | None = None) -> int:
            out_dir=Path(args.out_dir),
            base_url=args.base_url,
            sources_path=Path(args.sources_path) if args.sources_path else None,
+            pipeline_path=Path(args.pipeline_path) if args.pipeline_path else None,
+            history_path=Path(args.history_path) if args.history_path else None,
        )
    return 0

--- a/ai_daily_report/config.py
+++ b/ai_daily_report/config.py
@@ -17,3 +17,12 @@ def load_source_configs(path: Path) -> list[SourceConfig]:
    if not isinstance(raw, list):
        raise ValueError("sources config must be a list")
    return [_source_config_from_dict(item) for item in raw]
+
+
+def load_pipeline_config(path: Path) -> dict[str, Any]:
+    """Load the pipeline settings stored at ``path``.
+
+    A missing file is not an error and yields an empty mapping. When the file
+    exists, the value returned by ``load_json`` must be a dict.
+
+    Raises:
+        ValueError: if the loaded document is not a dict.
+    """
+    if path.exists():
+        config = load_json(path)
+        if isinstance(config, dict):
+            return config
+        raise ValueError("pipeline config must be an object")
+    return {}
--- a/ai_daily_report/dedupe.py
+++ b/ai_daily_report/dedupe.py
@@ -1,9 +1,16 @@
 from __future__ import annotations

 import difflib
+import re
+from datetime import date, datetime
 from typing import Any

-from .models import NewsItem
+from .models import NewsItem, PublishedUrlEntry, PublishedUrls
+
+
+# A pair whose SequenceMatcher ratio reaches this value is reported as a
+# possible duplicate on the ratio alone.
+TITLE_SIMILARITY_THRESHOLD = 0.50
+# Minimum token-set Jaccard overlap required by the combined rule, which
+# accepts pairs with a lower ratio when their tokens overlap enough.
+TOKEN_JACCARD_THRESHOLD = 0.40
+# Minimum SequenceMatcher ratio required by the combined rule. Despite the
+# name, it is compared against a similarity ratio, not an edit distance.
+TOKEN_EDIT_DISTANCE_THRESHOLD = 0.40


 def _item_score(item: NewsItem) -> int:
@@ -52,6 +59,18 @@ def _group_by_key(items: list[NewsItem], key_name: str) -> dict[str, list[NewsIt
    return {key: group for key, group in groups.items() if len(group) > 1}


+def _title_tokens(value: str) -> set[str]:
+    """Split a title into a set of comparable tokens.
+
+    The title is lower-cased; each run of ASCII letters/digits becomes one
+    token and each character in the U+4E00..U+9FFF range becomes its own
+    token. An empty title produces an empty set.
+    """
+    if value:
+        matches = re.findall(r"[a-z0-9]+|[\u4e00-\u9fff]", value.lower())
+        return set(matches)
+    return set()
+
+
+def _jaccard_similarity(left: set[str], right: set[str]) -> float:
+    """Return |left & right| / |left | right|, or 0.0 if either set is empty."""
+    if left and right:
+        shared = left.intersection(right)
+        combined = left.union(right)
+        return len(shared) / len(combined)
+    return 0.0
+
+
 def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
    possible: list[dict[str, Any]] = []
    for index, left in enumerate(items):
@@ -59,12 +78,16 @@ def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
            if not left.title_norm or not right.title_norm:
                continue
            ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
-            if ratio >= 0.65:
+            jaccard = _jaccard_similarity(_title_tokens(left.title_norm), _title_tokens(right.title_norm))
+            if ratio >= TITLE_SIMILARITY_THRESHOLD or (
+                ratio >= TOKEN_EDIT_DISTANCE_THRESHOLD and jaccard >= TOKEN_JACCARD_THRESHOLD
+            ):
                possible.append(
                    {
                        "item_ids": [left.id, right.id],
                        "reason": "title_similarity",
                        "similarity": round(ratio, 3),
+                        "token_jaccard": round(jaccard, 3),
                        "confidence": "medium",
                    }
                )
@@ -98,3 +121,62 @@ def hard_dedup_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, A
        "possible_duplicates": _possible_duplicates(deduped),
    }
    return deduped, report
+
+
+def _parse_date(value: str | None) -> date | None:
+    """Parse an ISO date or datetime string into a ``date``.
+
+    The first ten characters of the stripped text are tried as an ISO date;
+    if that fails, the whole text is tried as an ISO datetime. Returns
+    ``None`` for empty input or when neither form parses.
+    """
+    if not value:
+        return None
+    cleaned = value.strip()
+    attempts = (
+        lambda: date.fromisoformat(cleaned[:10]),
+        lambda: datetime.fromisoformat(cleaned).date(),
+    )
+    for attempt in attempts:
+        try:
+            return attempt()
+        except ValueError:
+            continue
+    return None
+
+
+def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
+    """Tell whether a history entry still counts as recently published.
+
+    A negative ``max_age_days`` disables the age limit. The entry's age is
+    measured from ``last_published`` (falling back to ``first_seen``) to
+    ``run_date``. If either date cannot be parsed, the entry is treated as
+    inside the window.
+    """
+    if max_age_days >= 0:
+        current = _parse_date(run_date)
+        previous = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
+        if current is not None and previous is not None:
+            return (current - previous).days <= max_age_days
+    return True
+
+
+def cross_day_dedup_items(
+    items: list[NewsItem],
+    published_urls: PublishedUrls | None,
+    *,
+    run_date: str,
+    max_age_days: int = 7,
+) -> tuple[list[NewsItem], dict[str, Any]]:
+    """Drop items whose canonical URL was already published recently.
+
+    Args:
+        items: Candidate items for the current run.
+        published_urls: Publication history; ``None`` is treated as empty.
+        run_date: Date of the current run, used to age history entries.
+        max_age_days: Window size forwarded to ``_entry_within_window``.
+
+    Returns:
+        ``(kept_items, report)``. The report holds input/output/removed
+        counts, one record per removed item, and the window size used.
+    """
+    known_urls = (published_urls or PublishedUrls()).urls
+    kept: list[NewsItem] = []
+    dropped: list[dict[str, Any]] = []
+
+    for candidate in items:
+        # Items without a canonical URL can never match the history.
+        previous = known_urls.get(candidate.canonical_url) if candidate.canonical_url else None
+        if not (previous and _entry_within_window(previous, run_date=run_date, max_age_days=max_age_days)):
+            kept.append(candidate)
+            continue
+        dropped.append(
+            {
+                "item_id": candidate.id,
+                "canonical_url": candidate.canonical_url,
+                "title": candidate.title or candidate.title_raw,
+                "first_seen": previous.first_seen,
+                "last_published": previous.last_published,
+            }
+        )
+
+    return kept, {
+        "input_count": len(items),
+        "output_count": len(kept),
+        "removed_count": len(dropped),
+        "removed": dropped,
+        "max_age_days": max_age_days,
+    }
--- a/ai_daily_report/models.py
+++ b/ai_daily_report/models.py
@@ -14,6 +14,7 @@ class SourceConfig:
    retries: int = 0
    min_items: int = 0
    url: str = ""
+    max_item_age_days: int | None = None


@dataclass
@@ -51,3 +52,17 @@ class NewsItem:
    section: str | None = None
    quality_flags: list[str] = field(default_factory=list)
    duplicate_sources: list[dict[str, Any]] = field(default_factory=list)
+
+
+@dataclass
+class PublishedUrlEntry:
+    """Publication history kept for a single canonical URL."""
+
+    # Run date on which the URL was first recorded.
+    first_seen: str
+    # Run date of the most recent publication that included the URL.
+    last_published: str
+    # Distinct titles under which the URL has been recorded.
+    titles: list[str] = field(default_factory=list)
+
+
+@dataclass
+class PublishedUrls:
+    """Container for the published-URL history used by cross-day dedupe."""
+
+    # Version number stored alongside the history; defaults to 1.
+    version: int = 1
+    # History entries keyed by canonical URL.
+    urls: dict[str, PublishedUrlEntry] = field(default_factory=dict)
+    # Timestamp string of the last update; empty until one is recorded.
+    updated_at: str = ""
--- a/ai_daily_report/pipeline.py
+++ b/ai_daily_report/pipeline.py
@@ -5,9 +5,9 @@ from typing import Any
 from .assemble import assemble_markdown
 from .classify import classify_and_order_items
 from .collect import Fetcher, collect_sources
-from .dedupe import hard_dedup_items
+from .dedupe import cross_day_dedup_items, hard_dedup_items
 from .guide import GuideLlmCall, generate_guide
-from .models import SourceConfig
+from .models import PublishedUrls, SourceConfig
 from .normalize import normalize_items
 from .publish import BlogClient, publish_markdown
 from .rewrite import RewriteLlmCall, rewrite_items
@@ -15,6 +15,7 @@ from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items


 def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
+    max_item_age_days = value.get("max_item_age_days")
    return SourceConfig(
        name=value["name"],
        type=value["type"],
@@ -26,6 +27,7 @@ def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
        retries=int(value.get("retries", 0)),
        min_items=int(value.get("min_items", 0)),
        url=value.get("url", ""),
+        max_item_age_days=int(max_item_age_days) if max_item_age_days is not None else None,
    )


@@ -58,6 +60,43 @@ def run_stage0_to_stage2(
    }


+def run_stage0_to_stage2_5(
+    source_configs: list[dict[str, Any] | SourceConfig],
+    run_date: str,
+    *,
+    fetcher: Fetcher,
+    published_urls: PublishedUrls | None = None,
+    cross_day_dedup_enabled: bool = True,
+    cross_day_dedup_max_age_days: int = 7,
+) -> dict[str, Any]:
+    """Run stages 0-2, then optionally filter out already-published URLs.
+
+    The stage 2 result is extended with a ``stage2_5`` report. When cross-day
+    dedupe is disabled the items pass through unchanged and the report
+    records zero removals with ``enabled`` set to ``False``.
+
+    Returns:
+        A dict with ``source_results``, the (possibly filtered) ``items``,
+        and ``reports`` including the new ``stage2_5`` entry.
+    """
+    upstream = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
+    items = upstream["items"]
+
+    if not cross_day_dedup_enabled:
+        untouched_count = len(items)
+        dedup_report: dict[str, Any] = {
+            "input_count": untouched_count,
+            "output_count": untouched_count,
+            "removed_count": 0,
+            "removed": [],
+            "enabled": False,
+            "max_age_days": cross_day_dedup_max_age_days,
+        }
+    else:
+        items, dedup_report = cross_day_dedup_items(
+            items,
+            published_urls,
+            run_date=run_date,
+            max_age_days=cross_day_dedup_max_age_days,
+        )
+
+    # Copy so the stage 2 result's own reports mapping is not mutated.
+    reports = dict(upstream["reports"])
+    dedup_report.setdefault("enabled", cross_day_dedup_enabled)
+    reports["stage2_5"] = dedup_report
+    return {
+        "source_results": upstream["source_results"],
+        "items": items,
+        "reports": reports,
+    }
+
+
 def run_stage0_to_stage4(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
@@ -65,10 +104,25 @@ def run_stage0_to_stage4(
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
+    published_urls: PublishedUrls | None = None,
+    cross_day_dedup_enabled: bool = True,
+    cross_day_dedup_max_age_days: int = 7,
 ) -> dict[str, Any]:
-    stage2_result = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
-    items = stage2_result["items"]
-    candidates = stage2_result["reports"]["stage2"].get("possible_duplicates", [])
+    stage2_5_result = run_stage0_to_stage2_5(
+        source_configs,
+        run_date,
+        fetcher=fetcher,
+        published_urls=published_urls,
+        cross_day_dedup_enabled=cross_day_dedup_enabled,
+        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
+    )
+    items = stage2_5_result["items"]
+    remaining_ids = {item.id for item in items}
+    candidates = [
+        candidate
+        for candidate in stage2_5_result["reports"]["stage2"].get("possible_duplicates", [])
+        if set(candidate.get("item_ids", [])).issubset(remaining_ids)
+    ]
    semantic_items, stage3_report = semantic_dedup_items(
        items,
        candidates,
@@ -78,11 +132,11 @@ def run_stage0_to_stage4(
        semantic_items,
        llm_call=rewrite_llm_call,
    )
-    reports = dict(stage2_result["reports"])
+    reports = dict(stage2_5_result["reports"])
    reports["stage3"] = stage3_report
    reports["stage4"] = stage4_report
    return {
-        "source_results": stage2_result["source_results"],
+        "source_results": stage2_5_result["source_results"],
        "items": rewritten_items,
        "reports": reports,
    }
@@ -95,6 +149,9 @@ def run_stage0_to_stage5(
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
+    published_urls: PublishedUrls | None = None,
+    cross_day_dedup_enabled: bool = True,
+    cross_day_dedup_max_age_days: int = 7,
 ) -> dict[str, Any]:
    stage4_result = run_stage0_to_stage4(
        source_configs,
@@ -102,6 +159,9 @@ def run_stage0_to_stage5(
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
+        published_urls=published_urls,
+        cross_day_dedup_enabled=cross_day_dedup_enabled,
+        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
    )
    classified_items, stage5_report = classify_and_order_items(stage4_result["items"])
    reports = dict(stage4_result["reports"])
@@ -121,6 +181,9 @@ def run_stage0_to_stage6(
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
+    published_urls: PublishedUrls | None = None,
+    cross_day_dedup_enabled: bool = True,
+    cross_day_dedup_max_age_days: int = 7,
 ) -> dict[str, Any]:
    stage5_result = run_stage0_to_stage5(
        source_configs,
@@ -128,6 +191,9 @@ def run_stage0_to_stage6(
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
+        published_urls=published_urls,
+        cross_day_dedup_enabled=cross_day_dedup_enabled,
+        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
    )
    guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call)
    reports = dict(stage5_result["reports"])
@@ -148,6 +214,9 @@ def run_stage0_to_stage7(
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
+    published_urls: PublishedUrls | None = None,
+    cross_day_dedup_enabled: bool = True,
+    cross_day_dedup_max_age_days: int = 7,
 ) -> dict[str, Any]:
    stage6_result = run_stage0_to_stage6(
        source_configs,
@@ -156,6 +225,9 @@ def run_stage0_to_stage7(
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
+        published_urls=published_urls,
+        cross_day_dedup_enabled=cross_day_dedup_enabled,
+        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
    )
    markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"])
    upstream_blocking_errors: list[str] = []
@@ -187,6 +259,9 @@ def run_stage0_to_stage8(
    mode: str,
    base_url: str,
    client: BlogClient | None,
+    published_urls: PublishedUrls | None = None,
+    cross_day_dedup_enabled: bool = True,
+    cross_day_dedup_max_age_days: int = 7,
 ) -> dict[str, Any]:
    stage7_result = run_stage0_to_stage7(
        source_configs,
@@ -195,6 +270,9 @@ def run_stage0_to_stage8(
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
+        published_urls=published_urls,
+        cross_day_dedup_enabled=cross_day_dedup_enabled,
+        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
    )
    slug = f"ai-{run_date}"
    publish_result = publish_markdown(
--- a/ai_daily_report/publish.py
+++ b/ai_daily_report/publish.py
@@ -1,8 +1,13 @@
 from __future__ import annotations

+import json
 from dataclasses import dataclass
+from datetime import date, datetime, timezone
+from pathlib import Path
 from typing import Any, Protocol

+from .models import NewsItem, PublishedUrlEntry, PublishedUrls
+

@dataclass
 class PublishResult:
@@ -22,6 +27,122 @@ class BlogClient(Protocol):
        ...


+def _parse_date(value: str | None) -> date | None:
+    """Turn an ISO date or datetime string into a ``date``.
+
+    Tries the first ten characters of the stripped text as an ISO date, then
+    the full text as an ISO datetime. Empty or unparseable input gives
+    ``None``.
+    """
+    if not value:
+        return None
+    text = value.strip()
+    try:
+        parsed = date.fromisoformat(text[:10])
+    except ValueError:
+        parsed = None
+    if parsed is None:
+        try:
+            parsed = datetime.fromisoformat(text).date()
+        except ValueError:
+            return None
+    return parsed
+
+
+def _published_entry_from_dict(value: Any) -> PublishedUrlEntry | None:
+    """Build a history entry from one decoded JSON value.
+
+    Args:
+        value: The raw value stored for one URL in the history file.
+
+    Returns:
+        The entry, or ``None`` when ``value`` is not a dict or carries
+        neither a ``first_seen`` nor a ``last_published`` date. A missing
+        date is filled in from the other one.
+    """
+    if not isinstance(value, dict):
+        return None
+    first_seen = str(value.get("first_seen") or "")
+    last_published = str(value.get("last_published") or first_seen)
+    if not first_seen and not last_published:
+        return None
+
+    # The file may be hand-edited or damaged: accept a lone string as a single
+    # title and ignore any other non-list value instead of raising or
+    # splitting a string into characters.
+    raw_titles = value.get("titles")
+    if isinstance(raw_titles, str):
+        raw_titles = [raw_titles]
+    elif not isinstance(raw_titles, (list, tuple)):
+        raw_titles = []
+    titles = [str(title) for title in raw_titles if title is not None and str(title)]
+
+    # last_published is non-empty here: it defaults to first_seen, and the
+    # both-empty case returned above.
+    return PublishedUrlEntry(
+        first_seen=first_seen or last_published,
+        last_published=last_published,
+        titles=titles,
+    )
+
+
+def load_published_urls(path: Path) -> PublishedUrls:
+    """Read the published-URL history file, tolerating missing or damaged data.
+
+    Args:
+        path: Location of the JSON history file.
+
+    Returns:
+        The parsed history. An empty history is returned when the file is
+        missing, unreadable, not valid JSON, or not a JSON object. Individual
+        entries that cannot be interpreted are skipped.
+    """
+    try:
+        raw = json.loads(path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # OSError covers a missing or unreadable file; ValueError covers
+        # invalid JSON and undecodable bytes.
+        return PublishedUrls()
+    if not isinstance(raw, dict):
+        return PublishedUrls()
+
+    raw_urls = raw.get("urls")
+    if not isinstance(raw_urls, dict):
+        # Anything but a mapping cannot be read as URL -> entry pairs.
+        raw_urls = {}
+
+    urls: dict[str, PublishedUrlEntry] = {}
+    for canonical_url, value in raw_urls.items():
+        if not canonical_url:
+            continue
+        entry = _published_entry_from_dict(value)
+        if entry is not None:
+            urls[str(canonical_url)] = entry
+
+    try:
+        version = int(raw.get("version") or 1)
+    except (TypeError, ValueError):
+        # A non-numeric version should not discard an otherwise usable file.
+        version = 1
+    return PublishedUrls(
+        version=version,
+        urls=urls,
+        updated_at=str(raw.get("updated_at") or ""),
+    )
+
+
+def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
+    """Decide whether a history entry is recent enough to keep.
+
+    A negative ``max_age_days`` means no age limit. Otherwise the entry is
+    kept when the number of days between its ``last_published`` date
+    (``first_seen`` as fallback) and ``run_date`` does not exceed the limit.
+    Entries whose dates cannot be parsed are kept.
+    """
+    if max_age_days < 0:
+        return True
+    current = _parse_date(run_date)
+    previous = _parse_date(entry.last_published)
+    if previous is None:
+        previous = _parse_date(entry.first_seen)
+    if current is None or previous is None:
+        return True
+    age = current - previous
+    return age.days <= max_age_days
+
+
+def _published_urls_to_dict(history: PublishedUrls) -> dict[str, Any]:
+    """Convert the history into plain dicts ready for JSON serialization.
+
+    URLs are emitted in sorted order so the written file is stable between
+    runs.
+    """
+    serialized_urls: dict[str, Any] = {}
+    for canonical_url in sorted(history.urls):
+        entry = history.urls[canonical_url]
+        serialized_urls[canonical_url] = {
+            "first_seen": entry.first_seen,
+            "last_published": entry.last_published,
+            "titles": entry.titles,
+        }
+    return {
+        "version": history.version,
+        "urls": serialized_urls,
+        "updated_at": history.updated_at,
+    }
+
+
+def update_published_urls(
+    path: Path,
+    items: list[NewsItem],
+    *,
+    run_date: str,
+    max_age_days: int = 7,
+) -> PublishedUrls:
+    """Record the items just published and persist the history file.
+
+    Entries that have aged out of the window are pruned first. Every item
+    with a canonical URL then gets ``last_published`` set to ``run_date``
+    and its title added if not already recorded.
+
+    Args:
+        path: Location of the JSON history file; parent directories are
+            created as needed.
+        items: Items included in the publication.
+        run_date: Date of the current run.
+        max_age_days: Age limit used when pruning old entries.
+
+    Returns:
+        The updated history that was written to ``path``.
+    """
+    history = load_published_urls(path)
+    history.urls = {
+        canonical_url: entry
+        for canonical_url, entry in history.urls.items()
+        if _entry_within_window(entry, run_date=run_date, max_age_days=max_age_days)
+    }
+
+    for item in items:
+        if not item.canonical_url:
+            continue
+        entry = history.urls.get(item.canonical_url)
+        if entry is None:
+            entry = PublishedUrlEntry(
+                first_seen=run_date,
+                last_published=run_date,
+                titles=[],
+            )
+            history.urls[item.canonical_url] = entry
+        entry.last_published = run_date
+        title = item.title or item.title_raw
+        if title and title not in entry.titles:
+            entry.titles.append(title)
+
+    history.updated_at = datetime.now(timezone.utc).isoformat()
+    payload = json.dumps(_published_urls_to_dict(history), ensure_ascii=False, indent=2)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # Write to a sibling temp file and swap it in, so an interrupted run never
+    # leaves a truncated history file (the loader reads a damaged file as an
+    # empty history, which would let already-published URLs through again).
+    temp_path = path.with_name(path.name + ".tmp")
+    temp_path.write_text(payload, encoding="utf-8")
+    temp_path.replace(path)
+    return history
+
+
 def dry_run_publish(slug: str, base_url: str) -> PublishResult:
    return PublishResult(
        mode="dry-run",
--- a/ai_daily_report/runner.py
+++ b/ai_daily_report/runner.py
@@ -6,10 +6,11 @@ from pathlib import Path
 from typing import Any

 from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text
-from .config import load_source_configs
+from .config import load_pipeline_config, load_source_configs
 from .env import load_env, resolve_blog_token, resolve_llm_config
 from .models import SourceConfig
 from .pipeline import run_stage0_to_stage8
+from .publish import load_published_urls, update_published_urls
 from .sources.registry import get_source_fetcher


@@ -89,6 +90,8 @@ def run_daily_report(
    out_dir: Path,
    base_url: str,
    sources_path: Path | None = None,
+    pipeline_path: Path | None = None,
+    history_path: Path | None = None,
    fetch_text=None,
    env: dict[str, str] | None = None,
    llm_client_factory=OpenAICompatibleClient,
@@ -96,6 +99,15 @@ def run_daily_report(
 ) -> dict[str, Any]:
    fetch_text = fetch_text or default_fetch_text
    env = env if env is not None else load_env()
+    pipeline_config_path = pipeline_path or Path("config") / "pipeline.json"
+    pipeline_config = load_pipeline_config(pipeline_config_path)
+    cross_day_config = pipeline_config.get("cross_day_dedup", {}) or {}
+    cross_day_enabled = bool(cross_day_config.get("enabled", True))
+    cross_day_max_age_days = int(cross_day_config.get("max_age_days", 7))
+    configured_history_path = history_path or Path(
+        str(cross_day_config.get("history_path") or "~/.hermes/scripts/ai_morning_out/published_urls.json")
+    ).expanduser()
+    published_urls = load_published_urls(configured_history_path) if cross_day_enabled else None

    if source_mode == "mock":
        source_configs = _mock_source_configs()
@@ -141,8 +153,19 @@ def run_daily_report(
        mode=mode,
        base_url=base_url,
        client=blog_client,
+        published_urls=published_urls,
+        cross_day_dedup_enabled=cross_day_enabled,
+        cross_day_dedup_max_age_days=cross_day_max_age_days,
    )

+    if cross_day_enabled and result["publish"].mode == "publish" and result["publish"].status == "ok":
+        update_published_urls(
+            configured_history_path,
+            result["items"],
+            run_date=run_date,
+            max_age_days=cross_day_max_age_days,
+        )
+
    run_dir = out_dir / run_date
    run_dir.mkdir(parents=True, exist_ok=True)
    (run_dir / "blog_markdown.md").write_text(result["markdown"], encoding="utf-8")
--- a/ai_daily_report/sources/rss.py
+++ b/ai_daily_report/sources/rss.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import xml.etree.ElementTree as ET
+from datetime import date, datetime
 from email.utils import parsedate_to_datetime
 from typing import Any, Callable

@@ -20,16 +21,57 @@ def _parse_pubdate(value: str) -> str | None:
        return None


-def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]:
+def _parse_run_date(value: str | None) -> date | None:
+    """Parse the first ten characters of ``value`` as an ISO date.
+
+    Returns ``None`` for empty input or when that prefix is not a valid date.
+    """
+    if not value:
+        return None
+    prefix = value[:10]
+    try:
+        parsed = date.fromisoformat(prefix)
+    except ValueError:
+        parsed = None
+    return parsed
+
+
+def _parse_iso_date(value: str | None) -> date | None:
+    """Parse an ISO datetime string and return its calendar date.
+
+    Returns ``None`` for empty input or a string that is not a valid ISO
+    datetime.
+    """
+    parsed = None
+    if value:
+        try:
+            parsed = datetime.fromisoformat(value).date()
+        except ValueError:
+            parsed = None
+    return parsed
+
+
+def _within_max_item_age(published_at: str | None, *, run_date: str | None, max_item_age_days: int | None) -> bool:
+    """Tell whether a feed item is recent enough to keep.
+
+    With no limit configured (``max_item_age_days`` is ``None``) every item
+    passes. Items are also kept when either the publication date or the run
+    date cannot be parsed, so a bad date never causes a drop.
+    """
+    if max_item_age_days is not None:
+        published_date = _parse_iso_date(published_at)
+        current_date = _parse_run_date(run_date)
+        if published_date is not None and current_date is not None:
+            age_days = (current_date - published_date).days
+            return age_days <= max_item_age_days
+    return True
+
+
+def parse_rss_items(
+    config: SourceConfig,
+    xml_text: str,
+    *,
+    limit: int = 20,
+    run_date: str | None = None,
+) -> list[dict[str, Any]]:
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    raw_items = channel.findall("item") if channel is not None else []
    items: list[dict[str, Any]] = []
-    for raw in raw_items[:limit]:
+    for raw in raw_items:
        title = clean_text(raw.findtext("title") or "")
        if not title:
            continue
        summary = clean_text(raw.findtext("description") or "")
+        published_at = _parse_pubdate(raw.findtext("pubDate") or "")
+        if not _within_max_item_age(
+            published_at,
+            run_date=run_date,
+            max_item_age_days=config.max_item_age_days,
+        ):
+            continue
        items.append(
            {
                "source_group": config.name,
@@ -37,15 +79,16 @@ def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) ->
                "title_raw": title,
                "summary_raw": summary,
                "url": (raw.findtext("link") or "").strip(),
-                "published_at": _parse_pubdate(raw.findtext("pubDate") or ""),
+                "published_at": published_at,
                "origin_type": "rss",
                "section_hint": "",
                "language_hint": "en" if title.encode("utf-8").isascii() else "zh",
            }
        )
+        if len(items) >= limit:
+            break
    return items


 def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
-    return parse_rss_items(config, fetch_text(config.url, config.timeout_seconds))
-
+    return parse_rss_items(config, fetch_text(config.url, config.timeout_seconds), run_date=run_date)
--- a/config/pipeline.json
+++ b/config/pipeline.json
@@ -11,6 +11,10 @@
  ],
  "rewrite_batch_size": 10,
  "semantic_dedup_max_deletion_ratio": 0.5,
-  "default_mode": "dry-run"
+  "default_mode": "dry-run",
+  "cross_day_dedup": {
+    "enabled": true,
+    "max_age_days": 7,
+    "history_path": "~/.hermes/scripts/ai_morning_out/published_urls.json"
+  }
 }
-
--- a/config/sources.json
+++ b/config/sources.json
@@ -19,6 +19,7 @@
    "priority": 40,
    "timeout_seconds": 25,
    "retries": 1,
+    "max_item_age_days": 3,
    "enabled": true
  },
  {
@@ -30,6 +31,7 @@
    "priority": 50,
    "timeout_seconds": 25,
    "retries": 1,
+    "max_item_age_days": 5,
    "enabled": true
  },
  {
@@ -55,4 +57,3 @@
    "enabled": true
  }
 ]
-
--- a/tests/test_config_loading.py
+++ b/tests/test_config_loading.py
@@ -16,6 +16,12 @@ class ConfigLoadingTests(unittest.TestCase):
        self.assertEqual(configs[0].name, "AI HOT")
        self.assertEqual(configs[0].type, "aihot")

+    def test_rss_configs_can_set_max_item_age_days(self):
+        # Reads the repository's real sources config, so this also pins the
+        # per-source age limit committed there.
+        configs = load_source_configs(ROOT / "config" / "sources.json")
+        by_name = {config.name: config for config in configs}
+
+        self.assertEqual(by_name["InfoQ AI"].max_item_age_days, 3)
+
    def test_all_configured_source_types_are_registered(self):
        configs = load_source_configs(ROOT / "config" / "sources.json")

--- a/tests/test_rss.py
+++ b/tests/test_rss.py
@@ -0,0 +1,58 @@
+import unittest
+
+from ai_daily_report.models import SourceConfig
+from ai_daily_report.sources.rss import parse_rss_items
+
+
+class RssSourceTests(unittest.TestCase):
+    # Covers the max_item_age_days filter applied while parsing RSS items.
+
+    def test_parse_rss_items_filters_entries_older_than_configured_age(self):
+        config = SourceConfig(
+            name="InfoQ AI",
+            type="rss",
+            url="https://feed.example/rss",
+            max_item_age_days=3,
+        )
+        # Relative to the 2026-06-08 run date, the first item is one day old
+        # and the second is seven days old.
+        xml = """<?xml version="1.0"?>
+<rss><channel>
+  <item>
+    <title>Fresh item</title>
+    <link>https://example.com/fresh</link>
+    <description>Fresh summary</description>
+    <pubDate>Sun, 07 Jun 2026 06:25:00 GMT</pubDate>
+  </item>
+  <item>
+    <title>Old item</title>
+    <link>https://example.com/old</link>
+    <description>Old summary</description>
+    <pubDate>Mon, 01 Jun 2026 06:25:00 GMT</pubDate>
+  </item>
+</channel></rss>"""
+
+        items = parse_rss_items(config, xml, run_date="2026-06-08")
+
+        # Only the item inside the three-day limit survives.
+        self.assertEqual([item["title_raw"] for item in items], ["Fresh item"])
+
+    def test_parse_rss_items_keeps_unparseable_dates_to_avoid_false_drops(self):
+        config = SourceConfig(
+            name="InfoQ AI",
+            type="rss",
+            url="https://feed.example/rss",
+            max_item_age_days=3,
+        )
+        xml = """<?xml version="1.0"?>
+<rss><channel>
+  <item>
+    <title>No date item</title>
+    <link>https://example.com/no-date</link>
+    <description>No date summary</description>
+    <pubDate>not a date</pubDate>
+  </item>
+</channel></rss>"""
+
+        items = parse_rss_items(config, xml, run_date="2026-06-08")
+
+        # An item whose age cannot be determined must not be filtered out.
+        self.assertEqual([item["title_raw"] for item in items], ["No date item"])
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -3,6 +3,7 @@ import json
 from pathlib import Path
 from tempfile import TemporaryDirectory

+from ai_daily_report.publish import load_published_urls
 from ai_daily_report.runner import run_daily_report


@@ -127,6 +128,36 @@ class RunnerTests(unittest.TestCase):
        self.assertGreaterEqual(len(fake_client.prompts), 2)
        self.assertEqual(result["reports"]["stage8"]["status"], "ok")

+    def test_run_daily_report_publish_updates_published_url_history(self):
+        # Minimal stand-in for the blog API client so publishing succeeds
+        # without any network access.
+        class FakeBlogClient:
+            def __init__(self, **kwargs):
+                self.kwargs = kwargs
+
+            def create_post(self, payload):
+                return {"slug": payload["slug"]}
+
+            def publish_post(self, slug):
+                self.slug = slug
+
+        with TemporaryDirectory() as temp_dir:
+            # Keep the history file inside the temp dir so the run never
+            # touches the default history location.
+            history_path = Path(temp_dir) / "published_urls.json"
+            result = run_daily_report(
+                run_date="2026-06-08",
+                mode="publish",
+                source_mode="mock",
+                llm_mode="mock",
+                out_dir=Path(temp_dir) / "out",
+                base_url="https://blog.example",
+                env={"BLOG_SERVICE_TOKEN": "token"},
+                blog_client_factory=FakeBlogClient,
+                history_path=history_path,
+            )
+            history = load_published_urls(history_path)
+
+        # A successful publish must record the published URL under the run
+        # date. NOTE(review): the gpt5 URL presumably comes from the mock
+        # sources - confirm against the mock source data.
+        self.assertEqual(result["reports"]["stage8"]["status"], "ok")
+        self.assertIn("https://example.com/gpt5", history.urls)
+        self.assertEqual(history.urls["https://example.com/gpt5"].last_published, "2026-06-08")
+

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_stage0_to_4_pipeline.py
+++ b/tests/test_stage0_to_4_pipeline.py
@@ -2,6 +2,7 @@ import json
 import unittest

 from ai_daily_report.pipeline import run_stage0_to_stage4
+from ai_daily_report.models import PublishedUrlEntry, PublishedUrls


 class Stage0To4PipelineTests(unittest.TestCase):
@@ -61,6 +62,71 @@ class Stage0To4PipelineTests(unittest.TestCase):
        self.assertIn("stage4", result["reports"])
        self.assertEqual(result["reports"]["stage4"]["rewritten_count"], 2)

+    def test_run_stage0_to_stage4_filters_published_urls_before_semantic_dedupe(self):
+        configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
+        # Capture what each LLM stage receives so the test can check that the
+        # already-published item never reaches them.
+        seen_semantic_payloads = []
+        seen_rewrite_payloads = []
+
+        def fetcher(config, run_date):
+            return [
+                {
+                    "title_raw": "Already published",
+                    "summary_raw": "Old summary",
+                    "url": "https://example.com/already",
+                    "source_label": config.name,
+                },
+                {
+                    "title_raw": "Fresh story",
+                    "summary_raw": "Fresh summary",
+                    "url": "https://example.com/fresh",
+                    "source_label": config.name,
+                },
+            ]
+
+        def semantic_llm_call(prompt):
+            seen_semantic_payloads.append(json.loads(prompt))
+            return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+
+        def rewrite_llm_call(prompt):
+            # Echo every item back unchanged so titles stay comparable.
+            payload = json.loads(prompt)
+            seen_rewrite_payloads.append(payload)
+            return json.dumps(
+                {
+                    "rewrites": [
+                        {
+                            "id": entry["id"],
+                            "title": entry["title_raw"],
+                            "summary": entry["summary_raw"],
+                            "flags": [],
+                        }
+                        for entry in payload["items"]
+                    ]
+                }
+            )
+
+        # History says the first URL was published the day before the run.
+        published_urls = PublishedUrls(
+            urls={
+                "https://example.com/already": PublishedUrlEntry(
+                    first_seen="2026-06-07",
+                    last_published="2026-06-07",
+                    titles=["Already published"],
+                )
+            }
+        )
+
+        result = run_stage0_to_stage4(
+            configs,
+            "2026-06-08",
+            fetcher=fetcher,
+            semantic_llm_call=semantic_llm_call,
+            rewrite_llm_call=rewrite_llm_call,
+            published_urls=published_urls,
+        )
+
+        self.assertEqual([entry.title_raw for entry in result["items"]], ["Fresh story"])
+        self.assertEqual(result["reports"]["stage2_5"]["removed_count"], 1)
+        self.assertEqual([entry["title_raw"] for entry in seen_rewrite_payloads[0]["items"]], ["Fresh story"])
+

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_stage2_dedupe.py
+++ b/tests/test_stage2_dedupe.py
@@ -1,7 +1,7 @@
 import unittest

-from ai_daily_report.dedupe import hard_dedup_items
-from ai_daily_report.models import NewsItem
+from ai_daily_report.dedupe import cross_day_dedup_items, hard_dedup_items
+from ai_daily_report.models import NewsItem, PublishedUrlEntry, PublishedUrls


 def item(
@@ -58,6 +58,72 @@ class Stage2DedupeTests(unittest.TestCase):
        self.assertEqual(len(report["possible_duplicates"]), 1)
        self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"})

+    def test_hard_dedup_marks_lower_similarity_mixed_language_titles_as_candidates(self):  # differently worded titles must be reported as candidates, not removed
+        items = [  # the two entries differ in every argument, including both URL arguments
+            item("a", "OpenAI custom chip lead Clive Chan joins Anthropic", "openai定制芯片核心成员clivechan跳槽至anthropic", "https://example.com/a", "https://example.com/a"),
+            item("b", "OpenAI chip core member defects to Anthropic before mass production", "openai芯片核心叛逃anthropic就在量产前夜", "https://example.com/b", "https://example.com/b"),
+        ]
+
+        deduped, report = hard_dedup_items(items)
+
+        self.assertEqual(len(deduped), 2)  # neither item is dropped
+        self.assertEqual(report["removed_count"], 0)
+        self.assertEqual(len(report["possible_duplicates"]), 1)  # the pair is reported exactly once
+        self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"})  # compared as a set, so id order is not asserted
+
+    def test_cross_day_dedup_filters_recently_published_canonical_urls_only(self):  # only the item whose URL is in recent history is removed
+        items = [
+            item("old", "Old URL", "oldurl", "https://example.com/old", "https://example.com/old"),  # same URL as the history key below
+            item("new", "New URL", "newurl", "https://example.com/new", "https://example.com/new"),  # URL absent from the history
+            item("missing", "Missing URL", "missingurl", "", ""),  # empty URL arguments; asserted below to be kept
+        ]
+        published_urls = PublishedUrls(
+            urls={
+                "https://example.com/old": PublishedUrlEntry(
+                    first_seen="2026-06-07",
+                    last_published="2026-06-07",  # one day before run_date, inside the 7-day window
+                    titles=["Old URL"],
+                )
+            }
+        )
+
+        deduped, report = cross_day_dedup_items(
+            items,
+            published_urls,
+            run_date="2026-06-08",
+            max_age_days=7,
+        )
+
+        self.assertEqual([entry.id for entry in deduped], ["new", "missing"])  # survivors keep their input order
+        self.assertEqual(report["input_count"], 3)
+        self.assertEqual(report["output_count"], 2)
+        self.assertEqual(report["removed_count"], 1)
+        self.assertEqual(report["removed"][0]["item_id"], "old")  # the removal record names the dropped item
+
+    def test_cross_day_dedup_ignores_urls_outside_history_window(self):  # a history entry older than the window must not remove the item
+        items = [
+            item("stale", "Stale URL", "staleurl", "https://example.com/stale", "https://example.com/stale"),  # same URL as the history key below
+        ]
+        published_urls = PublishedUrls(
+            urls={
+                "https://example.com/stale": PublishedUrlEntry(
+                    first_seen="2026-05-01",
+                    last_published="2026-05-01",  # 38 days before run_date, beyond max_age_days=7
+                    titles=["Stale URL"],
+                )
+            }
+        )
+
+        deduped, report = cross_day_dedup_items(
+            items,
+            published_urls,
+            run_date="2026-06-08",
+            max_age_days=7,
+        )
+
+        self.assertEqual([entry.id for entry in deduped], ["stale"])  # the item passes through despite the URL match
+        self.assertEqual(report["removed_count"], 0)
+

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_stage8_publish.py
+++ b/tests/test_stage8_publish.py
@@ -1,6 +1,9 @@
 import unittest
+from pathlib import Path
+from tempfile import TemporaryDirectory

-from ai_daily_report.publish import publish_markdown
+from ai_daily_report.models import NewsItem
+from ai_daily_report.publish import load_published_urls, publish_markdown, update_published_urls


 class FakeBlogClient:
@@ -71,6 +74,46 @@ class Stage8PublishTests(unittest.TestCase):
        self.assertEqual(client.published_slug, "ai-2026-06-04")
        self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04")

+    def test_update_published_urls_writes_canonical_urls_for_final_items(self):  # round trip: write with update_published_urls, read back with load_published_urls
+        with TemporaryDirectory() as temp_dir:  # the directory is deleted on exit, so the history is loaded inside this block
+            history_path = Path(temp_dir) / "published_urls.json"
+            items = [
+                NewsItem(
+                    id="a",
+                    source_group="AI HOT",
+                    source_label="AI HOT",
+                    source_role="primary",
+                    source_priority=10,
+                    title_raw="Fresh story",
+                    title_norm="freshstory",
+                    summary_raw="summary",
+                    url="https://example.com/fresh?utm_source=x",  # differs from canonical_url by the ?utm_source=x query
+                    canonical_url="https://example.com/fresh",
+                    title="Fresh story",  # same text as title_raw, so the titles assertion cannot tell which field is stored
+                ),
+                NewsItem(
+                    id="missing",
+                    source_group="AI HOT",
+                    source_label="AI HOT",
+                    source_role="primary",
+                    source_priority=10,
+                    title_raw="Missing URL",
+                    title_norm="missingurl",
+                    summary_raw="summary",
+                    url="",
+                    canonical_url="",  # empty URLs; asserted below to add no entry under the empty key
+                ),
+            ]
+
+            update_published_urls(history_path, items, run_date="2026-06-08", max_age_days=7)
+            loaded = load_published_urls(history_path)
+
+        self.assertIn("https://example.com/fresh", loaded.urls)  # the key equals the first item's canonical_url
+        self.assertNotIn("", loaded.urls)
+        self.assertEqual(loaded.urls["https://example.com/fresh"].first_seen, "2026-06-08")  # both dates equal the run_date passed above
+        self.assertEqual(loaded.urls["https://example.com/fresh"].last_published, "2026-06-08")
+        self.assertEqual(loaded.urls["https://example.com/fresh"].titles, ["Fresh story"])
+

 if __name__ == "__main__":
    unittest.main()