fix: add cross-day dedupe

This commit is contained in:
Mimikko-zeus
2026-06-08 12:05:45 +08:00
parent 2671aee850
commit 07786e3bc0
16 changed files with 671 additions and 21 deletions

View File

@@ -17,6 +17,8 @@ def build_parser() -> argparse.ArgumentParser:
run.add_argument("--out-dir", default="runs") run.add_argument("--out-dir", default="runs")
run.add_argument("--base-url", default="https://blog.ephron.ren") run.add_argument("--base-url", default="https://blog.ephron.ren")
run.add_argument("--sources-path", default=None) run.add_argument("--sources-path", default=None)
run.add_argument("--pipeline-path", default=None)
run.add_argument("--history-path", default=None)
return parser return parser
@@ -32,6 +34,8 @@ def main(argv: list[str] | None = None) -> int:
out_dir=Path(args.out_dir), out_dir=Path(args.out_dir),
base_url=args.base_url, base_url=args.base_url,
sources_path=Path(args.sources_path) if args.sources_path else None, sources_path=Path(args.sources_path) if args.sources_path else None,
pipeline_path=Path(args.pipeline_path) if args.pipeline_path else None,
history_path=Path(args.history_path) if args.history_path else None,
) )
return 0 return 0

View File

@@ -17,3 +17,12 @@ def load_source_configs(path: Path) -> list[SourceConfig]:
if not isinstance(raw, list): if not isinstance(raw, list):
raise ValueError("sources config must be a list") raise ValueError("sources config must be a list")
return [_source_config_from_dict(item) for item in raw] return [_source_config_from_dict(item) for item in raw]
def load_pipeline_config(path: Path) -> dict[str, Any]:
    """Read the optional pipeline settings file at *path*.

    A missing file is not an error: an empty dict is returned so callers can
    fall back to their own defaults.

    Raises:
        ValueError: if the value produced by ``load_json`` is not a dict.
    """
    if path.exists():
        payload = load_json(path)
        if isinstance(payload, dict):
            return payload
        raise ValueError("pipeline config must be an object")
    return {}

View File

@@ -1,9 +1,16 @@
from __future__ import annotations from __future__ import annotations
import difflib import difflib
import re
from datetime import date, datetime
from typing import Any from typing import Any
from .models import NewsItem from .models import NewsItem, PublishedUrlEntry, PublishedUrls
# A pair of normalized titles whose SequenceMatcher ratio reaches this value is
# reported as a possible duplicate on the ratio alone.
TITLE_SIMILARITY_THRESHOLD = 0.50
# Minimum Jaccard overlap of the two title token sets for the combined rule
# (lower ratio allowed when the token overlap is high enough).
TOKEN_JACCARD_THRESHOLD = 0.40
# Minimum SequenceMatcher ratio for the combined rule.
# NOTE(review): despite the name, this is compared against the similarity
# ratio in _possible_duplicates, not against an edit distance.
TOKEN_EDIT_DISTANCE_THRESHOLD = 0.40
def _item_score(item: NewsItem) -> int: def _item_score(item: NewsItem) -> int:
@@ -52,6 +59,18 @@ def _group_by_key(items: list[NewsItem], key_name: str) -> dict[str, list[NewsIt
return {key: group for key, group in groups.items() if len(group) > 1} return {key: group for key, group in groups.items() if len(group) > 1}
def _title_tokens(value: str) -> set[str]:
if not value:
return set()
return set(re.findall(r"[a-z0-9]+|[\u4e00-\u9fff]", value.lower()))
def _jaccard_similarity(left: set[str], right: set[str]) -> float:
if not left or not right:
return 0.0
return len(left & right) / len(left | right)
def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]: def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
possible: list[dict[str, Any]] = [] possible: list[dict[str, Any]] = []
for index, left in enumerate(items): for index, left in enumerate(items):
@@ -59,12 +78,16 @@ def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
if not left.title_norm or not right.title_norm: if not left.title_norm or not right.title_norm:
continue continue
ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio() ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
if ratio >= 0.65: jaccard = _jaccard_similarity(_title_tokens(left.title_norm), _title_tokens(right.title_norm))
if ratio >= TITLE_SIMILARITY_THRESHOLD or (
ratio >= TOKEN_EDIT_DISTANCE_THRESHOLD and jaccard >= TOKEN_JACCARD_THRESHOLD
):
possible.append( possible.append(
{ {
"item_ids": [left.id, right.id], "item_ids": [left.id, right.id],
"reason": "title_similarity", "reason": "title_similarity",
"similarity": round(ratio, 3), "similarity": round(ratio, 3),
"token_jaccard": round(jaccard, 3),
"confidence": "medium", "confidence": "medium",
} }
) )
@@ -98,3 +121,62 @@ def hard_dedup_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, A
"possible_duplicates": _possible_duplicates(deduped), "possible_duplicates": _possible_duplicates(deduped),
} }
return deduped, report return deduped, report
def _parse_date(value: str | None) -> date | None:
if not value:
return None
text = value.strip()
try:
return date.fromisoformat(text[:10])
except ValueError:
try:
return datetime.fromisoformat(text).date()
except ValueError:
return None
def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
if max_age_days < 0:
return True
current = _parse_date(run_date)
previous = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
if current is None or previous is None:
return True
return (current - previous).days <= max_age_days
def cross_day_dedup_items(
    items: list[NewsItem],
    published_urls: PublishedUrls | None,
    *,
    run_date: str,
    max_age_days: int = 7,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Drop items whose canonical URL was already published recently.

    An item is removed when its ``canonical_url`` has an entry in
    *published_urls* and that entry is still inside the *max_age_days* window
    relative to *run_date*. Items without a canonical URL are always kept.

    Returns:
        ``(kept_items, report)``. The report carries ``input_count``,
        ``output_count``, ``removed_count``, ``removed`` (one dict per dropped
        item) and ``max_age_days``.
    """
    history = published_urls or PublishedUrls()
    kept: list[NewsItem] = []
    dropped: list[dict[str, Any]] = []
    for candidate in items:
        url = candidate.canonical_url
        previous = history.urls.get(url) if url else None
        if not (
            previous
            and _entry_within_window(previous, run_date=run_date, max_age_days=max_age_days)
        ):
            kept.append(candidate)
            continue
        dropped.append(
            {
                "item_id": candidate.id,
                "canonical_url": candidate.canonical_url,
                "title": candidate.title or candidate.title_raw,
                "first_seen": previous.first_seen,
                "last_published": previous.last_published,
            }
        )
    summary = {
        "input_count": len(items),
        "output_count": len(kept),
        "removed_count": len(dropped),
        "removed": dropped,
        "max_age_days": max_age_days,
    }
    return kept, summary

View File

@@ -14,6 +14,7 @@ class SourceConfig:
retries: int = 0 retries: int = 0
min_items: int = 0 min_items: int = 0
url: str = "" url: str = ""
max_item_age_days: int | None = None
@dataclass @dataclass
@@ -51,3 +52,17 @@ class NewsItem:
section: str | None = None section: str | None = None
quality_flags: list[str] = field(default_factory=list) quality_flags: list[str] = field(default_factory=list)
duplicate_sources: list[dict[str, Any]] = field(default_factory=list) duplicate_sources: list[dict[str, Any]] = field(default_factory=list)
@dataclass
class PublishedUrlEntry:
    """Publication record kept for one canonical URL."""

    # Date string recorded when the URL first entered the history.
    first_seen: str
    # Date string of the most recent run that published the URL.
    last_published: str
    # Titles the URL has been published under.
    titles: list[str] = field(default_factory=list)
@dataclass
class PublishedUrls:
    """History of published items, keyed by canonical URL."""

    # Version number stored with the history; defaults to 1.
    version: int = 1
    # Canonical URL -> publication record for that URL.
    urls: dict[str, PublishedUrlEntry] = field(default_factory=dict)
    # Timestamp string of the last update; empty by default.
    updated_at: str = ""

View File

@@ -5,9 +5,9 @@ from typing import Any
from .assemble import assemble_markdown from .assemble import assemble_markdown
from .classify import classify_and_order_items from .classify import classify_and_order_items
from .collect import Fetcher, collect_sources from .collect import Fetcher, collect_sources
from .dedupe import hard_dedup_items from .dedupe import cross_day_dedup_items, hard_dedup_items
from .guide import GuideLlmCall, generate_guide from .guide import GuideLlmCall, generate_guide
from .models import SourceConfig from .models import PublishedUrls, SourceConfig
from .normalize import normalize_items from .normalize import normalize_items
from .publish import BlogClient, publish_markdown from .publish import BlogClient, publish_markdown
from .rewrite import RewriteLlmCall, rewrite_items from .rewrite import RewriteLlmCall, rewrite_items
@@ -15,6 +15,7 @@ from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig: def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
max_item_age_days = value.get("max_item_age_days")
return SourceConfig( return SourceConfig(
name=value["name"], name=value["name"],
type=value["type"], type=value["type"],
@@ -26,6 +27,7 @@ def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
retries=int(value.get("retries", 0)), retries=int(value.get("retries", 0)),
min_items=int(value.get("min_items", 0)), min_items=int(value.get("min_items", 0)),
url=value.get("url", ""), url=value.get("url", ""),
max_item_age_days=int(max_item_age_days) if max_item_age_days is not None else None,
) )
@@ -58,6 +60,43 @@ def run_stage0_to_stage2(
} }
def run_stage0_to_stage2_5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
    """Run stages 0-2, then the cross-day dedupe step ("stage 2.5").

    When *cross_day_dedup_enabled* is true, items from stage 2 are filtered
    through ``cross_day_dedup_items`` against *published_urls*. Otherwise the
    items pass through untouched and a report with zero removals is
    synthesised. Either way the report ends up under ``reports["stage2_5"]``
    and includes an ``enabled`` flag.

    Returns:
        A dict with ``source_results``, ``items`` and ``reports``.
    """
    upstream = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
    candidates = upstream["items"]
    if not cross_day_dedup_enabled:
        kept = candidates
        summary = {
            "input_count": len(kept),
            "output_count": len(kept),
            "removed_count": 0,
            "removed": [],
            "enabled": False,
            "max_age_days": cross_day_dedup_max_age_days,
        }
    else:
        kept, summary = cross_day_dedup_items(
            candidates,
            published_urls,
            run_date=run_date,
            max_age_days=cross_day_dedup_max_age_days,
        )
        # The dedupe helper does not know about the switch; record it here.
        summary.setdefault("enabled", cross_day_dedup_enabled)
    reports = dict(upstream["reports"])
    reports["stage2_5"] = summary
    return {
        "source_results": upstream["source_results"],
        "items": kept,
        "reports": reports,
    }
def run_stage0_to_stage4( def run_stage0_to_stage4(
source_configs: list[dict[str, Any] | SourceConfig], source_configs: list[dict[str, Any] | SourceConfig],
run_date: str, run_date: str,
@@ -65,10 +104,25 @@ def run_stage0_to_stage4(
fetcher: Fetcher, fetcher: Fetcher,
semantic_llm_call: SemanticLlmCall, semantic_llm_call: SemanticLlmCall,
rewrite_llm_call: RewriteLlmCall, rewrite_llm_call: RewriteLlmCall,
published_urls: PublishedUrls | None = None,
cross_day_dedup_enabled: bool = True,
cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]: ) -> dict[str, Any]:
stage2_result = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher) stage2_5_result = run_stage0_to_stage2_5(
items = stage2_result["items"] source_configs,
candidates = stage2_result["reports"]["stage2"].get("possible_duplicates", []) run_date,
fetcher=fetcher,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_dedup_enabled,
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
)
items = stage2_5_result["items"]
remaining_ids = {item.id for item in items}
candidates = [
candidate
for candidate in stage2_5_result["reports"]["stage2"].get("possible_duplicates", [])
if set(candidate.get("item_ids", [])).issubset(remaining_ids)
]
semantic_items, stage3_report = semantic_dedup_items( semantic_items, stage3_report = semantic_dedup_items(
items, items,
candidates, candidates,
@@ -78,11 +132,11 @@ def run_stage0_to_stage4(
semantic_items, semantic_items,
llm_call=rewrite_llm_call, llm_call=rewrite_llm_call,
) )
reports = dict(stage2_result["reports"]) reports = dict(stage2_5_result["reports"])
reports["stage3"] = stage3_report reports["stage3"] = stage3_report
reports["stage4"] = stage4_report reports["stage4"] = stage4_report
return { return {
"source_results": stage2_result["source_results"], "source_results": stage2_5_result["source_results"],
"items": rewritten_items, "items": rewritten_items,
"reports": reports, "reports": reports,
} }
@@ -95,6 +149,9 @@ def run_stage0_to_stage5(
fetcher: Fetcher, fetcher: Fetcher,
semantic_llm_call: SemanticLlmCall, semantic_llm_call: SemanticLlmCall,
rewrite_llm_call: RewriteLlmCall, rewrite_llm_call: RewriteLlmCall,
published_urls: PublishedUrls | None = None,
cross_day_dedup_enabled: bool = True,
cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]: ) -> dict[str, Any]:
stage4_result = run_stage0_to_stage4( stage4_result = run_stage0_to_stage4(
source_configs, source_configs,
@@ -102,6 +159,9 @@ def run_stage0_to_stage5(
fetcher=fetcher, fetcher=fetcher,
semantic_llm_call=semantic_llm_call, semantic_llm_call=semantic_llm_call,
rewrite_llm_call=rewrite_llm_call, rewrite_llm_call=rewrite_llm_call,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_dedup_enabled,
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
) )
classified_items, stage5_report = classify_and_order_items(stage4_result["items"]) classified_items, stage5_report = classify_and_order_items(stage4_result["items"])
reports = dict(stage4_result["reports"]) reports = dict(stage4_result["reports"])
@@ -121,6 +181,9 @@ def run_stage0_to_stage6(
semantic_llm_call: SemanticLlmCall, semantic_llm_call: SemanticLlmCall,
rewrite_llm_call: RewriteLlmCall, rewrite_llm_call: RewriteLlmCall,
guide_llm_call: GuideLlmCall, guide_llm_call: GuideLlmCall,
published_urls: PublishedUrls | None = None,
cross_day_dedup_enabled: bool = True,
cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]: ) -> dict[str, Any]:
stage5_result = run_stage0_to_stage5( stage5_result = run_stage0_to_stage5(
source_configs, source_configs,
@@ -128,6 +191,9 @@ def run_stage0_to_stage6(
fetcher=fetcher, fetcher=fetcher,
semantic_llm_call=semantic_llm_call, semantic_llm_call=semantic_llm_call,
rewrite_llm_call=rewrite_llm_call, rewrite_llm_call=rewrite_llm_call,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_dedup_enabled,
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
) )
guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call) guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call)
reports = dict(stage5_result["reports"]) reports = dict(stage5_result["reports"])
@@ -148,6 +214,9 @@ def run_stage0_to_stage7(
semantic_llm_call: SemanticLlmCall, semantic_llm_call: SemanticLlmCall,
rewrite_llm_call: RewriteLlmCall, rewrite_llm_call: RewriteLlmCall,
guide_llm_call: GuideLlmCall, guide_llm_call: GuideLlmCall,
published_urls: PublishedUrls | None = None,
cross_day_dedup_enabled: bool = True,
cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]: ) -> dict[str, Any]:
stage6_result = run_stage0_to_stage6( stage6_result = run_stage0_to_stage6(
source_configs, source_configs,
@@ -156,6 +225,9 @@ def run_stage0_to_stage7(
semantic_llm_call=semantic_llm_call, semantic_llm_call=semantic_llm_call,
rewrite_llm_call=rewrite_llm_call, rewrite_llm_call=rewrite_llm_call,
guide_llm_call=guide_llm_call, guide_llm_call=guide_llm_call,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_dedup_enabled,
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
) )
markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"]) markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"])
upstream_blocking_errors: list[str] = [] upstream_blocking_errors: list[str] = []
@@ -187,6 +259,9 @@ def run_stage0_to_stage8(
mode: str, mode: str,
base_url: str, base_url: str,
client: BlogClient | None, client: BlogClient | None,
published_urls: PublishedUrls | None = None,
cross_day_dedup_enabled: bool = True,
cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]: ) -> dict[str, Any]:
stage7_result = run_stage0_to_stage7( stage7_result = run_stage0_to_stage7(
source_configs, source_configs,
@@ -195,6 +270,9 @@ def run_stage0_to_stage8(
semantic_llm_call=semantic_llm_call, semantic_llm_call=semantic_llm_call,
rewrite_llm_call=rewrite_llm_call, rewrite_llm_call=rewrite_llm_call,
guide_llm_call=guide_llm_call, guide_llm_call=guide_llm_call,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_dedup_enabled,
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
) )
slug = f"ai-{run_date}" slug = f"ai-{run_date}"
publish_result = publish_markdown( publish_result = publish_markdown(

View File

@@ -1,8 +1,13 @@
from __future__ import annotations from __future__ import annotations
import json
from dataclasses import dataclass from dataclasses import dataclass
from datetime import date, datetime, timezone
from pathlib import Path
from typing import Any, Protocol from typing import Any, Protocol
from .models import NewsItem, PublishedUrlEntry, PublishedUrls
@dataclass @dataclass
class PublishResult: class PublishResult:
@@ -22,6 +27,122 @@ class BlogClient(Protocol):
... ...
def _parse_date(value: str | None) -> date | None:
if not value:
return None
text = value.strip()
try:
return date.fromisoformat(text[:10])
except ValueError:
try:
return datetime.fromisoformat(text).date()
except ValueError:
return None
def _published_entry_from_dict(value: Any) -> PublishedUrlEntry | None:
if not isinstance(value, dict):
return None
first_seen = str(value.get("first_seen") or "")
last_published = str(value.get("last_published") or first_seen)
titles = [str(title) for title in value.get("titles", []) or [] if str(title)]
if not first_seen and not last_published:
return None
return PublishedUrlEntry(
first_seen=first_seen or last_published,
last_published=last_published or first_seen,
titles=titles,
)
def load_published_urls(path: Path) -> PublishedUrls:
    """Load the published-URL history stored as JSON at *path*.

    Loading is best-effort and never raises for bad file content:

    * a missing file yields an empty history;
    * a file that cannot be read or decoded also yields an empty history,
      but a warning is logged so the loss of dedupe memory is visible;
    * a top-level value that is not a dict, or a ``urls`` member that is not
      a dict, is treated as "no entries";
    * records rejected by ``_published_entry_from_dict`` are skipped;
    * a ``version`` that is not an integer falls back to 1.
    """
    import logging

    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return PublishedUrls()
    except (OSError, ValueError) as exc:
        # ValueError covers both JSON and text-decoding failures.
        logging.getLogger(__name__).warning(
            "published URL history at %s is unreadable, starting empty: %s", path, exc
        )
        return PublishedUrls()
    if not isinstance(raw, dict):
        return PublishedUrls()

    raw_urls = raw.get("urls")
    if not isinstance(raw_urls, dict):
        raw_urls = {}
    urls: dict[str, PublishedUrlEntry] = {}
    for canonical_url, value in raw_urls.items():
        if not canonical_url:
            continue
        entry = _published_entry_from_dict(value)
        if entry is not None:
            urls[str(canonical_url)] = entry

    try:
        version = int(raw.get("version") or 1)
    except (TypeError, ValueError):
        version = 1
    return PublishedUrls(
        version=version,
        urls=urls,
        updated_at=str(raw.get("updated_at") or ""),
    )
def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
if max_age_days < 0:
return True
current = _parse_date(run_date)
previous = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
if current is None or previous is None:
return True
return (current - previous).days <= max_age_days
def _published_urls_to_dict(history: PublishedUrls) -> dict[str, Any]:
return {
"version": history.version,
"urls": {
canonical_url: {
"first_seen": entry.first_seen,
"last_published": entry.last_published,
"titles": entry.titles,
}
for canonical_url, entry in sorted(history.urls.items())
},
"updated_at": history.updated_at,
}
def update_published_urls(
    path: Path,
    items: list[NewsItem],
    *,
    run_date: str,
    max_age_days: int = 7,
) -> PublishedUrls:
    """Record *items* as published on *run_date* and persist the history.

    Steps:
      1. Load the existing history from *path* (empty if absent/unreadable).
      2. Drop entries that have fallen outside the *max_age_days* window.
      3. For every item with a canonical URL, create or refresh its entry:
         ``last_published`` becomes *run_date* and the item's title is added
         to ``titles`` if not already present.
      4. Stamp ``updated_at`` with the current UTC time and write the file.

    The file is written to a sibling temporary file and then moved over
    *path*, so an interrupted run cannot leave a half-written history behind.

    Returns:
        The updated in-memory history.
    """
    history = load_published_urls(path)
    history.urls = {
        canonical_url: entry
        for canonical_url, entry in history.urls.items()
        if _entry_within_window(entry, run_date=run_date, max_age_days=max_age_days)
    }
    for item in items:
        if not item.canonical_url:
            continue
        title = item.title or item.title_raw
        entry = history.urls.get(item.canonical_url)
        if entry is None:
            entry = PublishedUrlEntry(
                first_seen=run_date,
                last_published=run_date,
                titles=[],
            )
            history.urls[item.canonical_url] = entry
        entry.last_published = run_date
        if title and title not in entry.titles:
            entry.titles.append(title)
    history.updated_at = datetime.now(timezone.utc).isoformat()

    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(_published_urls_to_dict(history), ensure_ascii=False, indent=2)
    # load_published_urls treats an undecodable file as an empty history, so a
    # truncated write would silently wipe the dedupe memory. Write the new
    # content next to the target and swap it in with a single rename instead.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    tmp_path.replace(path)
    return history
def dry_run_publish(slug: str, base_url: str) -> PublishResult: def dry_run_publish(slug: str, base_url: str) -> PublishResult:
return PublishResult( return PublishResult(
mode="dry-run", mode="dry-run",

View File

@@ -6,10 +6,11 @@ from pathlib import Path
from typing import Any from typing import Any
from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text
from .config import load_source_configs from .config import load_pipeline_config, load_source_configs
from .env import load_env, resolve_blog_token, resolve_llm_config from .env import load_env, resolve_blog_token, resolve_llm_config
from .models import SourceConfig from .models import SourceConfig
from .pipeline import run_stage0_to_stage8 from .pipeline import run_stage0_to_stage8
from .publish import load_published_urls, update_published_urls
from .sources.registry import get_source_fetcher from .sources.registry import get_source_fetcher
@@ -89,6 +90,8 @@ def run_daily_report(
out_dir: Path, out_dir: Path,
base_url: str, base_url: str,
sources_path: Path | None = None, sources_path: Path | None = None,
pipeline_path: Path | None = None,
history_path: Path | None = None,
fetch_text=None, fetch_text=None,
env: dict[str, str] | None = None, env: dict[str, str] | None = None,
llm_client_factory=OpenAICompatibleClient, llm_client_factory=OpenAICompatibleClient,
@@ -96,6 +99,15 @@ def run_daily_report(
) -> dict[str, Any]: ) -> dict[str, Any]:
fetch_text = fetch_text or default_fetch_text fetch_text = fetch_text or default_fetch_text
env = env if env is not None else load_env() env = env if env is not None else load_env()
pipeline_config_path = pipeline_path or Path("config") / "pipeline.json"
pipeline_config = load_pipeline_config(pipeline_config_path)
cross_day_config = pipeline_config.get("cross_day_dedup", {}) or {}
cross_day_enabled = bool(cross_day_config.get("enabled", True))
cross_day_max_age_days = int(cross_day_config.get("max_age_days", 7))
configured_history_path = history_path or Path(
str(cross_day_config.get("history_path") or "~/.hermes/scripts/ai_morning_out/published_urls.json")
).expanduser()
published_urls = load_published_urls(configured_history_path) if cross_day_enabled else None
if source_mode == "mock": if source_mode == "mock":
source_configs = _mock_source_configs() source_configs = _mock_source_configs()
@@ -141,8 +153,19 @@ def run_daily_report(
mode=mode, mode=mode,
base_url=base_url, base_url=base_url,
client=blog_client, client=blog_client,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_enabled,
cross_day_dedup_max_age_days=cross_day_max_age_days,
) )
if cross_day_enabled and result["publish"].mode == "publish" and result["publish"].status == "ok":
update_published_urls(
configured_history_path,
result["items"],
run_date=run_date,
max_age_days=cross_day_max_age_days,
)
run_dir = out_dir / run_date run_dir = out_dir / run_date
run_dir.mkdir(parents=True, exist_ok=True) run_dir.mkdir(parents=True, exist_ok=True)
(run_dir / "blog_markdown.md").write_text(result["markdown"], encoding="utf-8") (run_dir / "blog_markdown.md").write_text(result["markdown"], encoding="utf-8")

View File

@@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from datetime import date, datetime
from email.utils import parsedate_to_datetime from email.utils import parsedate_to_datetime
from typing import Any, Callable from typing import Any, Callable
@@ -20,16 +21,57 @@ def _parse_pubdate(value: str) -> str | None:
return None return None
def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]: def _parse_run_date(value: str | None) -> date | None:
if not value:
return None
try:
return date.fromisoformat(value[:10])
except ValueError:
return None
def _parse_iso_date(value: str | None) -> date | None:
if not value:
return None
try:
return datetime.fromisoformat(value).date()
except ValueError:
return None
def _within_max_item_age(published_at: str | None, *, run_date: str | None, max_item_age_days: int | None) -> bool:
if max_item_age_days is None:
return True
published_date = _parse_iso_date(published_at)
current_date = _parse_run_date(run_date)
if published_date is None or current_date is None:
return True
return (current_date - published_date).days <= max_item_age_days
def parse_rss_items(
config: SourceConfig,
xml_text: str,
*,
limit: int = 20,
run_date: str | None = None,
) -> list[dict[str, Any]]:
root = ET.fromstring(xml_text) root = ET.fromstring(xml_text)
channel = root.find("channel") channel = root.find("channel")
raw_items = channel.findall("item") if channel is not None else [] raw_items = channel.findall("item") if channel is not None else []
items: list[dict[str, Any]] = [] items: list[dict[str, Any]] = []
for raw in raw_items[:limit]: for raw in raw_items:
title = clean_text(raw.findtext("title") or "") title = clean_text(raw.findtext("title") or "")
if not title: if not title:
continue continue
summary = clean_text(raw.findtext("description") or "") summary = clean_text(raw.findtext("description") or "")
published_at = _parse_pubdate(raw.findtext("pubDate") or "")
if not _within_max_item_age(
published_at,
run_date=run_date,
max_item_age_days=config.max_item_age_days,
):
continue
items.append( items.append(
{ {
"source_group": config.name, "source_group": config.name,
@@ -37,15 +79,16 @@ def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) ->
"title_raw": title, "title_raw": title,
"summary_raw": summary, "summary_raw": summary,
"url": (raw.findtext("link") or "").strip(), "url": (raw.findtext("link") or "").strip(),
"published_at": _parse_pubdate(raw.findtext("pubDate") or ""), "published_at": published_at,
"origin_type": "rss", "origin_type": "rss",
"section_hint": "", "section_hint": "",
"language_hint": "en" if title.encode("utf-8").isascii() else "zh", "language_hint": "en" if title.encode("utf-8").isascii() else "zh",
} }
) )
if len(items) >= limit:
break
return items return items
def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]: def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
return parse_rss_items(config, fetch_text(config.url, config.timeout_seconds)) return parse_rss_items(config, fetch_text(config.url, config.timeout_seconds), run_date=run_date)

View File

@@ -11,6 +11,10 @@
], ],
"rewrite_batch_size": 10, "rewrite_batch_size": 10,
"semantic_dedup_max_deletion_ratio": 0.5, "semantic_dedup_max_deletion_ratio": 0.5,
"default_mode": "dry-run" "default_mode": "dry-run",
"cross_day_dedup": {
"enabled": true,
"max_age_days": 7,
"history_path": "~/.hermes/scripts/ai_morning_out/published_urls.json"
}
} }

View File

@@ -19,6 +19,7 @@
"priority": 40, "priority": 40,
"timeout_seconds": 25, "timeout_seconds": 25,
"retries": 1, "retries": 1,
"max_item_age_days": 3,
"enabled": true "enabled": true
}, },
{ {
@@ -30,6 +31,7 @@
"priority": 50, "priority": 50,
"timeout_seconds": 25, "timeout_seconds": 25,
"retries": 1, "retries": 1,
"max_item_age_days": 5,
"enabled": true "enabled": true
}, },
{ {
@@ -55,4 +57,3 @@
"enabled": true "enabled": true
} }
] ]

View File

@@ -16,6 +16,12 @@ class ConfigLoadingTests(unittest.TestCase):
self.assertEqual(configs[0].name, "AI HOT") self.assertEqual(configs[0].name, "AI HOT")
self.assertEqual(configs[0].type, "aihot") self.assertEqual(configs[0].type, "aihot")
def test_rss_configs_can_set_max_item_age_days(self):
    """The "InfoQ AI" source in config/sources.json loads with a 3-day age limit."""
    configs = load_source_configs(ROOT / "config" / "sources.json")
    by_name = {config.name: config for config in configs}
    self.assertEqual(by_name["InfoQ AI"].max_item_age_days, 3)
def test_all_configured_source_types_are_registered(self): def test_all_configured_source_types_are_registered(self):
configs = load_source_configs(ROOT / "config" / "sources.json") configs = load_source_configs(ROOT / "config" / "sources.json")

58
tests/test_rss.py Normal file
View File

@@ -0,0 +1,58 @@
import unittest
from ai_daily_report.models import SourceConfig
from ai_daily_report.sources.rss import parse_rss_items
class RssSourceTests(unittest.TestCase):
    """Tests for the per-source age filter applied by parse_rss_items."""

    def test_parse_rss_items_filters_entries_older_than_configured_age(self):
        """With a 3-day limit and run date 2026-06-08, the 1 June item is dropped."""
        config = SourceConfig(
            name="InfoQ AI",
            type="rss",
            url="https://feed.example/rss",
            max_item_age_days=3,
        )
        xml = """<?xml version="1.0"?>
<rss><channel>
<item>
<title>Fresh item</title>
<link>https://example.com/fresh</link>
<description>Fresh summary</description>
<pubDate>Sun, 07 Jun 2026 06:25:00 GMT</pubDate>
</item>
<item>
<title>Old item</title>
<link>https://example.com/old</link>
<description>Old summary</description>
<pubDate>Mon, 01 Jun 2026 06:25:00 GMT</pubDate>
</item>
</channel></rss>"""
        items = parse_rss_items(config, xml, run_date="2026-06-08")
        self.assertEqual([item["title_raw"] for item in items], ["Fresh item"])

    def test_parse_rss_items_keeps_unparseable_dates_to_avoid_false_drops(self):
        """An item whose pubDate cannot be parsed is kept rather than filtered out."""
        config = SourceConfig(
            name="InfoQ AI",
            type="rss",
            url="https://feed.example/rss",
            max_item_age_days=3,
        )
        xml = """<?xml version="1.0"?>
<rss><channel>
<item>
<title>No date item</title>
<link>https://example.com/no-date</link>
<description>No date summary</description>
<pubDate>not a date</pubDate>
</item>
</channel></rss>"""
        items = parse_rss_items(config, xml, run_date="2026-06-08")
        self.assertEqual([item["title_raw"] for item in items], ["No date item"])
if __name__ == "__main__":
unittest.main()

View File

@@ -3,6 +3,7 @@ import json
from pathlib import Path from pathlib import Path
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from ai_daily_report.publish import load_published_urls
from ai_daily_report.runner import run_daily_report from ai_daily_report.runner import run_daily_report
@@ -127,6 +128,36 @@ class RunnerTests(unittest.TestCase):
self.assertGreaterEqual(len(fake_client.prompts), 2) self.assertGreaterEqual(len(fake_client.prompts), 2)
self.assertEqual(result["reports"]["stage8"]["status"], "ok") self.assertEqual(result["reports"]["stage8"]["status"], "ok")
def test_run_daily_report_publish_updates_published_url_history(self):
    """A successful publish-mode run writes the published URLs to the history file."""

    # Minimal stand-in for the blog client; it only echoes or stores the slug.
    class FakeBlogClient:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

        def create_post(self, payload):
            return {"slug": payload["slug"]}

        def publish_post(self, slug):
            self.slug = slug

    with TemporaryDirectory() as temp_dir:
        history_path = Path(temp_dir) / "published_urls.json"
        result = run_daily_report(
            run_date="2026-06-08",
            mode="publish",
            source_mode="mock",
            llm_mode="mock",
            out_dir=Path(temp_dir) / "out",
            base_url="https://blog.example",
            env={"BLOG_SERVICE_TOKEN": "token"},
            blog_client_factory=FakeBlogClient,
            history_path=history_path,
        )
        # Read the history back while the temporary directory still exists.
        history = load_published_urls(history_path)
        self.assertEqual(result["reports"]["stage8"]["status"], "ok")
        self.assertIn("https://example.com/gpt5", history.urls)
        self.assertEqual(history.urls["https://example.com/gpt5"].last_published, "2026-06-08")
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@@ -2,6 +2,7 @@ import json
import unittest import unittest
from ai_daily_report.pipeline import run_stage0_to_stage4 from ai_daily_report.pipeline import run_stage0_to_stage4
from ai_daily_report.models import PublishedUrlEntry, PublishedUrls
class Stage0To4PipelineTests(unittest.TestCase): class Stage0To4PipelineTests(unittest.TestCase):
@@ -61,6 +62,71 @@ class Stage0To4PipelineTests(unittest.TestCase):
self.assertIn("stage4", result["reports"]) self.assertIn("stage4", result["reports"])
self.assertEqual(result["reports"]["stage4"]["rewritten_count"], 2) self.assertEqual(result["reports"]["stage4"]["rewritten_count"], 2)
def test_run_stage0_to_stage4_filters_published_urls_before_semantic_dedupe(self):
    """An item whose URL was published the day before never reaches the rewrite LLM."""
    configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
    seen_semantic_payloads = []
    seen_rewrite_payloads = []

    # Fake source: one URL that is already in the history, one that is new.
    def fetcher(config, run_date):
        return [
            {
                "title_raw": "Already published",
                "summary_raw": "Old summary",
                "url": "https://example.com/already",
                "source_label": config.name,
            },
            {
                "title_raw": "Fresh story",
                "summary_raw": "Fresh summary",
                "url": "https://example.com/fresh",
                "source_label": config.name,
            },
        ]

    # Records each prompt and reports no duplicates.
    def semantic_llm_call(prompt):
        seen_semantic_payloads.append(json.loads(prompt))
        return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})

    # Records each prompt and echoes every item back unchanged.
    def rewrite_llm_call(prompt):
        payload = json.loads(prompt)
        seen_rewrite_payloads.append(payload)
        return json.dumps(
            {
                "rewrites": [
                    {
                        "id": entry["id"],
                        "title": entry["title_raw"],
                        "summary": entry["summary_raw"],
                        "flags": [],
                    }
                    for entry in payload["items"]
                ]
            }
        )

    published_urls = PublishedUrls(
        urls={
            "https://example.com/already": PublishedUrlEntry(
                first_seen="2026-06-07",
                last_published="2026-06-07",
                titles=["Already published"],
            )
        }
    )
    result = run_stage0_to_stage4(
        configs,
        "2026-06-08",
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        published_urls=published_urls,
    )
    self.assertEqual([entry.title_raw for entry in result["items"]], ["Fresh story"])
    self.assertEqual(result["reports"]["stage2_5"]["removed_count"], 1)
    self.assertEqual([entry["title_raw"] for entry in seen_rewrite_payloads[0]["items"]], ["Fresh story"])
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@@ -1,7 +1,7 @@
import unittest import unittest
from ai_daily_report.dedupe import hard_dedup_items from ai_daily_report.dedupe import cross_day_dedup_items, hard_dedup_items
from ai_daily_report.models import NewsItem from ai_daily_report.models import NewsItem, PublishedUrlEntry, PublishedUrls
def item( def item(
@@ -58,6 +58,72 @@ class Stage2DedupeTests(unittest.TestCase):
self.assertEqual(len(report["possible_duplicates"]), 1) self.assertEqual(len(report["possible_duplicates"]), 1)
self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"}) self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"})
def test_hard_dedup_marks_lower_similarity_mixed_language_titles_as_candidates(self):
    """Loosely similar mixed-language titles are flagged, not removed.

    Both items must survive hard dedupe, and the report must list them
    together as a single possible-duplicate candidate group.
    """
    first = item(
        "a",
        "OpenAI custom chip lead Clive Chan joins Anthropic",
        "openai定制芯片核心成员clivechan跳槽至anthropic",
        "https://example.com/a",
        "https://example.com/a",
    )
    second = item(
        "b",
        "OpenAI chip core member defects to Anthropic before mass production",
        "openai芯片核心叛逃anthropic就在量产前夜",
        "https://example.com/b",
        "https://example.com/b",
    )

    kept, summary = hard_dedup_items([first, second])

    # Nothing is dropped outright ...
    self.assertEqual(len(kept), 2)
    self.assertEqual(summary["removed_count"], 0)

    # ... but the pair is reported as one candidate group.
    candidates = summary["possible_duplicates"]
    self.assertEqual(len(candidates), 1)
    self.assertEqual(set(candidates[0]["item_ids"]), {"a", "b"})
def test_cross_day_dedup_filters_recently_published_canonical_urls_only(self):
    """Only the item whose URL is in the recent history is removed.

    Three items go in: one matching a history entry published the day
    before the run, one with an unseen URL, and one with empty URL fields.
    The last two must be kept, in their original order, and the report
    must account for the single removal.
    """
    history = PublishedUrls(
        urls={
            "https://example.com/old": PublishedUrlEntry(
                first_seen="2026-06-07",
                last_published="2026-06-07",
                titles=["Old URL"],
            )
        }
    )
    candidates = [
        item("old", "Old URL", "oldurl", "https://example.com/old", "https://example.com/old"),
        item("new", "New URL", "newurl", "https://example.com/new", "https://example.com/new"),
        item("missing", "Missing URL", "missingurl", "", ""),
    ]

    kept, summary = cross_day_dedup_items(
        candidates,
        history,
        run_date="2026-06-08",
        max_age_days=7,
    )

    kept_ids = [kept_item.id for kept_item in kept]
    self.assertEqual(kept_ids, ["new", "missing"])

    # Report bookkeeping: 3 in, 2 out, 1 removed, and it is the "old" item.
    self.assertEqual(summary["input_count"], 3)
    self.assertEqual(summary["output_count"], 2)
    self.assertEqual(summary["removed_count"], 1)
    first_removed = summary["removed"][0]
    self.assertEqual(first_removed["item_id"], "old")
def test_cross_day_dedup_ignores_urls_outside_history_window(self):
    """A history entry older than ``max_age_days`` does not block an item.

    The only history entry was last published on 2026-05-01, well over
    seven days before the 2026-06-08 run, so the matching item is kept.
    """
    stale_url = "https://example.com/stale"
    stale_entry = PublishedUrlEntry(
        first_seen="2026-05-01",
        last_published="2026-05-01",
        titles=["Stale URL"],
    )
    history = PublishedUrls(urls={stale_url: stale_entry})
    candidates = [item("stale", "Stale URL", "staleurl", stale_url, stale_url)]

    kept, summary = cross_day_dedup_items(
        candidates,
        history,
        run_date="2026-06-08",
        max_age_days=7,
    )

    kept_ids = [kept_item.id for kept_item in kept]
    self.assertEqual(kept_ids, ["stale"])
    self.assertEqual(summary["removed_count"], 0)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@@ -1,6 +1,9 @@
import unittest import unittest
from pathlib import Path
from tempfile import TemporaryDirectory
from ai_daily_report.publish import publish_markdown from ai_daily_report.models import NewsItem
from ai_daily_report.publish import load_published_urls, publish_markdown, update_published_urls
class FakeBlogClient: class FakeBlogClient:
@@ -71,6 +74,46 @@ class Stage8PublishTests(unittest.TestCase):
self.assertEqual(client.published_slug, "ai-2026-06-04") self.assertEqual(client.published_slug, "ai-2026-06-04")
self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04") self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04")
def test_update_published_urls_writes_canonical_urls_for_final_items(self):
    """History is keyed by canonical URL and skips items without one.

    Writes history for two items into a temporary file, reloads it, and
    checks that the tracked URL is the canonical one (not the raw URL with
    its query string), that the empty URL is not recorded, and that the
    entry carries the run date and the item's title.
    """
    # Fields shared by both fixture items.
    shared_fields = {
        "source_group": "AI HOT",
        "source_label": "AI HOT",
        "source_role": "primary",
        "source_priority": 10,
        "summary_raw": "summary",
    }
    with_url = NewsItem(
        id="a",
        title_raw="Fresh story",
        title_norm="freshstory",
        url="https://example.com/fresh?utm_source=x",
        canonical_url="https://example.com/fresh",
        title="Fresh story",
        **shared_fields,
    )
    without_url = NewsItem(
        id="missing",
        title_raw="Missing URL",
        title_norm="missingurl",
        url="",
        canonical_url="",
        **shared_fields,
    )

    with TemporaryDirectory() as temp_dir:
        history_path = Path(temp_dir) / "published_urls.json"

        update_published_urls(
            history_path,
            [with_url, without_url],
            run_date="2026-06-08",
            max_age_days=7,
        )
        loaded = load_published_urls(history_path)

        self.assertIn("https://example.com/fresh", loaded.urls)
        self.assertNotIn("", loaded.urls)

        recorded = loaded.urls["https://example.com/fresh"]
        self.assertEqual(recorded.first_seen, "2026-06-08")
        self.assertEqual(recorded.last_published, "2026-06-08")
        self.assertEqual(recorded.titles, ["Fresh story"])
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()