fix: add cross-day dedupe

This commit is contained in:
Mimikko-zeus
2026-06-08 12:05:45 +08:00
parent 2671aee850
commit 07786e3bc0
16 changed files with 671 additions and 21 deletions

View File

@@ -17,6 +17,8 @@ def build_parser() -> argparse.ArgumentParser:
run.add_argument("--out-dir", default="runs")
run.add_argument("--base-url", default="https://blog.ephron.ren")
run.add_argument("--sources-path", default=None)
run.add_argument("--pipeline-path", default=None)
run.add_argument("--history-path", default=None)
return parser
@@ -32,6 +34,8 @@ def main(argv: list[str] | None = None) -> int:
out_dir=Path(args.out_dir),
base_url=args.base_url,
sources_path=Path(args.sources_path) if args.sources_path else None,
pipeline_path=Path(args.pipeline_path) if args.pipeline_path else None,
history_path=Path(args.history_path) if args.history_path else None,
)
return 0

View File

@@ -17,3 +17,12 @@ def load_source_configs(path: Path) -> list[SourceConfig]:
if not isinstance(raw, list):
raise ValueError("sources config must be a list")
return [_source_config_from_dict(item) for item in raw]
def load_pipeline_config(path: Path) -> dict[str, Any]:
if not path.exists():
return {}
raw = load_json(path)
if not isinstance(raw, dict):
raise ValueError("pipeline config must be an object")
return raw

View File

@@ -1,9 +1,16 @@
from __future__ import annotations
import difflib
import re
from datetime import date, datetime
from typing import Any
from .models import NewsItem
from .models import NewsItem, PublishedUrlEntry, PublishedUrls
TITLE_SIMILARITY_THRESHOLD = 0.50
TOKEN_JACCARD_THRESHOLD = 0.40
TOKEN_EDIT_DISTANCE_THRESHOLD = 0.40
def _item_score(item: NewsItem) -> int:
@@ -52,6 +59,18 @@ def _group_by_key(items: list[NewsItem], key_name: str) -> dict[str, list[NewsIt
return {key: group for key, group in groups.items() if len(group) > 1}
def _title_tokens(value: str) -> set[str]:
if not value:
return set()
return set(re.findall(r"[a-z0-9]+|[\u4e00-\u9fff]", value.lower()))
def _jaccard_similarity(left: set[str], right: set[str]) -> float:
if not left or not right:
return 0.0
return len(left & right) / len(left | right)
def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
possible: list[dict[str, Any]] = []
for index, left in enumerate(items):
@@ -59,12 +78,16 @@ def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
if not left.title_norm or not right.title_norm:
continue
ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
if ratio >= 0.65:
jaccard = _jaccard_similarity(_title_tokens(left.title_norm), _title_tokens(right.title_norm))
if ratio >= TITLE_SIMILARITY_THRESHOLD or (
ratio >= TOKEN_EDIT_DISTANCE_THRESHOLD and jaccard >= TOKEN_JACCARD_THRESHOLD
):
possible.append(
{
"item_ids": [left.id, right.id],
"reason": "title_similarity",
"similarity": round(ratio, 3),
"token_jaccard": round(jaccard, 3),
"confidence": "medium",
}
)
@@ -98,3 +121,62 @@ def hard_dedup_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, A
"possible_duplicates": _possible_duplicates(deduped),
}
return deduped, report
def _parse_date(value: str | None) -> date | None:
if not value:
return None
text = value.strip()
try:
return date.fromisoformat(text[:10])
except ValueError:
try:
return datetime.fromisoformat(text).date()
except ValueError:
return None
def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
if max_age_days < 0:
return True
current = _parse_date(run_date)
previous = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
if current is None or previous is None:
return True
return (current - previous).days <= max_age_days
def cross_day_dedup_items(
items: list[NewsItem],
published_urls: PublishedUrls | None,
*,
run_date: str,
max_age_days: int = 7,
) -> tuple[list[NewsItem], dict[str, Any]]:
history = published_urls or PublishedUrls()
deduped: list[NewsItem] = []
removed: list[dict[str, Any]] = []
for item in items:
entry = history.urls.get(item.canonical_url) if item.canonical_url else None
if entry and _entry_within_window(entry, run_date=run_date, max_age_days=max_age_days):
removed.append(
{
"item_id": item.id,
"canonical_url": item.canonical_url,
"title": item.title or item.title_raw,
"first_seen": entry.first_seen,
"last_published": entry.last_published,
}
)
continue
deduped.append(item)
report = {
"input_count": len(items),
"output_count": len(deduped),
"removed_count": len(removed),
"removed": removed,
"max_age_days": max_age_days,
}
return deduped, report

View File

@@ -14,6 +14,7 @@ class SourceConfig:
retries: int = 0
min_items: int = 0
url: str = ""
max_item_age_days: int | None = None
@dataclass
@@ -51,3 +52,17 @@ class NewsItem:
section: str | None = None
quality_flags: list[str] = field(default_factory=list)
duplicate_sources: list[dict[str, Any]] = field(default_factory=list)
@dataclass
class PublishedUrlEntry:
first_seen: str
last_published: str
titles: list[str] = field(default_factory=list)
@dataclass
class PublishedUrls:
version: int = 1
urls: dict[str, PublishedUrlEntry] = field(default_factory=dict)
updated_at: str = ""

View File

@@ -5,9 +5,9 @@ from typing import Any
from .assemble import assemble_markdown
from .classify import classify_and_order_items
from .collect import Fetcher, collect_sources
from .dedupe import hard_dedup_items
from .dedupe import cross_day_dedup_items, hard_dedup_items
from .guide import GuideLlmCall, generate_guide
from .models import SourceConfig
from .models import PublishedUrls, SourceConfig
from .normalize import normalize_items
from .publish import BlogClient, publish_markdown
from .rewrite import RewriteLlmCall, rewrite_items
@@ -15,6 +15,7 @@ from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
max_item_age_days = value.get("max_item_age_days")
return SourceConfig(
name=value["name"],
type=value["type"],
@@ -26,6 +27,7 @@ def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
retries=int(value.get("retries", 0)),
min_items=int(value.get("min_items", 0)),
url=value.get("url", ""),
max_item_age_days=int(max_item_age_days) if max_item_age_days is not None else None,
)
@@ -58,6 +60,43 @@ def run_stage0_to_stage2(
}
def run_stage0_to_stage2_5(
source_configs: list[dict[str, Any] | SourceConfig],
run_date: str,
*,
fetcher: Fetcher,
published_urls: PublishedUrls | None = None,
cross_day_dedup_enabled: bool = True,
cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
stage2_result = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
if cross_day_dedup_enabled:
items, stage2_5_report = cross_day_dedup_items(
stage2_result["items"],
published_urls,
run_date=run_date,
max_age_days=cross_day_dedup_max_age_days,
)
else:
items = stage2_result["items"]
stage2_5_report = {
"input_count": len(items),
"output_count": len(items),
"removed_count": 0,
"removed": [],
"enabled": False,
"max_age_days": cross_day_dedup_max_age_days,
}
reports = dict(stage2_result["reports"])
stage2_5_report.setdefault("enabled", cross_day_dedup_enabled)
reports["stage2_5"] = stage2_5_report
return {
"source_results": stage2_result["source_results"],
"items": items,
"reports": reports,
}
def run_stage0_to_stage4(
source_configs: list[dict[str, Any] | SourceConfig],
run_date: str,
@@ -65,10 +104,25 @@ def run_stage0_to_stage4(
fetcher: Fetcher,
semantic_llm_call: SemanticLlmCall,
rewrite_llm_call: RewriteLlmCall,
published_urls: PublishedUrls | None = None,
cross_day_dedup_enabled: bool = True,
cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
stage2_result = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
items = stage2_result["items"]
candidates = stage2_result["reports"]["stage2"].get("possible_duplicates", [])
stage2_5_result = run_stage0_to_stage2_5(
source_configs,
run_date,
fetcher=fetcher,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_dedup_enabled,
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
)
items = stage2_5_result["items"]
remaining_ids = {item.id for item in items}
candidates = [
candidate
for candidate in stage2_5_result["reports"]["stage2"].get("possible_duplicates", [])
if set(candidate.get("item_ids", [])).issubset(remaining_ids)
]
semantic_items, stage3_report = semantic_dedup_items(
items,
candidates,
@@ -78,11 +132,11 @@ def run_stage0_to_stage4(
semantic_items,
llm_call=rewrite_llm_call,
)
reports = dict(stage2_result["reports"])
reports = dict(stage2_5_result["reports"])
reports["stage3"] = stage3_report
reports["stage4"] = stage4_report
return {
"source_results": stage2_result["source_results"],
"source_results": stage2_5_result["source_results"],
"items": rewritten_items,
"reports": reports,
}
@@ -95,6 +149,9 @@ def run_stage0_to_stage5(
fetcher: Fetcher,
semantic_llm_call: SemanticLlmCall,
rewrite_llm_call: RewriteLlmCall,
published_urls: PublishedUrls | None = None,
cross_day_dedup_enabled: bool = True,
cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
stage4_result = run_stage0_to_stage4(
source_configs,
@@ -102,6 +159,9 @@ def run_stage0_to_stage5(
fetcher=fetcher,
semantic_llm_call=semantic_llm_call,
rewrite_llm_call=rewrite_llm_call,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_dedup_enabled,
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
)
classified_items, stage5_report = classify_and_order_items(stage4_result["items"])
reports = dict(stage4_result["reports"])
@@ -121,6 +181,9 @@ def run_stage0_to_stage6(
semantic_llm_call: SemanticLlmCall,
rewrite_llm_call: RewriteLlmCall,
guide_llm_call: GuideLlmCall,
published_urls: PublishedUrls | None = None,
cross_day_dedup_enabled: bool = True,
cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
stage5_result = run_stage0_to_stage5(
source_configs,
@@ -128,6 +191,9 @@ def run_stage0_to_stage6(
fetcher=fetcher,
semantic_llm_call=semantic_llm_call,
rewrite_llm_call=rewrite_llm_call,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_dedup_enabled,
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
)
guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call)
reports = dict(stage5_result["reports"])
@@ -148,6 +214,9 @@ def run_stage0_to_stage7(
semantic_llm_call: SemanticLlmCall,
rewrite_llm_call: RewriteLlmCall,
guide_llm_call: GuideLlmCall,
published_urls: PublishedUrls | None = None,
cross_day_dedup_enabled: bool = True,
cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
stage6_result = run_stage0_to_stage6(
source_configs,
@@ -156,6 +225,9 @@ def run_stage0_to_stage7(
semantic_llm_call=semantic_llm_call,
rewrite_llm_call=rewrite_llm_call,
guide_llm_call=guide_llm_call,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_dedup_enabled,
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
)
markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"])
upstream_blocking_errors: list[str] = []
@@ -187,6 +259,9 @@ def run_stage0_to_stage8(
mode: str,
base_url: str,
client: BlogClient | None,
published_urls: PublishedUrls | None = None,
cross_day_dedup_enabled: bool = True,
cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
stage7_result = run_stage0_to_stage7(
source_configs,
@@ -195,6 +270,9 @@ def run_stage0_to_stage8(
semantic_llm_call=semantic_llm_call,
rewrite_llm_call=rewrite_llm_call,
guide_llm_call=guide_llm_call,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_dedup_enabled,
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
)
slug = f"ai-{run_date}"
publish_result = publish_markdown(

View File

@@ -1,8 +1,13 @@
from __future__ import annotations
import json
from dataclasses import dataclass
from datetime import date, datetime, timezone
from pathlib import Path
from typing import Any, Protocol
from .models import NewsItem, PublishedUrlEntry, PublishedUrls
@dataclass
class PublishResult:
@@ -22,6 +27,122 @@ class BlogClient(Protocol):
...
def _parse_date(value: str | None) -> date | None:
if not value:
return None
text = value.strip()
try:
return date.fromisoformat(text[:10])
except ValueError:
try:
return datetime.fromisoformat(text).date()
except ValueError:
return None
def _published_entry_from_dict(value: Any) -> PublishedUrlEntry | None:
if not isinstance(value, dict):
return None
first_seen = str(value.get("first_seen") or "")
last_published = str(value.get("last_published") or first_seen)
titles = [str(title) for title in value.get("titles", []) or [] if str(title)]
if not first_seen and not last_published:
return None
return PublishedUrlEntry(
first_seen=first_seen or last_published,
last_published=last_published or first_seen,
titles=titles,
)
def load_published_urls(path: Path) -> PublishedUrls:
if not path.exists():
return PublishedUrls()
try:
raw = json.loads(path.read_text(encoding="utf-8"))
except Exception:
return PublishedUrls()
if not isinstance(raw, dict):
return PublishedUrls()
urls: dict[str, PublishedUrlEntry] = {}
for canonical_url, value in (raw.get("urls") or {}).items():
if not canonical_url:
continue
entry = _published_entry_from_dict(value)
if entry is not None:
urls[str(canonical_url)] = entry
return PublishedUrls(
version=int(raw.get("version") or 1),
urls=urls,
updated_at=str(raw.get("updated_at") or ""),
)
def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
if max_age_days < 0:
return True
current = _parse_date(run_date)
previous = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
if current is None or previous is None:
return True
return (current - previous).days <= max_age_days
def _published_urls_to_dict(history: PublishedUrls) -> dict[str, Any]:
return {
"version": history.version,
"urls": {
canonical_url: {
"first_seen": entry.first_seen,
"last_published": entry.last_published,
"titles": entry.titles,
}
for canonical_url, entry in sorted(history.urls.items())
},
"updated_at": history.updated_at,
}
def update_published_urls(
path: Path,
items: list[NewsItem],
*,
run_date: str,
max_age_days: int = 7,
) -> PublishedUrls:
history = load_published_urls(path)
history.urls = {
canonical_url: entry
for canonical_url, entry in history.urls.items()
if _entry_within_window(entry, run_date=run_date, max_age_days=max_age_days)
}
for item in items:
if not item.canonical_url:
continue
title = item.title or item.title_raw
entry = history.urls.get(item.canonical_url)
if entry is None:
entry = PublishedUrlEntry(
first_seen=run_date,
last_published=run_date,
titles=[],
)
history.urls[item.canonical_url] = entry
entry.last_published = run_date
if title and title not in entry.titles:
entry.titles.append(title)
history.updated_at = datetime.now(timezone.utc).isoformat()
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(
json.dumps(_published_urls_to_dict(history), ensure_ascii=False, indent=2),
encoding="utf-8",
)
return history
def dry_run_publish(slug: str, base_url: str) -> PublishResult:
return PublishResult(
mode="dry-run",

View File

@@ -6,10 +6,11 @@ from pathlib import Path
from typing import Any
from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text
from .config import load_source_configs
from .config import load_pipeline_config, load_source_configs
from .env import load_env, resolve_blog_token, resolve_llm_config
from .models import SourceConfig
from .pipeline import run_stage0_to_stage8
from .publish import load_published_urls, update_published_urls
from .sources.registry import get_source_fetcher
@@ -89,6 +90,8 @@ def run_daily_report(
out_dir: Path,
base_url: str,
sources_path: Path | None = None,
pipeline_path: Path | None = None,
history_path: Path | None = None,
fetch_text=None,
env: dict[str, str] | None = None,
llm_client_factory=OpenAICompatibleClient,
@@ -96,6 +99,15 @@ def run_daily_report(
) -> dict[str, Any]:
fetch_text = fetch_text or default_fetch_text
env = env if env is not None else load_env()
pipeline_config_path = pipeline_path or Path("config") / "pipeline.json"
pipeline_config = load_pipeline_config(pipeline_config_path)
cross_day_config = pipeline_config.get("cross_day_dedup", {}) or {}
cross_day_enabled = bool(cross_day_config.get("enabled", True))
cross_day_max_age_days = int(cross_day_config.get("max_age_days", 7))
configured_history_path = history_path or Path(
str(cross_day_config.get("history_path") or "~/.hermes/scripts/ai_morning_out/published_urls.json")
).expanduser()
published_urls = load_published_urls(configured_history_path) if cross_day_enabled else None
if source_mode == "mock":
source_configs = _mock_source_configs()
@@ -141,8 +153,19 @@ def run_daily_report(
mode=mode,
base_url=base_url,
client=blog_client,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_enabled,
cross_day_dedup_max_age_days=cross_day_max_age_days,
)
if cross_day_enabled and result["publish"].mode == "publish" and result["publish"].status == "ok":
update_published_urls(
configured_history_path,
result["items"],
run_date=run_date,
max_age_days=cross_day_max_age_days,
)
run_dir = out_dir / run_date
run_dir.mkdir(parents=True, exist_ok=True)
(run_dir / "blog_markdown.md").write_text(result["markdown"], encoding="utf-8")

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import xml.etree.ElementTree as ET
from datetime import date, datetime
from email.utils import parsedate_to_datetime
from typing import Any, Callable
@@ -20,16 +21,57 @@ def _parse_pubdate(value: str) -> str | None:
return None
def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]:
def _parse_run_date(value: str | None) -> date | None:
if not value:
return None
try:
return date.fromisoformat(value[:10])
except ValueError:
return None
def _parse_iso_date(value: str | None) -> date | None:
if not value:
return None
try:
return datetime.fromisoformat(value).date()
except ValueError:
return None
def _within_max_item_age(published_at: str | None, *, run_date: str | None, max_item_age_days: int | None) -> bool:
if max_item_age_days is None:
return True
published_date = _parse_iso_date(published_at)
current_date = _parse_run_date(run_date)
if published_date is None or current_date is None:
return True
return (current_date - published_date).days <= max_item_age_days
def parse_rss_items(
config: SourceConfig,
xml_text: str,
*,
limit: int = 20,
run_date: str | None = None,
) -> list[dict[str, Any]]:
root = ET.fromstring(xml_text)
channel = root.find("channel")
raw_items = channel.findall("item") if channel is not None else []
items: list[dict[str, Any]] = []
for raw in raw_items[:limit]:
for raw in raw_items:
title = clean_text(raw.findtext("title") or "")
if not title:
continue
summary = clean_text(raw.findtext("description") or "")
published_at = _parse_pubdate(raw.findtext("pubDate") or "")
if not _within_max_item_age(
published_at,
run_date=run_date,
max_item_age_days=config.max_item_age_days,
):
continue
items.append(
{
"source_group": config.name,
@@ -37,15 +79,16 @@ def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) ->
"title_raw": title,
"summary_raw": summary,
"url": (raw.findtext("link") or "").strip(),
"published_at": _parse_pubdate(raw.findtext("pubDate") or ""),
"published_at": published_at,
"origin_type": "rss",
"section_hint": "",
"language_hint": "en" if title.encode("utf-8").isascii() else "zh",
}
)
if len(items) >= limit:
break
return items
def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
return parse_rss_items(config, fetch_text(config.url, config.timeout_seconds))
return parse_rss_items(config, fetch_text(config.url, config.timeout_seconds), run_date=run_date)