fix: add cross-day dedupe
This commit is contained in:
@@ -17,6 +17,8 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
run.add_argument("--out-dir", default="runs")
|
run.add_argument("--out-dir", default="runs")
|
||||||
run.add_argument("--base-url", default="https://blog.ephron.ren")
|
run.add_argument("--base-url", default="https://blog.ephron.ren")
|
||||||
run.add_argument("--sources-path", default=None)
|
run.add_argument("--sources-path", default=None)
|
||||||
|
run.add_argument("--pipeline-path", default=None)
|
||||||
|
run.add_argument("--history-path", default=None)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
@@ -32,6 +34,8 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
out_dir=Path(args.out_dir),
|
out_dir=Path(args.out_dir),
|
||||||
base_url=args.base_url,
|
base_url=args.base_url,
|
||||||
sources_path=Path(args.sources_path) if args.sources_path else None,
|
sources_path=Path(args.sources_path) if args.sources_path else None,
|
||||||
|
pipeline_path=Path(args.pipeline_path) if args.pipeline_path else None,
|
||||||
|
history_path=Path(args.history_path) if args.history_path else None,
|
||||||
)
|
)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|||||||
@@ -17,3 +17,12 @@ def load_source_configs(path: Path) -> list[SourceConfig]:
|
|||||||
if not isinstance(raw, list):
|
if not isinstance(raw, list):
|
||||||
raise ValueError("sources config must be a list")
|
raise ValueError("sources config must be a list")
|
||||||
return [_source_config_from_dict(item) for item in raw]
|
return [_source_config_from_dict(item) for item in raw]
|
||||||
|
|
||||||
|
|
||||||
|
def load_pipeline_config(path: Path) -> dict[str, Any]:
    """Return the pipeline configuration stored at ``path``.

    A missing file is not an error: an empty dict is returned so callers can
    fall back to their own defaults.

    Raises:
        ValueError: if the value produced by ``load_json`` is not a dict.
    """
    if path.exists():
        config = load_json(path)
        if isinstance(config, dict):
            return config
        raise ValueError("pipeline config must be an object")
    return {}
|
||||||
|
|||||||
@@ -1,9 +1,16 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import difflib
|
import difflib
|
||||||
|
import re
|
||||||
|
from datetime import date, datetime
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from .models import NewsItem
|
from .models import NewsItem, PublishedUrlEntry, PublishedUrls
|
||||||
|
|
||||||
|
|
||||||
|
TITLE_SIMILARITY_THRESHOLD = 0.50
|
||||||
|
TOKEN_JACCARD_THRESHOLD = 0.40
|
||||||
|
TOKEN_EDIT_DISTANCE_THRESHOLD = 0.40
|
||||||
|
|
||||||
|
|
||||||
def _item_score(item: NewsItem) -> int:
|
def _item_score(item: NewsItem) -> int:
|
||||||
@@ -52,6 +59,18 @@ def _group_by_key(items: list[NewsItem], key_name: str) -> dict[str, list[NewsIt
|
|||||||
return {key: group for key, group in groups.items() if len(group) > 1}
|
return {key: group for key, group in groups.items() if len(group) > 1}
|
||||||
|
|
||||||
|
|
||||||
|
def _title_tokens(value: str) -> set[str]:
|
||||||
|
if not value:
|
||||||
|
return set()
|
||||||
|
return set(re.findall(r"[a-z0-9]+|[\u4e00-\u9fff]", value.lower()))
|
||||||
|
|
||||||
|
|
||||||
|
def _jaccard_similarity(left: set[str], right: set[str]) -> float:
|
||||||
|
if not left or not right:
|
||||||
|
return 0.0
|
||||||
|
return len(left & right) / len(left | right)
|
||||||
|
|
||||||
|
|
||||||
def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
|
def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
|
||||||
possible: list[dict[str, Any]] = []
|
possible: list[dict[str, Any]] = []
|
||||||
for index, left in enumerate(items):
|
for index, left in enumerate(items):
|
||||||
@@ -59,12 +78,16 @@ def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
|
|||||||
if not left.title_norm or not right.title_norm:
|
if not left.title_norm or not right.title_norm:
|
||||||
continue
|
continue
|
||||||
ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
|
ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
|
||||||
if ratio >= 0.65:
|
jaccard = _jaccard_similarity(_title_tokens(left.title_norm), _title_tokens(right.title_norm))
|
||||||
|
if ratio >= TITLE_SIMILARITY_THRESHOLD or (
|
||||||
|
ratio >= TOKEN_EDIT_DISTANCE_THRESHOLD and jaccard >= TOKEN_JACCARD_THRESHOLD
|
||||||
|
):
|
||||||
possible.append(
|
possible.append(
|
||||||
{
|
{
|
||||||
"item_ids": [left.id, right.id],
|
"item_ids": [left.id, right.id],
|
||||||
"reason": "title_similarity",
|
"reason": "title_similarity",
|
||||||
"similarity": round(ratio, 3),
|
"similarity": round(ratio, 3),
|
||||||
|
"token_jaccard": round(jaccard, 3),
|
||||||
"confidence": "medium",
|
"confidence": "medium",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -98,3 +121,62 @@ def hard_dedup_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, A
|
|||||||
"possible_duplicates": _possible_duplicates(deduped),
|
"possible_duplicates": _possible_duplicates(deduped),
|
||||||
}
|
}
|
||||||
return deduped, report
|
return deduped, report
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_date(value: str | None) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
text = value.strip()
|
||||||
|
try:
|
||||||
|
return date.fromisoformat(text[:10])
|
||||||
|
except ValueError:
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(text).date()
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
|
||||||
|
if max_age_days < 0:
|
||||||
|
return True
|
||||||
|
current = _parse_date(run_date)
|
||||||
|
previous = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
|
||||||
|
if current is None or previous is None:
|
||||||
|
return True
|
||||||
|
return (current - previous).days <= max_age_days
|
||||||
|
|
||||||
|
|
||||||
|
def cross_day_dedup_items(
    items: list[NewsItem],
    published_urls: PublishedUrls | None,
    *,
    run_date: str,
    max_age_days: int = 7,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Drop items whose canonical URL already appears in the publish history.

    An item is removed when its ``canonical_url`` has a history entry that
    ``_entry_within_window`` accepts for ``run_date`` / ``max_age_days``.
    Items without a canonical URL are always kept.

    Returns:
        ``(kept_items, report)`` where the report carries the input/output/
        removed counts, one record per removed item and the ``max_age_days``
        that was applied.
    """
    history = published_urls if published_urls else PublishedUrls()
    kept: list[NewsItem] = []
    dropped: list[dict[str, Any]] = []

    for item in items:
        previous = history.urls.get(item.canonical_url) if item.canonical_url else None
        seen_recently = bool(previous) and _entry_within_window(
            previous,
            run_date=run_date,
            max_age_days=max_age_days,
        )
        if not seen_recently:
            kept.append(item)
            continue
        dropped.append(
            {
                "item_id": item.id,
                "canonical_url": item.canonical_url,
                "title": item.title or item.title_raw,
                "first_seen": previous.first_seen,
                "last_published": previous.last_published,
            }
        )

    report = {
        "input_count": len(items),
        "output_count": len(kept),
        "removed_count": len(dropped),
        "removed": dropped,
        "max_age_days": max_age_days,
    }
    return kept, report
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ class SourceConfig:
|
|||||||
retries: int = 0
|
retries: int = 0
|
||||||
min_items: int = 0
|
min_items: int = 0
|
||||||
url: str = ""
|
url: str = ""
|
||||||
|
max_item_age_days: int | None = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -51,3 +52,17 @@ class NewsItem:
|
|||||||
section: str | None = None
|
section: str | None = None
|
||||||
quality_flags: list[str] = field(default_factory=list)
|
quality_flags: list[str] = field(default_factory=list)
|
||||||
duplicate_sources: list[dict[str, Any]] = field(default_factory=list)
|
duplicate_sources: list[dict[str, Any]] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PublishedUrlEntry:
    """Publish-history record kept for one canonical URL."""

    # Date string stored when the URL was first recorded.
    first_seen: str
    # Date string of the most recent run that published the URL.
    last_published: str
    # Titles recorded for the URL; each instance gets its own list.
    titles: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PublishedUrls:
    """In-memory form of the published-URL history document."""

    # Version number carried in the history document.
    version: int = 1
    # History entries keyed by canonical URL; each instance gets its own dict.
    urls: dict[str, PublishedUrlEntry] = field(default_factory=dict)
    # Timestamp string of the last update; empty when none was recorded.
    updated_at: str = ""
|
||||||
|
|||||||
@@ -5,9 +5,9 @@ from typing import Any
|
|||||||
from .assemble import assemble_markdown
|
from .assemble import assemble_markdown
|
||||||
from .classify import classify_and_order_items
|
from .classify import classify_and_order_items
|
||||||
from .collect import Fetcher, collect_sources
|
from .collect import Fetcher, collect_sources
|
||||||
from .dedupe import hard_dedup_items
|
from .dedupe import cross_day_dedup_items, hard_dedup_items
|
||||||
from .guide import GuideLlmCall, generate_guide
|
from .guide import GuideLlmCall, generate_guide
|
||||||
from .models import SourceConfig
|
from .models import PublishedUrls, SourceConfig
|
||||||
from .normalize import normalize_items
|
from .normalize import normalize_items
|
||||||
from .publish import BlogClient, publish_markdown
|
from .publish import BlogClient, publish_markdown
|
||||||
from .rewrite import RewriteLlmCall, rewrite_items
|
from .rewrite import RewriteLlmCall, rewrite_items
|
||||||
@@ -15,6 +15,7 @@ from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
|
|||||||
|
|
||||||
|
|
||||||
def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
|
def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
|
||||||
|
max_item_age_days = value.get("max_item_age_days")
|
||||||
return SourceConfig(
|
return SourceConfig(
|
||||||
name=value["name"],
|
name=value["name"],
|
||||||
type=value["type"],
|
type=value["type"],
|
||||||
@@ -26,6 +27,7 @@ def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
|
|||||||
retries=int(value.get("retries", 0)),
|
retries=int(value.get("retries", 0)),
|
||||||
min_items=int(value.get("min_items", 0)),
|
min_items=int(value.get("min_items", 0)),
|
||||||
url=value.get("url", ""),
|
url=value.get("url", ""),
|
||||||
|
max_item_age_days=int(max_item_age_days) if max_item_age_days is not None else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -58,6 +60,43 @@ def run_stage0_to_stage2(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def run_stage0_to_stage2_5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
    """Run stages 0-2, then the optional cross-day dedup step ("stage 2.5").

    When ``cross_day_dedup_enabled`` is false the stage-2 items pass through
    untouched and a report with ``"enabled": False`` and zero removals is
    recorded instead.

    Returns:
        A dict with the stage-2 ``source_results``, the surviving ``items``
        and a copy of the stage-2 ``reports`` extended with a ``"stage2_5"``
        entry.
    """
    stage2_result = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)

    if not cross_day_dedup_enabled:
        items = stage2_result["items"]
        untouched_count = len(items)
        stage2_5_report = {
            "input_count": untouched_count,
            "output_count": untouched_count,
            "removed_count": 0,
            "removed": [],
            "enabled": False,
            "max_age_days": cross_day_dedup_max_age_days,
        }
    else:
        items, stage2_5_report = cross_day_dedup_items(
            stage2_result["items"],
            published_urls,
            run_date=run_date,
            max_age_days=cross_day_dedup_max_age_days,
        )
        # Only fills the flag in when the dedup report did not set it itself.
        stage2_5_report.setdefault("enabled", cross_day_dedup_enabled)

    # Copy so the stage-2 result's own reports mapping is left unmodified.
    reports = dict(stage2_result["reports"])
    reports["stage2_5"] = stage2_5_report
    return {
        "source_results": stage2_result["source_results"],
        "items": items,
        "reports": reports,
    }
|
||||||
|
|
||||||
|
|
||||||
def run_stage0_to_stage4(
|
def run_stage0_to_stage4(
|
||||||
source_configs: list[dict[str, Any] | SourceConfig],
|
source_configs: list[dict[str, Any] | SourceConfig],
|
||||||
run_date: str,
|
run_date: str,
|
||||||
@@ -65,10 +104,25 @@ def run_stage0_to_stage4(
|
|||||||
fetcher: Fetcher,
|
fetcher: Fetcher,
|
||||||
semantic_llm_call: SemanticLlmCall,
|
semantic_llm_call: SemanticLlmCall,
|
||||||
rewrite_llm_call: RewriteLlmCall,
|
rewrite_llm_call: RewriteLlmCall,
|
||||||
|
published_urls: PublishedUrls | None = None,
|
||||||
|
cross_day_dedup_enabled: bool = True,
|
||||||
|
cross_day_dedup_max_age_days: int = 7,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
stage2_result = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
|
stage2_5_result = run_stage0_to_stage2_5(
|
||||||
items = stage2_result["items"]
|
source_configs,
|
||||||
candidates = stage2_result["reports"]["stage2"].get("possible_duplicates", [])
|
run_date,
|
||||||
|
fetcher=fetcher,
|
||||||
|
published_urls=published_urls,
|
||||||
|
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||||
|
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||||
|
)
|
||||||
|
items = stage2_5_result["items"]
|
||||||
|
remaining_ids = {item.id for item in items}
|
||||||
|
candidates = [
|
||||||
|
candidate
|
||||||
|
for candidate in stage2_5_result["reports"]["stage2"].get("possible_duplicates", [])
|
||||||
|
if set(candidate.get("item_ids", [])).issubset(remaining_ids)
|
||||||
|
]
|
||||||
semantic_items, stage3_report = semantic_dedup_items(
|
semantic_items, stage3_report = semantic_dedup_items(
|
||||||
items,
|
items,
|
||||||
candidates,
|
candidates,
|
||||||
@@ -78,11 +132,11 @@ def run_stage0_to_stage4(
|
|||||||
semantic_items,
|
semantic_items,
|
||||||
llm_call=rewrite_llm_call,
|
llm_call=rewrite_llm_call,
|
||||||
)
|
)
|
||||||
reports = dict(stage2_result["reports"])
|
reports = dict(stage2_5_result["reports"])
|
||||||
reports["stage3"] = stage3_report
|
reports["stage3"] = stage3_report
|
||||||
reports["stage4"] = stage4_report
|
reports["stage4"] = stage4_report
|
||||||
return {
|
return {
|
||||||
"source_results": stage2_result["source_results"],
|
"source_results": stage2_5_result["source_results"],
|
||||||
"items": rewritten_items,
|
"items": rewritten_items,
|
||||||
"reports": reports,
|
"reports": reports,
|
||||||
}
|
}
|
||||||
@@ -95,6 +149,9 @@ def run_stage0_to_stage5(
|
|||||||
fetcher: Fetcher,
|
fetcher: Fetcher,
|
||||||
semantic_llm_call: SemanticLlmCall,
|
semantic_llm_call: SemanticLlmCall,
|
||||||
rewrite_llm_call: RewriteLlmCall,
|
rewrite_llm_call: RewriteLlmCall,
|
||||||
|
published_urls: PublishedUrls | None = None,
|
||||||
|
cross_day_dedup_enabled: bool = True,
|
||||||
|
cross_day_dedup_max_age_days: int = 7,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
stage4_result = run_stage0_to_stage4(
|
stage4_result = run_stage0_to_stage4(
|
||||||
source_configs,
|
source_configs,
|
||||||
@@ -102,6 +159,9 @@ def run_stage0_to_stage5(
|
|||||||
fetcher=fetcher,
|
fetcher=fetcher,
|
||||||
semantic_llm_call=semantic_llm_call,
|
semantic_llm_call=semantic_llm_call,
|
||||||
rewrite_llm_call=rewrite_llm_call,
|
rewrite_llm_call=rewrite_llm_call,
|
||||||
|
published_urls=published_urls,
|
||||||
|
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||||
|
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||||
)
|
)
|
||||||
classified_items, stage5_report = classify_and_order_items(stage4_result["items"])
|
classified_items, stage5_report = classify_and_order_items(stage4_result["items"])
|
||||||
reports = dict(stage4_result["reports"])
|
reports = dict(stage4_result["reports"])
|
||||||
@@ -121,6 +181,9 @@ def run_stage0_to_stage6(
|
|||||||
semantic_llm_call: SemanticLlmCall,
|
semantic_llm_call: SemanticLlmCall,
|
||||||
rewrite_llm_call: RewriteLlmCall,
|
rewrite_llm_call: RewriteLlmCall,
|
||||||
guide_llm_call: GuideLlmCall,
|
guide_llm_call: GuideLlmCall,
|
||||||
|
published_urls: PublishedUrls | None = None,
|
||||||
|
cross_day_dedup_enabled: bool = True,
|
||||||
|
cross_day_dedup_max_age_days: int = 7,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
stage5_result = run_stage0_to_stage5(
|
stage5_result = run_stage0_to_stage5(
|
||||||
source_configs,
|
source_configs,
|
||||||
@@ -128,6 +191,9 @@ def run_stage0_to_stage6(
|
|||||||
fetcher=fetcher,
|
fetcher=fetcher,
|
||||||
semantic_llm_call=semantic_llm_call,
|
semantic_llm_call=semantic_llm_call,
|
||||||
rewrite_llm_call=rewrite_llm_call,
|
rewrite_llm_call=rewrite_llm_call,
|
||||||
|
published_urls=published_urls,
|
||||||
|
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||||
|
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||||
)
|
)
|
||||||
guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call)
|
guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call)
|
||||||
reports = dict(stage5_result["reports"])
|
reports = dict(stage5_result["reports"])
|
||||||
@@ -148,6 +214,9 @@ def run_stage0_to_stage7(
|
|||||||
semantic_llm_call: SemanticLlmCall,
|
semantic_llm_call: SemanticLlmCall,
|
||||||
rewrite_llm_call: RewriteLlmCall,
|
rewrite_llm_call: RewriteLlmCall,
|
||||||
guide_llm_call: GuideLlmCall,
|
guide_llm_call: GuideLlmCall,
|
||||||
|
published_urls: PublishedUrls | None = None,
|
||||||
|
cross_day_dedup_enabled: bool = True,
|
||||||
|
cross_day_dedup_max_age_days: int = 7,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
stage6_result = run_stage0_to_stage6(
|
stage6_result = run_stage0_to_stage6(
|
||||||
source_configs,
|
source_configs,
|
||||||
@@ -156,6 +225,9 @@ def run_stage0_to_stage7(
|
|||||||
semantic_llm_call=semantic_llm_call,
|
semantic_llm_call=semantic_llm_call,
|
||||||
rewrite_llm_call=rewrite_llm_call,
|
rewrite_llm_call=rewrite_llm_call,
|
||||||
guide_llm_call=guide_llm_call,
|
guide_llm_call=guide_llm_call,
|
||||||
|
published_urls=published_urls,
|
||||||
|
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||||
|
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||||
)
|
)
|
||||||
markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"])
|
markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"])
|
||||||
upstream_blocking_errors: list[str] = []
|
upstream_blocking_errors: list[str] = []
|
||||||
@@ -187,6 +259,9 @@ def run_stage0_to_stage8(
|
|||||||
mode: str,
|
mode: str,
|
||||||
base_url: str,
|
base_url: str,
|
||||||
client: BlogClient | None,
|
client: BlogClient | None,
|
||||||
|
published_urls: PublishedUrls | None = None,
|
||||||
|
cross_day_dedup_enabled: bool = True,
|
||||||
|
cross_day_dedup_max_age_days: int = 7,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
stage7_result = run_stage0_to_stage7(
|
stage7_result = run_stage0_to_stage7(
|
||||||
source_configs,
|
source_configs,
|
||||||
@@ -195,6 +270,9 @@ def run_stage0_to_stage8(
|
|||||||
semantic_llm_call=semantic_llm_call,
|
semantic_llm_call=semantic_llm_call,
|
||||||
rewrite_llm_call=rewrite_llm_call,
|
rewrite_llm_call=rewrite_llm_call,
|
||||||
guide_llm_call=guide_llm_call,
|
guide_llm_call=guide_llm_call,
|
||||||
|
published_urls=published_urls,
|
||||||
|
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||||
|
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||||
)
|
)
|
||||||
slug = f"ai-{run_date}"
|
slug = f"ai-{run_date}"
|
||||||
publish_result = publish_markdown(
|
publish_result = publish_markdown(
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from datetime import date, datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
from typing import Any, Protocol
|
from typing import Any, Protocol
|
||||||
|
|
||||||
|
from .models import NewsItem, PublishedUrlEntry, PublishedUrls
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class PublishResult:
|
class PublishResult:
|
||||||
@@ -22,6 +27,122 @@ class BlogClient(Protocol):
|
|||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_date(value: str | None) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
text = value.strip()
|
||||||
|
try:
|
||||||
|
return date.fromisoformat(text[:10])
|
||||||
|
except ValueError:
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(text).date()
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _published_entry_from_dict(value: Any) -> PublishedUrlEntry | None:
|
||||||
|
if not isinstance(value, dict):
|
||||||
|
return None
|
||||||
|
first_seen = str(value.get("first_seen") or "")
|
||||||
|
last_published = str(value.get("last_published") or first_seen)
|
||||||
|
titles = [str(title) for title in value.get("titles", []) or [] if str(title)]
|
||||||
|
if not first_seen and not last_published:
|
||||||
|
return None
|
||||||
|
return PublishedUrlEntry(
|
||||||
|
first_seen=first_seen or last_published,
|
||||||
|
last_published=last_published or first_seen,
|
||||||
|
titles=titles,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_published_urls(path: Path) -> PublishedUrls:
    """Load the published-URL history from ``path`` on a best-effort basis.

    Never raises for bad content: a missing, unreadable or undecodable file,
    or a document whose top level is not an object, yields an empty
    ``PublishedUrls``. Inside an otherwise valid document, a ``urls`` value
    that is not an object is treated as empty, records rejected by
    ``_published_entry_from_dict`` are skipped, and a ``version`` that cannot
    be converted to ``int`` falls back to ``1``.
    """
    if not path.exists():
        return PublishedUrls()
    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        # Deliberately broad: an unreadable history must not abort the run.
        return PublishedUrls()
    if not isinstance(raw, dict):
        return PublishedUrls()

    raw_urls = raw.get("urls")
    if not isinstance(raw_urls, dict):
        # e.g. a list or string here would otherwise fail on ``.items()``.
        raw_urls = {}

    urls: dict[str, PublishedUrlEntry] = {}
    for canonical_url, value in raw_urls.items():
        if not canonical_url:
            continue
        entry = _published_entry_from_dict(value)
        if entry is not None:
            urls[str(canonical_url)] = entry

    try:
        version = int(raw.get("version") or 1)
    except (TypeError, ValueError, OverflowError):
        version = 1

    return PublishedUrls(
        version=version,
        urls=urls,
        updated_at=str(raw.get("updated_at") or ""),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
|
||||||
|
if max_age_days < 0:
|
||||||
|
return True
|
||||||
|
current = _parse_date(run_date)
|
||||||
|
previous = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
|
||||||
|
if current is None or previous is None:
|
||||||
|
return True
|
||||||
|
return (current - previous).days <= max_age_days
|
||||||
|
|
||||||
|
|
||||||
|
def _published_urls_to_dict(history: PublishedUrls) -> dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"version": history.version,
|
||||||
|
"urls": {
|
||||||
|
canonical_url: {
|
||||||
|
"first_seen": entry.first_seen,
|
||||||
|
"last_published": entry.last_published,
|
||||||
|
"titles": entry.titles,
|
||||||
|
}
|
||||||
|
for canonical_url, entry in sorted(history.urls.items())
|
||||||
|
},
|
||||||
|
"updated_at": history.updated_at,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def update_published_urls(
    path: Path,
    items: list[NewsItem],
    *,
    run_date: str,
    max_age_days: int = 7,
) -> PublishedUrls:
    """Record ``items`` as published on ``run_date`` and persist the history.

    Steps:
      1. Load the existing history from ``path``.
      2. Drop entries that ``_entry_within_window`` no longer accepts for
         ``run_date`` / ``max_age_days``.
      3. For every item with a canonical URL, create or refresh its entry
         (``last_published`` becomes ``run_date``) and remember its title.
      4. Stamp ``updated_at`` with the current UTC time and write the JSON
         document back, creating parent directories as needed.

    The document is written to a temporary sibling file and then moved over
    ``path``, so an interrupted write cannot leave a truncated history file
    behind (the loader would read such a file as an empty history).

    Returns:
        The updated ``PublishedUrls`` object that was written.
    """
    history = load_published_urls(path)
    history.urls = {
        canonical_url: entry
        for canonical_url, entry in history.urls.items()
        if _entry_within_window(entry, run_date=run_date, max_age_days=max_age_days)
    }

    for item in items:
        if not item.canonical_url:
            continue
        title = item.title or item.title_raw
        entry = history.urls.get(item.canonical_url)
        if entry is None:
            entry = PublishedUrlEntry(
                first_seen=run_date,
                last_published=run_date,
                titles=[],
            )
            history.urls[item.canonical_url] = entry
        entry.last_published = run_date
        if title and title not in entry.titles:
            entry.titles.append(title)

    history.updated_at = datetime.now(timezone.utc).isoformat()
    payload = json.dumps(_published_urls_to_dict(history), ensure_ascii=False, indent=2)

    path.parent.mkdir(parents=True, exist_ok=True)
    # Write next to the target, then swap it in with a single rename.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    tmp_path.replace(path)
    return history
|
||||||
|
|
||||||
|
|
||||||
def dry_run_publish(slug: str, base_url: str) -> PublishResult:
|
def dry_run_publish(slug: str, base_url: str) -> PublishResult:
|
||||||
return PublishResult(
|
return PublishResult(
|
||||||
mode="dry-run",
|
mode="dry-run",
|
||||||
|
|||||||
@@ -6,10 +6,11 @@ from pathlib import Path
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text
|
from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text
|
||||||
from .config import load_source_configs
|
from .config import load_pipeline_config, load_source_configs
|
||||||
from .env import load_env, resolve_blog_token, resolve_llm_config
|
from .env import load_env, resolve_blog_token, resolve_llm_config
|
||||||
from .models import SourceConfig
|
from .models import SourceConfig
|
||||||
from .pipeline import run_stage0_to_stage8
|
from .pipeline import run_stage0_to_stage8
|
||||||
|
from .publish import load_published_urls, update_published_urls
|
||||||
from .sources.registry import get_source_fetcher
|
from .sources.registry import get_source_fetcher
|
||||||
|
|
||||||
|
|
||||||
@@ -89,6 +90,8 @@ def run_daily_report(
|
|||||||
out_dir: Path,
|
out_dir: Path,
|
||||||
base_url: str,
|
base_url: str,
|
||||||
sources_path: Path | None = None,
|
sources_path: Path | None = None,
|
||||||
|
pipeline_path: Path | None = None,
|
||||||
|
history_path: Path | None = None,
|
||||||
fetch_text=None,
|
fetch_text=None,
|
||||||
env: dict[str, str] | None = None,
|
env: dict[str, str] | None = None,
|
||||||
llm_client_factory=OpenAICompatibleClient,
|
llm_client_factory=OpenAICompatibleClient,
|
||||||
@@ -96,6 +99,15 @@ def run_daily_report(
|
|||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
fetch_text = fetch_text or default_fetch_text
|
fetch_text = fetch_text or default_fetch_text
|
||||||
env = env if env is not None else load_env()
|
env = env if env is not None else load_env()
|
||||||
|
pipeline_config_path = pipeline_path or Path("config") / "pipeline.json"
|
||||||
|
pipeline_config = load_pipeline_config(pipeline_config_path)
|
||||||
|
cross_day_config = pipeline_config.get("cross_day_dedup", {}) or {}
|
||||||
|
cross_day_enabled = bool(cross_day_config.get("enabled", True))
|
||||||
|
cross_day_max_age_days = int(cross_day_config.get("max_age_days", 7))
|
||||||
|
configured_history_path = history_path or Path(
|
||||||
|
str(cross_day_config.get("history_path") or "~/.hermes/scripts/ai_morning_out/published_urls.json")
|
||||||
|
).expanduser()
|
||||||
|
published_urls = load_published_urls(configured_history_path) if cross_day_enabled else None
|
||||||
|
|
||||||
if source_mode == "mock":
|
if source_mode == "mock":
|
||||||
source_configs = _mock_source_configs()
|
source_configs = _mock_source_configs()
|
||||||
@@ -141,6 +153,17 @@ def run_daily_report(
|
|||||||
mode=mode,
|
mode=mode,
|
||||||
base_url=base_url,
|
base_url=base_url,
|
||||||
client=blog_client,
|
client=blog_client,
|
||||||
|
published_urls=published_urls,
|
||||||
|
cross_day_dedup_enabled=cross_day_enabled,
|
||||||
|
cross_day_dedup_max_age_days=cross_day_max_age_days,
|
||||||
|
)
|
||||||
|
|
||||||
|
if cross_day_enabled and result["publish"].mode == "publish" and result["publish"].status == "ok":
|
||||||
|
update_published_urls(
|
||||||
|
configured_history_path,
|
||||||
|
result["items"],
|
||||||
|
run_date=run_date,
|
||||||
|
max_age_days=cross_day_max_age_days,
|
||||||
)
|
)
|
||||||
|
|
||||||
run_dir = out_dir / run_date
|
run_dir = out_dir / run_date
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
from datetime import date, datetime
|
||||||
from email.utils import parsedate_to_datetime
|
from email.utils import parsedate_to_datetime
|
||||||
from typing import Any, Callable
|
from typing import Any, Callable
|
||||||
|
|
||||||
@@ -20,16 +21,57 @@ def _parse_pubdate(value: str) -> str | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]:
|
def _parse_run_date(value: str | None) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return date.fromisoformat(value[:10])
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_iso_date(value: str | None) -> date | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(value).date()
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _within_max_item_age(published_at: str | None, *, run_date: str | None, max_item_age_days: int | None) -> bool:
|
||||||
|
if max_item_age_days is None:
|
||||||
|
return True
|
||||||
|
published_date = _parse_iso_date(published_at)
|
||||||
|
current_date = _parse_run_date(run_date)
|
||||||
|
if published_date is None or current_date is None:
|
||||||
|
return True
|
||||||
|
return (current_date - published_date).days <= max_item_age_days
|
||||||
|
|
||||||
|
|
||||||
|
def parse_rss_items(
|
||||||
|
config: SourceConfig,
|
||||||
|
xml_text: str,
|
||||||
|
*,
|
||||||
|
limit: int = 20,
|
||||||
|
run_date: str | None = None,
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
root = ET.fromstring(xml_text)
|
root = ET.fromstring(xml_text)
|
||||||
channel = root.find("channel")
|
channel = root.find("channel")
|
||||||
raw_items = channel.findall("item") if channel is not None else []
|
raw_items = channel.findall("item") if channel is not None else []
|
||||||
items: list[dict[str, Any]] = []
|
items: list[dict[str, Any]] = []
|
||||||
for raw in raw_items[:limit]:
|
for raw in raw_items:
|
||||||
title = clean_text(raw.findtext("title") or "")
|
title = clean_text(raw.findtext("title") or "")
|
||||||
if not title:
|
if not title:
|
||||||
continue
|
continue
|
||||||
summary = clean_text(raw.findtext("description") or "")
|
summary = clean_text(raw.findtext("description") or "")
|
||||||
|
published_at = _parse_pubdate(raw.findtext("pubDate") or "")
|
||||||
|
if not _within_max_item_age(
|
||||||
|
published_at,
|
||||||
|
run_date=run_date,
|
||||||
|
max_item_age_days=config.max_item_age_days,
|
||||||
|
):
|
||||||
|
continue
|
||||||
items.append(
|
items.append(
|
||||||
{
|
{
|
||||||
"source_group": config.name,
|
"source_group": config.name,
|
||||||
@@ -37,15 +79,16 @@ def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) ->
|
|||||||
"title_raw": title,
|
"title_raw": title,
|
||||||
"summary_raw": summary,
|
"summary_raw": summary,
|
||||||
"url": (raw.findtext("link") or "").strip(),
|
"url": (raw.findtext("link") or "").strip(),
|
||||||
"published_at": _parse_pubdate(raw.findtext("pubDate") or ""),
|
"published_at": published_at,
|
||||||
"origin_type": "rss",
|
"origin_type": "rss",
|
||||||
"section_hint": "",
|
"section_hint": "",
|
||||||
"language_hint": "en" if title.encode("utf-8").isascii() else "zh",
|
"language_hint": "en" if title.encode("utf-8").isascii() else "zh",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
if len(items) >= limit:
|
||||||
|
break
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
|
||||||
def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
|
def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
|
||||||
return parse_rss_items(config, fetch_text(config.url, config.timeout_seconds))
|
return parse_rss_items(config, fetch_text(config.url, config.timeout_seconds), run_date=run_date)
|
||||||
|
|
||||||
|
|||||||
@@ -11,6 +11,10 @@
|
|||||||
],
|
],
|
||||||
"rewrite_batch_size": 10,
|
"rewrite_batch_size": 10,
|
||||||
"semantic_dedup_max_deletion_ratio": 0.5,
|
"semantic_dedup_max_deletion_ratio": 0.5,
|
||||||
"default_mode": "dry-run"
|
"default_mode": "dry-run",
|
||||||
|
"cross_day_dedup": {
|
||||||
|
"enabled": true,
|
||||||
|
"max_age_days": 7,
|
||||||
|
"history_path": "~/.hermes/scripts/ai_morning_out/published_urls.json"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,7 @@
|
|||||||
"priority": 40,
|
"priority": 40,
|
||||||
"timeout_seconds": 25,
|
"timeout_seconds": 25,
|
||||||
"retries": 1,
|
"retries": 1,
|
||||||
|
"max_item_age_days": 3,
|
||||||
"enabled": true
|
"enabled": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -30,6 +31,7 @@
|
|||||||
"priority": 50,
|
"priority": 50,
|
||||||
"timeout_seconds": 25,
|
"timeout_seconds": 25,
|
||||||
"retries": 1,
|
"retries": 1,
|
||||||
|
"max_item_age_days": 5,
|
||||||
"enabled": true
|
"enabled": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -55,4 +57,3 @@
|
|||||||
"enabled": true
|
"enabled": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -16,6 +16,12 @@ class ConfigLoadingTests(unittest.TestCase):
|
|||||||
self.assertEqual(configs[0].name, "AI HOT")
|
self.assertEqual(configs[0].name, "AI HOT")
|
||||||
self.assertEqual(configs[0].type, "aihot")
|
self.assertEqual(configs[0].type, "aihot")
|
||||||
|
|
||||||
|
def test_rss_configs_can_set_max_item_age_days(self):
|
||||||
|
configs = load_source_configs(ROOT / "config" / "sources.json")
|
||||||
|
by_name = {config.name: config for config in configs}
|
||||||
|
|
||||||
|
self.assertEqual(by_name["InfoQ AI"].max_item_age_days, 3)
|
||||||
|
|
||||||
def test_all_configured_source_types_are_registered(self):
|
def test_all_configured_source_types_are_registered(self):
|
||||||
configs = load_source_configs(ROOT / "config" / "sources.json")
|
configs = load_source_configs(ROOT / "config" / "sources.json")
|
||||||
|
|
||||||
|
|||||||
58
tests/test_rss.py
Normal file
58
tests/test_rss.py
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
import unittest
|
||||||
|
|
||||||
|
from ai_daily_report.models import SourceConfig
|
||||||
|
from ai_daily_report.sources.rss import parse_rss_items
|
||||||
|
|
||||||
|
|
||||||
|
class RssSourceTests(unittest.TestCase):
|
||||||
|
def test_parse_rss_items_filters_entries_older_than_configured_age(self):
|
||||||
|
config = SourceConfig(
|
||||||
|
name="InfoQ AI",
|
||||||
|
type="rss",
|
||||||
|
url="https://feed.example/rss",
|
||||||
|
max_item_age_days=3,
|
||||||
|
)
|
||||||
|
xml = """<?xml version="1.0"?>
|
||||||
|
<rss><channel>
|
||||||
|
<item>
|
||||||
|
<title>Fresh item</title>
|
||||||
|
<link>https://example.com/fresh</link>
|
||||||
|
<description>Fresh summary</description>
|
||||||
|
<pubDate>Sun, 07 Jun 2026 06:25:00 GMT</pubDate>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<title>Old item</title>
|
||||||
|
<link>https://example.com/old</link>
|
||||||
|
<description>Old summary</description>
|
||||||
|
<pubDate>Mon, 01 Jun 2026 06:25:00 GMT</pubDate>
|
||||||
|
</item>
|
||||||
|
</channel></rss>"""
|
||||||
|
|
||||||
|
items = parse_rss_items(config, xml, run_date="2026-06-08")
|
||||||
|
|
||||||
|
self.assertEqual([item["title_raw"] for item in items], ["Fresh item"])
|
||||||
|
|
||||||
|
def test_parse_rss_items_keeps_unparseable_dates_to_avoid_false_drops(self):
|
||||||
|
config = SourceConfig(
|
||||||
|
name="InfoQ AI",
|
||||||
|
type="rss",
|
||||||
|
url="https://feed.example/rss",
|
||||||
|
max_item_age_days=3,
|
||||||
|
)
|
||||||
|
xml = """<?xml version="1.0"?>
|
||||||
|
<rss><channel>
|
||||||
|
<item>
|
||||||
|
<title>No date item</title>
|
||||||
|
<link>https://example.com/no-date</link>
|
||||||
|
<description>No date summary</description>
|
||||||
|
<pubDate>not a date</pubDate>
|
||||||
|
</item>
|
||||||
|
</channel></rss>"""
|
||||||
|
|
||||||
|
items = parse_rss_items(config, xml, run_date="2026-06-08")
|
||||||
|
|
||||||
|
self.assertEqual([item["title_raw"] for item in items], ["No date item"])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -3,6 +3,7 @@ import json
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from tempfile import TemporaryDirectory
|
from tempfile import TemporaryDirectory
|
||||||
|
|
||||||
|
from ai_daily_report.publish import load_published_urls
|
||||||
from ai_daily_report.runner import run_daily_report
|
from ai_daily_report.runner import run_daily_report
|
||||||
|
|
||||||
|
|
||||||
@@ -127,6 +128,36 @@ class RunnerTests(unittest.TestCase):
|
|||||||
self.assertGreaterEqual(len(fake_client.prompts), 2)
|
self.assertGreaterEqual(len(fake_client.prompts), 2)
|
||||||
self.assertEqual(result["reports"]["stage8"]["status"], "ok")
|
self.assertEqual(result["reports"]["stage8"]["status"], "ok")
|
||||||
|
|
||||||
|
def test_run_daily_report_publish_updates_published_url_history(self):
|
||||||
|
class FakeBlogClient:
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
self.kwargs = kwargs
|
||||||
|
|
||||||
|
def create_post(self, payload):
|
||||||
|
return {"slug": payload["slug"]}
|
||||||
|
|
||||||
|
def publish_post(self, slug):
|
||||||
|
self.slug = slug
|
||||||
|
|
||||||
|
with TemporaryDirectory() as temp_dir:
|
||||||
|
history_path = Path(temp_dir) / "published_urls.json"
|
||||||
|
result = run_daily_report(
|
||||||
|
run_date="2026-06-08",
|
||||||
|
mode="publish",
|
||||||
|
source_mode="mock",
|
||||||
|
llm_mode="mock",
|
||||||
|
out_dir=Path(temp_dir) / "out",
|
||||||
|
base_url="https://blog.example",
|
||||||
|
env={"BLOG_SERVICE_TOKEN": "token"},
|
||||||
|
blog_client_factory=FakeBlogClient,
|
||||||
|
history_path=history_path,
|
||||||
|
)
|
||||||
|
history = load_published_urls(history_path)
|
||||||
|
|
||||||
|
self.assertEqual(result["reports"]["stage8"]["status"], "ok")
|
||||||
|
self.assertIn("https://example.com/gpt5", history.urls)
|
||||||
|
self.assertEqual(history.urls["https://example.com/gpt5"].last_published, "2026-06-08")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import json
|
|||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from ai_daily_report.pipeline import run_stage0_to_stage4
|
from ai_daily_report.pipeline import run_stage0_to_stage4
|
||||||
|
from ai_daily_report.models import PublishedUrlEntry, PublishedUrls
|
||||||
|
|
||||||
|
|
||||||
class Stage0To4PipelineTests(unittest.TestCase):
|
class Stage0To4PipelineTests(unittest.TestCase):
|
||||||
@@ -61,6 +62,71 @@ class Stage0To4PipelineTests(unittest.TestCase):
|
|||||||
self.assertIn("stage4", result["reports"])
|
self.assertIn("stage4", result["reports"])
|
||||||
self.assertEqual(result["reports"]["stage4"]["rewritten_count"], 2)
|
self.assertEqual(result["reports"]["stage4"]["rewritten_count"], 2)
|
||||||
|
|
||||||
|
def test_run_stage0_to_stage4_filters_published_urls_before_semantic_dedupe(self):
|
||||||
|
configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
|
||||||
|
seen_semantic_payloads = []
|
||||||
|
seen_rewrite_payloads = []
|
||||||
|
|
||||||
|
def fetcher(config, run_date):
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"title_raw": "Already published",
|
||||||
|
"summary_raw": "Old summary",
|
||||||
|
"url": "https://example.com/already",
|
||||||
|
"source_label": config.name,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title_raw": "Fresh story",
|
||||||
|
"summary_raw": "Fresh summary",
|
||||||
|
"url": "https://example.com/fresh",
|
||||||
|
"source_label": config.name,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
def semantic_llm_call(prompt):
|
||||||
|
seen_semantic_payloads.append(json.loads(prompt))
|
||||||
|
return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
|
||||||
|
|
||||||
|
def rewrite_llm_call(prompt):
|
||||||
|
payload = json.loads(prompt)
|
||||||
|
seen_rewrite_payloads.append(payload)
|
||||||
|
return json.dumps(
|
||||||
|
{
|
||||||
|
"rewrites": [
|
||||||
|
{
|
||||||
|
"id": entry["id"],
|
||||||
|
"title": entry["title_raw"],
|
||||||
|
"summary": entry["summary_raw"],
|
||||||
|
"flags": [],
|
||||||
|
}
|
||||||
|
for entry in payload["items"]
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
published_urls = PublishedUrls(
|
||||||
|
urls={
|
||||||
|
"https://example.com/already": PublishedUrlEntry(
|
||||||
|
first_seen="2026-06-07",
|
||||||
|
last_published="2026-06-07",
|
||||||
|
titles=["Already published"],
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
result = run_stage0_to_stage4(
|
||||||
|
configs,
|
||||||
|
"2026-06-08",
|
||||||
|
fetcher=fetcher,
|
||||||
|
semantic_llm_call=semantic_llm_call,
|
||||||
|
rewrite_llm_call=rewrite_llm_call,
|
||||||
|
published_urls=published_urls,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual([entry.title_raw for entry in result["items"]], ["Fresh story"])
|
||||||
|
self.assertEqual(result["reports"]["stage2_5"]["removed_count"], 1)
|
||||||
|
self.assertEqual([entry["title_raw"] for entry in seen_rewrite_payloads[0]["items"]], ["Fresh story"])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from ai_daily_report.dedupe import hard_dedup_items
|
from ai_daily_report.dedupe import cross_day_dedup_items, hard_dedup_items
|
||||||
from ai_daily_report.models import NewsItem
|
from ai_daily_report.models import NewsItem, PublishedUrlEntry, PublishedUrls
|
||||||
|
|
||||||
|
|
||||||
def item(
|
def item(
|
||||||
@@ -58,6 +58,72 @@ class Stage2DedupeTests(unittest.TestCase):
|
|||||||
self.assertEqual(len(report["possible_duplicates"]), 1)
|
self.assertEqual(len(report["possible_duplicates"]), 1)
|
||||||
self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"})
|
self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"})
|
||||||
|
|
||||||
|
def test_hard_dedup_marks_lower_similarity_mixed_language_titles_as_candidates(self):
|
||||||
|
items = [
|
||||||
|
item("a", "OpenAI custom chip lead Clive Chan joins Anthropic", "openai定制芯片核心成员clivechan跳槽至anthropic", "https://example.com/a", "https://example.com/a"),
|
||||||
|
item("b", "OpenAI chip core member defects to Anthropic before mass production", "openai芯片核心叛逃anthropic就在量产前夜", "https://example.com/b", "https://example.com/b"),
|
||||||
|
]
|
||||||
|
|
||||||
|
deduped, report = hard_dedup_items(items)
|
||||||
|
|
||||||
|
self.assertEqual(len(deduped), 2)
|
||||||
|
self.assertEqual(report["removed_count"], 0)
|
||||||
|
self.assertEqual(len(report["possible_duplicates"]), 1)
|
||||||
|
self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"})
|
||||||
|
|
||||||
|
def test_cross_day_dedup_filters_recently_published_canonical_urls_only(self):
|
||||||
|
items = [
|
||||||
|
item("old", "Old URL", "oldurl", "https://example.com/old", "https://example.com/old"),
|
||||||
|
item("new", "New URL", "newurl", "https://example.com/new", "https://example.com/new"),
|
||||||
|
item("missing", "Missing URL", "missingurl", "", ""),
|
||||||
|
]
|
||||||
|
published_urls = PublishedUrls(
|
||||||
|
urls={
|
||||||
|
"https://example.com/old": PublishedUrlEntry(
|
||||||
|
first_seen="2026-06-07",
|
||||||
|
last_published="2026-06-07",
|
||||||
|
titles=["Old URL"],
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
deduped, report = cross_day_dedup_items(
|
||||||
|
items,
|
||||||
|
published_urls,
|
||||||
|
run_date="2026-06-08",
|
||||||
|
max_age_days=7,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual([entry.id for entry in deduped], ["new", "missing"])
|
||||||
|
self.assertEqual(report["input_count"], 3)
|
||||||
|
self.assertEqual(report["output_count"], 2)
|
||||||
|
self.assertEqual(report["removed_count"], 1)
|
||||||
|
self.assertEqual(report["removed"][0]["item_id"], "old")
|
||||||
|
|
||||||
|
def test_cross_day_dedup_ignores_urls_outside_history_window(self):
|
||||||
|
items = [
|
||||||
|
item("stale", "Stale URL", "staleurl", "https://example.com/stale", "https://example.com/stale"),
|
||||||
|
]
|
||||||
|
published_urls = PublishedUrls(
|
||||||
|
urls={
|
||||||
|
"https://example.com/stale": PublishedUrlEntry(
|
||||||
|
first_seen="2026-05-01",
|
||||||
|
last_published="2026-05-01",
|
||||||
|
titles=["Stale URL"],
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
deduped, report = cross_day_dedup_items(
|
||||||
|
items,
|
||||||
|
published_urls,
|
||||||
|
run_date="2026-06-08",
|
||||||
|
max_age_days=7,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual([entry.id for entry in deduped], ["stale"])
|
||||||
|
self.assertEqual(report["removed_count"], 0)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
import unittest
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from tempfile import TemporaryDirectory
|
||||||
|
|
||||||
from ai_daily_report.publish import publish_markdown
|
from ai_daily_report.models import NewsItem
|
||||||
|
from ai_daily_report.publish import load_published_urls, publish_markdown, update_published_urls
|
||||||
|
|
||||||
|
|
||||||
class FakeBlogClient:
|
class FakeBlogClient:
|
||||||
@@ -71,6 +74,46 @@ class Stage8PublishTests(unittest.TestCase):
|
|||||||
self.assertEqual(client.published_slug, "ai-2026-06-04")
|
self.assertEqual(client.published_slug, "ai-2026-06-04")
|
||||||
self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04")
|
self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04")
|
||||||
|
|
||||||
|
def test_update_published_urls_writes_canonical_urls_for_final_items(self):
|
||||||
|
with TemporaryDirectory() as temp_dir:
|
||||||
|
history_path = Path(temp_dir) / "published_urls.json"
|
||||||
|
items = [
|
||||||
|
NewsItem(
|
||||||
|
id="a",
|
||||||
|
source_group="AI HOT",
|
||||||
|
source_label="AI HOT",
|
||||||
|
source_role="primary",
|
||||||
|
source_priority=10,
|
||||||
|
title_raw="Fresh story",
|
||||||
|
title_norm="freshstory",
|
||||||
|
summary_raw="summary",
|
||||||
|
url="https://example.com/fresh?utm_source=x",
|
||||||
|
canonical_url="https://example.com/fresh",
|
||||||
|
title="Fresh story",
|
||||||
|
),
|
||||||
|
NewsItem(
|
||||||
|
id="missing",
|
||||||
|
source_group="AI HOT",
|
||||||
|
source_label="AI HOT",
|
||||||
|
source_role="primary",
|
||||||
|
source_priority=10,
|
||||||
|
title_raw="Missing URL",
|
||||||
|
title_norm="missingurl",
|
||||||
|
summary_raw="summary",
|
||||||
|
url="",
|
||||||
|
canonical_url="",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
update_published_urls(history_path, items, run_date="2026-06-08", max_age_days=7)
|
||||||
|
loaded = load_published_urls(history_path)
|
||||||
|
|
||||||
|
self.assertIn("https://example.com/fresh", loaded.urls)
|
||||||
|
self.assertNotIn("", loaded.urls)
|
||||||
|
self.assertEqual(loaded.urls["https://example.com/fresh"].first_seen, "2026-06-08")
|
||||||
|
self.assertEqual(loaded.urls["https://example.com/fresh"].last_published, "2026-06-08")
|
||||||
|
self.assertEqual(loaded.urls["https://example.com/fresh"].titles, ["Fresh story"])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user