fix: add cross-day dedupe

This commit is contained in:
Mimikko-zeus
2026-06-08 12:05:45 +08:00
parent 2671aee850
commit 07786e3bc0
16 changed files with 671 additions and 21 deletions

View File

@@ -6,10 +6,11 @@ from pathlib import Path
from typing import Any
from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text
from .config import load_source_configs
from .config import load_pipeline_config, load_source_configs
from .env import load_env, resolve_blog_token, resolve_llm_config
from .models import SourceConfig
from .pipeline import run_stage0_to_stage8
from .publish import load_published_urls, update_published_urls
from .sources.registry import get_source_fetcher
@@ -89,6 +90,8 @@ def run_daily_report(
out_dir: Path,
base_url: str,
sources_path: Path | None = None,
pipeline_path: Path | None = None,
history_path: Path | None = None,
fetch_text=None,
env: dict[str, str] | None = None,
llm_client_factory=OpenAICompatibleClient,
@@ -96,6 +99,15 @@ def run_daily_report(
) -> dict[str, Any]:
fetch_text = fetch_text or default_fetch_text
env = env if env is not None else load_env()
pipeline_config_path = pipeline_path or Path("config") / "pipeline.json"
pipeline_config = load_pipeline_config(pipeline_config_path)
cross_day_config = pipeline_config.get("cross_day_dedup", {}) or {}
cross_day_enabled = bool(cross_day_config.get("enabled", True))
cross_day_max_age_days = int(cross_day_config.get("max_age_days", 7))
configured_history_path = history_path or Path(
str(cross_day_config.get("history_path") or "~/.hermes/scripts/ai_morning_out/published_urls.json")
).expanduser()
published_urls = load_published_urls(configured_history_path) if cross_day_enabled else None
if source_mode == "mock":
source_configs = _mock_source_configs()
@@ -141,8 +153,19 @@ def run_daily_report(
mode=mode,
base_url=base_url,
client=blog_client,
published_urls=published_urls,
cross_day_dedup_enabled=cross_day_enabled,
cross_day_dedup_max_age_days=cross_day_max_age_days,
)
if cross_day_enabled and result["publish"].mode == "publish" and result["publish"].status == "ok":
update_published_urls(
configured_history_path,
result["items"],
run_date=run_date,
max_age_days=cross_day_max_age_days,
)
run_dir = out_dir / run_date
run_dir.mkdir(parents=True, exist_ok=True)
(run_dir / "blog_markdown.md").write_text(result["markdown"], encoding="utf-8")