from __future__ import annotations import json from dataclasses import asdict, is_dataclass from pathlib import Path from typing import Any from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text from .config import load_pipeline_config, load_source_configs from .env import load_env, resolve_blog_token, resolve_llm_config from .models import SourceConfig from .observability import LlmCallObserver, summarize_observed_calls from .pipeline import run_stage0_to_stage8 from .publish import load_published_urls, update_published_urls from .sources.registry import get_source_fetcher def _json_default(value: Any): if is_dataclass(value): return asdict(value) raise TypeError(f"Object is not JSON serializable: {type(value).__name__}") def _mock_source_configs() -> list[SourceConfig]: return [SourceConfig(name="Mock AI HOT", type="mock", role="primary", priority=10)] def _mock_fetcher(config: SourceConfig, run_date: str) -> list[dict[str, Any]]: return [ { "title_raw": "GPT-5 API 发布", "summary_raw": "OpenAI 发布 GPT-5 API,用于本地 mock 测试。", "url": "https://example.com/gpt5", "source_label": "OpenAI:Blog", "section_hint": "模型发布/更新", "origin_type": "mock", "language_hint": "zh", } ] def _mock_semantic_llm(prompt: str) -> str: return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []}, ensure_ascii=False) def _mock_rewrite_llm(prompt: str) -> str: payload = json.loads(prompt) return json.dumps( { "rewrites": [ { "id": item["id"], "title": item["title_raw"], "summary": item["summary_raw"], "flags": [], } for item in payload["items"] ] }, ensure_ascii=False, ) def _mock_guide_llm(prompt: str) -> str: payload = json.loads(prompt) item_ids = [item["id"] for item in payload["items"][:3]] return json.dumps( { "intro": "本地 mock 模式已生成 AI 日报,用于验证流水线。", "theme": "本地 mock 模式已生成 AI 日报,用于验证流水线。", "threads": [ { "title": "本地链路验证", "text": "采集、改写、分类、导览、Markdown 和发布报告都已通过 mock 数据串联。", "item_ids": item_ids, "kind": "thread", } ], "conclusion": "本地 mock 结果可用于确认定时任务入口和文件输出是否正常。", }, ensure_ascii=False, ) def run_daily_report( *, run_date: str, mode: str, source_mode: str, llm_mode: str, out_dir: Path, base_url: str, sources_path: Path | None = None, pipeline_path: Path | None = None, history_path: Path | None = None, fetch_text=None, env: dict[str, str] | None = None, llm_client_factory=OpenAICompatibleClient, blog_client_factory=BlogApiClient, ) -> dict[str, Any]: fetch_text = fetch_text or default_fetch_text env = env if env is not None else load_env() pipeline_config_path = pipeline_path or Path("config") / "pipeline.json" pipeline_config = load_pipeline_config(pipeline_config_path) cross_day_config = pipeline_config.get("cross_day_dedup", {}) or {} cross_day_enabled = bool(cross_day_config.get("enabled", True)) cross_day_max_age_days = int(cross_day_config.get("max_age_days", 7)) semantic_dedup_max_deletion_ratio = float(pipeline_config.get("semantic_dedup_max_deletion_ratio", 0.5)) rewrite_batch_size = int(pipeline_config.get("rewrite_batch_size", 30)) semantic_candidate_recall_config = pipeline_config.get("semantic_candidate_recall", {}) or {} quality_gate_config = pipeline_config.get("quality_gate", {}) or {} publish_idempotency_config = pipeline_config.get("publish_idempotency", {}) or {} configured_history_path = history_path or Path( str(cross_day_config.get("history_path") or "~/.hermes/scripts/ai_morning_out/published_urls.json") ).expanduser() published_urls = load_published_urls(configured_history_path) if cross_day_enabled else None if source_mode == "mock": source_configs = _mock_source_configs() fetcher = _mock_fetcher elif source_mode == "live": if sources_path is None: sources_path = Path("config") / "sources.json" source_configs = load_source_configs(sources_path) def fetcher(config: SourceConfig, current_date: str) -> list[dict[str, Any]]: source_fetcher = get_source_fetcher(config.type) def configured_fetch_text(url: str, timeout_seconds: int) -> str: try: return fetch_text(url, timeout_seconds, retries=config.retries) except TypeError: return fetch_text(url, timeout_seconds) return source_fetcher(config, current_date, configured_fetch_text) else: raise ValueError("source_mode must be 'mock' or 'live'") llm_observability_config = pipeline_config.get("llm_observability", {}) or {} llm_observers: list[LlmCallObserver] = [] observe_llm = bool(llm_observability_config.get("enabled", True)) prompt_preview_chars = int(llm_observability_config.get("prompt_preview_chars", 500)) response_preview_chars = int(llm_observability_config.get("response_preview_chars", 500)) def maybe_observe(stage: str, call): if not observe_llm: return call observer = LlmCallObserver( call=call, stage=stage, prompt_preview_chars=prompt_preview_chars, response_preview_chars=response_preview_chars, ) llm_observers.append(observer) return observer if llm_mode == "mock": semantic_llm_call = maybe_observe("stage3", _mock_semantic_llm) rewrite_llm_call = maybe_observe("stage4", _mock_rewrite_llm) guide_llm_call = maybe_observe("stage6", _mock_guide_llm) elif llm_mode == "live": llm_client = llm_client_factory(**resolve_llm_config(env)) semantic_llm_call = maybe_observe("stage3", llm_client.chat) rewrite_llm_call = maybe_observe("stage4", llm_client.chat) guide_llm_call = maybe_observe("stage6", llm_client.chat) else: raise ValueError("llm_mode must be 'mock' or 'live'") blog_client = None if mode in ("draft", "publish"): token = resolve_blog_token(env) if not token: raise ValueError("missing_blog_token: set BLOG_SERVICE_TOKEN or EPHRON_SERVICE_TOKEN") blog_client = blog_client_factory(base_url=base_url, token=token) result = run_stage0_to_stage8( source_configs, run_date, fetcher=fetcher, semantic_llm_call=semantic_llm_call, rewrite_llm_call=rewrite_llm_call, guide_llm_call=guide_llm_call, mode=mode, base_url=base_url, client=blog_client, published_urls=published_urls, cross_day_dedup_enabled=cross_day_enabled, cross_day_dedup_max_age_days=cross_day_max_age_days, semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio, rewrite_batch_size=rewrite_batch_size, semantic_candidate_recall_config=semantic_candidate_recall_config, quality_gate_config=quality_gate_config, publish_idempotency_config=publish_idempotency_config, ) if cross_day_enabled and result["publish"].mode == "publish" and result["publish"].status == "ok": update_published_urls( configured_history_path, result["items"], run_date=run_date, max_age_days=cross_day_max_age_days, ) llm_observability_report = summarize_observed_calls(llm_observers) result["reports"]["llm_observability"] = llm_observability_report run_dir = out_dir / run_date run_dir.mkdir(parents=True, exist_ok=True) (run_dir / "blog_markdown.md").write_text(result["markdown"], encoding="utf-8") (run_dir / "run_report.json").write_text( json.dumps(result["reports"], ensure_ascii=False, indent=2, default=_json_default), encoding="utf-8", ) for artifact_name, artifact_value in result.get("artifacts", {}).items(): (run_dir / f"{artifact_name}.json").write_text( json.dumps(artifact_value, ensure_ascii=False, indent=2, default=_json_default), encoding="utf-8", ) return { "run_dir": str(run_dir), "markdown": result["markdown"], "reports": result["reports"], "publish": result["publish"], "artifacts": result.get("artifacts", {}), }