Refactor AI daily report pipeline

2026-06-04 15:21:56 +08:00
parent 94e18ce22d
commit 5a98696255
64 changed files with 4778 additions and 1316 deletions
--- a/ai_daily_report/pipeline.py
+++ b/ai_daily_report/pipeline.py
@@ -0,0 +1,219 @@
+from __future__ import annotations
+
+from typing import Any
+
+from .assemble import assemble_markdown
+from .classify import classify_and_order_items
+from .collect import Fetcher, collect_sources
+from .dedupe import hard_dedup_items
+from .guide import GuideLlmCall, generate_guide
+from .models import SourceConfig
+from .normalize import normalize_items
+from .publish import BlogClient, publish_markdown
+from .rewrite import RewriteLlmCall, rewrite_items
+from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
+
+
+def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
+    return SourceConfig(
+        name=value["name"],
+        type=value["type"],
+        role=value.get("role", "supplement"),
+        priority=int(value.get("priority", 100)),
+        required=bool(value.get("required", False)),
+        enabled=bool(value.get("enabled", True)),
+        timeout_seconds=int(value.get("timeout_seconds", 25)),
+        retries=int(value.get("retries", 0)),
+        min_items=int(value.get("min_items", 0)),
+        url=value.get("url", ""),
+    )
+
+
+def run_stage0_to_stage2(
+    source_configs: list[dict[str, Any] | SourceConfig],
+    run_date: str,
+    *,
+    fetcher: Fetcher,
+) -> dict[str, Any]:
+    configs = [
+        config if isinstance(config, SourceConfig) else _source_config_from_dict(config)
+        for config in source_configs
+    ]
+    source_results, stage0_report = collect_sources(configs, run_date, fetcher=fetcher)
+    source_priorities = {config.name: config.priority for config in configs}
+    normalized_items, stage1_report = normalize_items(
+        source_results,
+        run_date=run_date,
+        source_priorities=source_priorities,
+    )
+    deduped_items, stage2_report = hard_dedup_items(normalized_items)
+    return {
+        "source_results": source_results,
+        "items": deduped_items,
+        "reports": {
+            "stage0": stage0_report,
+            "stage1": stage1_report,
+            "stage2": stage2_report,
+        },
+    }
+
+
+def run_stage0_to_stage4(
+    source_configs: list[dict[str, Any] | SourceConfig],
+    run_date: str,
+    *,
+    fetcher: Fetcher,
+    semantic_llm_call: SemanticLlmCall,
+    rewrite_llm_call: RewriteLlmCall,
+) -> dict[str, Any]:
+    stage2_result = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
+    items = stage2_result["items"]
+    candidates = stage2_result["reports"]["stage2"].get("possible_duplicates", [])
+    semantic_items, stage3_report = semantic_dedup_items(
+        items,
+        candidates,
+        llm_call=semantic_llm_call,
+    )
+    rewritten_items, stage4_report = rewrite_items(
+        semantic_items,
+        llm_call=rewrite_llm_call,
+    )
+    reports = dict(stage2_result["reports"])
+    reports["stage3"] = stage3_report
+    reports["stage4"] = stage4_report
+    return {
+        "source_results": stage2_result["source_results"],
+        "items": rewritten_items,
+        "reports": reports,
+    }
+
+
+def run_stage0_to_stage5(
+    source_configs: list[dict[str, Any] | SourceConfig],
+    run_date: str,
+    *,
+    fetcher: Fetcher,
+    semantic_llm_call: SemanticLlmCall,
+    rewrite_llm_call: RewriteLlmCall,
+) -> dict[str, Any]:
+    stage4_result = run_stage0_to_stage4(
+        source_configs,
+        run_date,
+        fetcher=fetcher,
+        semantic_llm_call=semantic_llm_call,
+        rewrite_llm_call=rewrite_llm_call,
+    )
+    classified_items, stage5_report = classify_and_order_items(stage4_result["items"])
+    reports = dict(stage4_result["reports"])
+    reports["stage5"] = stage5_report
+    return {
+        "source_results": stage4_result["source_results"],
+        "items": classified_items,
+        "reports": reports,
+    }
+
+
+def run_stage0_to_stage6(
+    source_configs: list[dict[str, Any] | SourceConfig],
+    run_date: str,
+    *,
+    fetcher: Fetcher,
+    semantic_llm_call: SemanticLlmCall,
+    rewrite_llm_call: RewriteLlmCall,
+    guide_llm_call: GuideLlmCall,
+) -> dict[str, Any]:
+    stage5_result = run_stage0_to_stage5(
+        source_configs,
+        run_date,
+        fetcher=fetcher,
+        semantic_llm_call=semantic_llm_call,
+        rewrite_llm_call=rewrite_llm_call,
+    )
+    guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call)
+    reports = dict(stage5_result["reports"])
+    reports["stage6"] = stage6_report
+    return {
+        "source_results": stage5_result["source_results"],
+        "items": stage5_result["items"],
+        "guide": guide,
+        "reports": reports,
+    }
+
+
+def run_stage0_to_stage7(
+    source_configs: list[dict[str, Any] | SourceConfig],
+    run_date: str,
+    *,
+    fetcher: Fetcher,
+    semantic_llm_call: SemanticLlmCall,
+    rewrite_llm_call: RewriteLlmCall,
+    guide_llm_call: GuideLlmCall,
+) -> dict[str, Any]:
+    stage6_result = run_stage0_to_stage6(
+        source_configs,
+        run_date,
+        fetcher=fetcher,
+        semantic_llm_call=semantic_llm_call,
+        rewrite_llm_call=rewrite_llm_call,
+        guide_llm_call=guide_llm_call,
+    )
+    markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"])
+    reports = dict(stage6_result["reports"])
+    reports["stage7"] = stage7_report
+    return {
+        "source_results": stage6_result["source_results"],
+        "items": stage6_result["items"],
+        "guide": stage6_result["guide"],
+        "markdown": markdown,
+        "reports": reports,
+    }
+
+
+def run_stage0_to_stage8(
+    source_configs: list[dict[str, Any] | SourceConfig],
+    run_date: str,
+    *,
+    fetcher: Fetcher,
+    semantic_llm_call: SemanticLlmCall,
+    rewrite_llm_call: RewriteLlmCall,
+    guide_llm_call: GuideLlmCall,
+    mode: str,
+    base_url: str,
+    client: BlogClient | None,
+) -> dict[str, Any]:
+    stage7_result = run_stage0_to_stage7(
+        source_configs,
+        run_date,
+        fetcher=fetcher,
+        semantic_llm_call=semantic_llm_call,
+        rewrite_llm_call=rewrite_llm_call,
+        guide_llm_call=guide_llm_call,
+    )
+    slug = f"ai-{run_date}"
+    publish_result = publish_markdown(
+        title=f"AI日报 · {run_date}",
+        markdown=stage7_result["markdown"],
+        tags=["AI日报", "AI资讯", "人工智能"],
+        slug=slug,
+        base_url=base_url,
+        mode=mode,
+        markdown_report=stage7_result["reports"]["stage7"],
+        client=client,
+    )
+    reports = dict(stage7_result["reports"])
+    reports["stage8"] = {
+        "mode": publish_result.mode,
+        "status": publish_result.status,
+        "slug": publish_result.slug,
+        "blog_url": publish_result.blog_url,
+        "public_ok": publish_result.public_ok,
+        "error": publish_result.error,
+    }
+    return {
+        "source_results": stage7_result["source_results"],
+        "items": stage7_result["items"],
+        "guide": stage7_result["guide"],
+        "markdown": stage7_result["markdown"],
+        "publish": publish_result,
+        "reports": reports,
+    }