# ai-daily-report/ai_daily_report/pipeline.py

from __future__ import annotations

from typing import Any

from .assemble import assemble_markdown
from .candidate_recall import recall_semantic_candidates
from .classify import classify_and_order_items
from .collect import Fetcher, collect_sources
from .dedupe import cross_day_dedup_items, hard_dedup_items
from .guide import GuideLlmCall, generate_guide
from .models import PublishedUrls, SourceConfig
from .normalize import normalize_items
from .publish import BlogClient, publish_markdown
from .quality_gate import evaluate_quality_gate
from .rewrite import RewriteLlmCall, rewrite_items
from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items


def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
    """Build a ``SourceConfig`` from a plain dictionary.

    ``name`` and ``type`` are mandatory keys (a missing one raises
    ``KeyError``). Every other field falls back to the default written
    below. When ``failure_policy`` is missing or falsy it is derived from
    the ``required`` flag: ``"block"`` for required sources, ``"warn"``
    otherwise.
    """
    is_required = bool(value.get("required", False))
    fallback_policy = "block" if is_required else "warn"
    raw_max_age = value.get("max_item_age_days")
    return SourceConfig(
        name=value["name"],
        type=value["type"],
        role=value.get("role", "supplement"),
        priority=int(value.get("priority", 100)),
        required=is_required,
        enabled=bool(value.get("enabled", True)),
        timeout_seconds=int(value.get("timeout_seconds", 25)),
        retries=int(value.get("retries", 0)),
        min_items=int(value.get("min_items", 0)),
        url=value.get("url", ""),
        # An absent (None) age limit stays None instead of being coerced to int.
        max_item_age_days=None if raw_max_age is None else int(raw_max_age),
        failure_policy=str(value.get("failure_policy") or fallback_policy),
    )


def run_stage0_to_stage2(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
) -> dict[str, Any]:
    """Run collection (stage 0), normalization (stage 1) and hard dedup (stage 2).

    Args:
        source_configs: Source definitions, either ``SourceConfig`` instances
            or dictionaries that are converted with
            ``_source_config_from_dict``.
        run_date: Run date string, forwarded to collection and normalization.
        fetcher: Fetcher forwarded to ``collect_sources``.

    Returns:
        A dict with ``source_results``, the deduplicated ``items``, the
        per-stage ``reports`` (``stage0``..``stage2``) and the intermediate
        ``artifacts`` of each stage.
    """
    resolved_configs = []
    for entry in source_configs:
        if not isinstance(entry, SourceConfig):
            entry = _source_config_from_dict(entry)
        resolved_configs.append(entry)

    collected, collect_report = collect_sources(resolved_configs, run_date, fetcher=fetcher)

    priority_by_source = {cfg.name: cfg.priority for cfg in resolved_configs}
    normalized, normalize_report = normalize_items(
        collected,
        run_date=run_date,
        source_priorities=priority_by_source,
    )

    unique_items, dedup_report = hard_dedup_items(normalized)

    return {
        "source_results": collected,
        "items": unique_items,
        "reports": {
            "stage0": collect_report,
            "stage1": normalize_report,
            "stage2": dedup_report,
        },
        "artifacts": {
            "stage0_sources": collected,
            "stage1_items": normalized,
            "stage2_items": unique_items,
        },
    }


def run_stage0_to_stage2_5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
    """Run stages 0-2 and then the optional cross-day dedup step (stage 2.5).

    Args:
        source_configs: Source definitions forwarded to ``run_stage0_to_stage2``.
        run_date: Run date string.
        fetcher: Fetcher forwarded to ``run_stage0_to_stage2``.
        published_urls: Forwarded to ``cross_day_dedup_items`` when the step
            is enabled.
        cross_day_dedup_enabled: When false the step is skipped and a
            pass-through report with ``"enabled": False`` is produced.
        cross_day_dedup_max_age_days: Forwarded as ``max_age_days`` and
            recorded in the pass-through report.

    Returns:
        Same shape as ``run_stage0_to_stage2`` with a ``stage2_5`` report and
        a ``stage2_5_items`` artifact added.
    """
    upstream = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)

    if not cross_day_dedup_enabled:
        # Step disabled: items pass through untouched, with a synthetic report.
        kept_items = upstream["items"]
        item_count = len(kept_items)
        cross_day_report = {
            "input_count": item_count,
            "output_count": item_count,
            "removed_count": 0,
            "removed": [],
            "enabled": False,
            "max_age_days": cross_day_dedup_max_age_days,
        }
    else:
        kept_items, cross_day_report = cross_day_dedup_items(
            upstream["items"],
            published_urls,
            run_date=run_date,
            max_age_days=cross_day_dedup_max_age_days,
        )
        # Only fill in the flag when the dedup step did not report it itself.
        cross_day_report.setdefault("enabled", cross_day_dedup_enabled)

    return {
        "source_results": upstream["source_results"],
        "items": kept_items,
        "reports": {**upstream["reports"], "stage2_5": cross_day_report},
        "artifacts": {**upstream.get("artifacts", {}), "stage2_5_items": kept_items},
    }


def run_stage0_to_stage4(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-2.5, then candidate recall, semantic dedup and rewriting.

    Args:
        source_configs: Source definitions forwarded to the earlier stages.
        run_date: Run date string.
        fetcher: Fetcher forwarded to the earlier stages.
        semantic_llm_call: Callable handed to ``semantic_dedup_items``.
        rewrite_llm_call: Callable handed to ``rewrite_items``.
        published_urls: Forwarded to the cross-day dedup step.
        cross_day_dedup_enabled: Forwarded to the cross-day dedup step.
        cross_day_dedup_max_age_days: Forwarded to the cross-day dedup step.
        semantic_dedup_max_deletion_ratio: Forwarded to
            ``semantic_dedup_items`` as ``max_deletion_ratio``.
        rewrite_batch_size: Forwarded to ``rewrite_items`` as ``batch_size``.
        semantic_candidate_recall_config: Forwarded to
            ``recall_semantic_candidates`` as ``config``.
        quality_gate_config: Accepted by the signature but not read anywhere
            in this function.

    Returns:
        Same shape as the earlier stages, with the rewritten ``items``,
        ``stage2_8``/``stage3``/``stage4`` reports and the matching artifacts.
    """
    upstream = run_stage0_to_stage2_5(
        source_configs,
        run_date,
        fetcher=fetcher,
        published_urls=published_urls,
        cross_day_dedup_enabled=cross_day_dedup_enabled,
        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
    )
    surviving_items = upstream["items"]
    surviving_ids = {item.id for item in surviving_items}

    # Carry over only the stage-2 duplicate hints whose items all still exist
    # after cross-day dedup.
    carried_hints = []
    for hint in upstream["reports"]["stage2"].get("possible_duplicates", []):
        if set(hint.get("item_ids", [])) <= surviving_ids:
            carried_hints.append(hint)

    recalled_candidates, recall_report = recall_semantic_candidates(
        surviving_items,
        existing_candidates=carried_hints,
        config=semantic_candidate_recall_config,
    )
    merged_items, semantic_report = semantic_dedup_items(
        surviving_items,
        recalled_candidates,
        llm_call=semantic_llm_call,
        max_deletion_ratio=semantic_dedup_max_deletion_ratio,
    )
    final_items, rewrite_report = rewrite_items(
        merged_items,
        llm_call=rewrite_llm_call,
        batch_size=rewrite_batch_size,
    )

    return {
        "source_results": upstream["source_results"],
        "items": final_items,
        "reports": {
            **upstream["reports"],
            "stage2_8": recall_report,
            "stage3": semantic_report,
            "stage4": rewrite_report,
        },
        "artifacts": {
            **upstream.get("artifacts", {}),
            "stage2_8_candidates": recalled_candidates,
            "stage3_items": merged_items,
            "stage4_items": final_items,
        },
    }


def run_stage0_to_stage5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-4 and then classify/order the items (stage 5).

    All parameters except ``quality_gate_config`` are forwarded unchanged to
    ``run_stage0_to_stage4``; ``quality_gate_config`` is accepted by the
    signature but neither read nor forwarded here.

    Returns:
        A dict with the upstream ``source_results``, the classified ``items``,
        the upstream ``reports`` plus ``stage5``, and the upstream
        ``artifacts`` mapping as-is (not copied).
    """
    stage4_options = {
        "fetcher": fetcher,
        "semantic_llm_call": semantic_llm_call,
        "rewrite_llm_call": rewrite_llm_call,
        "published_urls": published_urls,
        "cross_day_dedup_enabled": cross_day_dedup_enabled,
        "cross_day_dedup_max_age_days": cross_day_dedup_max_age_days,
        "semantic_dedup_max_deletion_ratio": semantic_dedup_max_deletion_ratio,
        "rewrite_batch_size": rewrite_batch_size,
        "semantic_candidate_recall_config": semantic_candidate_recall_config,
    }
    upstream = run_stage0_to_stage4(source_configs, run_date, **stage4_options)

    ordered_items, classify_report = classify_and_order_items(upstream["items"])

    return {
        "source_results": upstream["source_results"],
        "items": ordered_items,
        "reports": {**upstream["reports"], "stage5": classify_report},
        "artifacts": upstream.get("artifacts", {}),
    }


def run_stage0_to_stage6(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-5 and then generate the guide (stage 6).

    Every parameter except ``guide_llm_call`` is forwarded unchanged to
    ``run_stage0_to_stage5``; ``guide_llm_call`` is handed to
    ``generate_guide`` as ``llm_call``.

    Returns:
        A dict with the upstream ``source_results`` and ``items``, the
        generated ``guide``, the upstream ``reports`` plus ``stage6``, and the
        upstream ``artifacts`` mapping as-is (not copied).
    """
    stage5_options = {
        "fetcher": fetcher,
        "semantic_llm_call": semantic_llm_call,
        "rewrite_llm_call": rewrite_llm_call,
        "published_urls": published_urls,
        "cross_day_dedup_enabled": cross_day_dedup_enabled,
        "cross_day_dedup_max_age_days": cross_day_dedup_max_age_days,
        "semantic_dedup_max_deletion_ratio": semantic_dedup_max_deletion_ratio,
        "rewrite_batch_size": rewrite_batch_size,
        "semantic_candidate_recall_config": semantic_candidate_recall_config,
    }
    upstream = run_stage0_to_stage5(source_configs, run_date, **stage5_options)
    classified_items = upstream["items"]

    guide, guide_report = generate_guide(classified_items, llm_call=guide_llm_call)

    return {
        "source_results": upstream["source_results"],
        "items": classified_items,
        "guide": guide,
        "reports": {**upstream["reports"], "stage6": guide_report},
        "artifacts": upstream.get("artifacts", {}),
    }


def run_stage0_to_stage7(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-6, assemble the markdown (stage 7) and run the quality gate.

    Blocking errors reported by stages 3-6 and by the quality gate are
    appended to the stage-7 report's ``blocking_errors`` list. The list is
    only written when there is something to append.

    Args:
        quality_gate_config: Forwarded to ``evaluate_quality_gate`` as
            ``config``.
        (all other parameters): Forwarded unchanged to
            ``run_stage0_to_stage6``.

    Returns:
        A dict with ``source_results``, ``items``, ``guide``, the assembled
        ``markdown``, ``reports`` (upstream plus ``quality_gate`` and
        ``stage7``) and ``artifacts`` (upstream plus ``quality_gate``).
    """
    stage6_options = {
        "fetcher": fetcher,
        "semantic_llm_call": semantic_llm_call,
        "rewrite_llm_call": rewrite_llm_call,
        "guide_llm_call": guide_llm_call,
        "published_urls": published_urls,
        "cross_day_dedup_enabled": cross_day_dedup_enabled,
        "cross_day_dedup_max_age_days": cross_day_dedup_max_age_days,
        "semantic_dedup_max_deletion_ratio": semantic_dedup_max_deletion_ratio,
        "rewrite_batch_size": rewrite_batch_size,
        "semantic_candidate_recall_config": semantic_candidate_recall_config,
    }
    stage6_result = run_stage0_to_stage6(source_configs, run_date, **stage6_options)
    items = stage6_result["items"]
    guide = stage6_result["guide"]
    source_results = stage6_result["source_results"]

    markdown, assemble_report = assemble_markdown(items, guide)
    reports = dict(stage6_result["reports"])

    # Surface blocking errors from the LLM-driven stages on the stage-7 report.
    inherited_errors = [
        str(problem)
        for stage_key in ("stage3", "stage4", "stage5", "stage6")
        for problem in (reports.get(stage_key, {}).get("blocking_errors", []) or [])
    ]
    if inherited_errors:
        assemble_report["blocking_errors"] = [
            *(assemble_report.get("blocking_errors", []) or []),
            *inherited_errors,
        ]

    # The gate sees the upstream reports only; its own report and the stage-7
    # report are attached afterwards.
    gate_report = evaluate_quality_gate(
        items,
        source_results=source_results,
        reports=reports,
        config=quality_gate_config,
    )
    gate_errors = gate_report.get("blocking_errors")
    if gate_errors:
        assemble_report["blocking_errors"] = [
            *(assemble_report.get("blocking_errors", []) or []),
            *gate_errors,
        ]

    reports["quality_gate"] = gate_report
    reports["stage7"] = assemble_report

    return {
        "source_results": source_results,
        "items": items,
        "guide": guide,
        "markdown": markdown,
        "reports": reports,
        "artifacts": {**stage6_result.get("artifacts", {}), "quality_gate": gate_report},
    }


def run_stage0_to_stage8(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    mode: str,
    base_url: str,
    client: BlogClient | None,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
    publish_idempotency_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-7 and hand the assembled markdown to ``publish_markdown``.

    The publish mode is normally ``mode``. When the quality gate reports
    required-source failures and its ``required_source_failure_policy`` is
    ``"draft"`` or ``"dry_run"``, the mode is overridden with ``"draft"`` or
    ``"dry-run"`` respectively. A requested ``"dry-run"`` is never overridden,
    so the policy cannot turn a dry run into a real draft publish.

    Args:
        mode: Requested publish mode; recorded as ``requested_mode`` in the
            stage-8 report.
        base_url: Forwarded to ``publish_markdown``.
        client: Forwarded to ``publish_markdown``.
        publish_idempotency_config: Forwarded to ``publish_markdown`` as
            ``idempotency_config``.
        (all other parameters): Forwarded unchanged to
            ``run_stage0_to_stage7``.

    Returns:
        The stage-7 result fields plus ``publish`` (the object returned by
        ``publish_markdown``) and a ``stage8`` entry in ``reports``.
    """
    stage7_result = run_stage0_to_stage7(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
        published_urls=published_urls,
        cross_day_dedup_enabled=cross_day_dedup_enabled,
        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
        semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
        rewrite_batch_size=rewrite_batch_size,
        semantic_candidate_recall_config=semantic_candidate_recall_config,
        quality_gate_config=quality_gate_config,
    )
    slug = f"ai-{run_date}"

    quality_gate_report = stage7_result["reports"].get("quality_gate", {}) or {}
    required_policy = str(quality_gate_report.get("required_source_failure_policy") or "block")

    effective_mode = mode
    # The failure policy exists to downgrade a publish. Skip it when the
    # caller already asked for a dry run: otherwise the "draft" policy would
    # escalate a side-effect-free request into a real draft.
    if (
        mode != "dry-run"
        and quality_gate_report.get("required_source_failures")
        and required_policy in {"draft", "dry_run"}
    ):
        effective_mode = "dry-run" if required_policy == "dry_run" else "draft"

    publish_result = publish_markdown(
        title=f"AI日报 · {run_date}",
        markdown=stage7_result["markdown"],
        tags=["AI日报", "AI资讯", "人工智能"],
        slug=slug,
        base_url=base_url,
        mode=effective_mode,
        markdown_report=stage7_result["reports"]["stage7"],
        client=client,
        idempotency_config=publish_idempotency_config,
    )
    reports = dict(stage7_result["reports"])
    reports["stage8"] = {
        "requested_mode": mode,
        "mode": publish_result.mode,
        "status": publish_result.status,
        "slug": publish_result.slug,
        "blog_url": publish_result.blog_url,
        "public_ok": publish_result.public_ok,
        "error": publish_result.error,
    }
    return {
        "source_results": stage7_result["source_results"],
        "items": stage7_result["items"],
        "guide": stage7_result["guide"],
        "markdown": stage7_result["markdown"],
        "publish": publish_result,
        "reports": reports,
        "artifacts": stage7_result.get("artifacts", {}),
    }