"""Pipeline entry points that chain the processing stages from stage 0 up to stage 8.

Each ``run_stage0_to_stageN`` function runs the previous entry point and then
adds its own stage(s), returning a dict that carries the accumulated
``reports`` and ``artifacts`` forward.
"""

from __future__ import annotations

from typing import Any

from .assemble import assemble_markdown
from .candidate_recall import recall_semantic_candidates
from .classify import classify_and_order_items
from .collect import Fetcher, collect_sources
from .dedupe import cross_day_dedup_items, hard_dedup_items
from .guide import GuideLlmCall, generate_guide
from .models import PublishedUrls, SourceConfig
from .normalize import normalize_items
from .publish import BlogClient, publish_markdown
from .quality_gate import evaluate_quality_gate
from .rewrite import RewriteLlmCall, rewrite_items
from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items


def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
    """Build a ``SourceConfig`` from a plain mapping, filling in defaults.

    ``name`` and ``type`` are mandatory (a missing key raises ``KeyError``);
    every other field falls back to the default spelled out below.  When
    ``failure_policy`` is missing or falsy it is derived from ``required``:
    ``"block"`` for required sources, ``"warn"`` otherwise.
    """
    max_item_age_days = value.get("max_item_age_days")
    required = bool(value.get("required", False))
    return SourceConfig(
        name=value["name"],
        type=value["type"],
        role=value.get("role", "supplement"),
        priority=int(value.get("priority", 100)),
        required=required,
        enabled=bool(value.get("enabled", True)),
        timeout_seconds=int(value.get("timeout_seconds", 25)),
        retries=int(value.get("retries", 0)),
        min_items=int(value.get("min_items", 0)),
        url=value.get("url", ""),
        # Keep None as "no age limit configured" rather than coercing it.
        max_item_age_days=int(max_item_age_days) if max_item_age_days is not None else None,
        failure_policy=str(value.get("failure_policy") or ("block" if required else "warn")),
    )


def _effective_publish_mode(mode: str, quality_gate_report: dict[str, Any]) -> str:
    """Return the publish mode to use after applying the required-source policy.

    The policy only applies when the quality gate reported
    ``required_source_failures`` and its ``required_source_failure_policy`` is
    ``"draft"`` or ``"dry_run"``; otherwise the requested ``mode`` is returned
    unchanged.  The policy may only downgrade the request: a caller that asked
    for ``"dry-run"`` always stays in ``"dry-run"`` and is never escalated to
    a ``"draft"`` publish.
    """
    if not quality_gate_report.get("required_source_failures"):
        return mode
    required_policy = str(quality_gate_report.get("required_source_failure_policy") or "block")
    if required_policy not in {"draft", "dry_run"}:
        return mode
    if required_policy == "dry_run" or mode == "dry-run":
        return "dry-run"
    return "draft"


def run_stage0_to_stage2(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
) -> dict[str, Any]:
    """Collect sources (stage 0), normalize items (stage 1) and hard-dedup them (stage 2).

    Dict entries in ``source_configs`` are converted to ``SourceConfig``;
    existing ``SourceConfig`` instances are used as-is.

    Returns a dict with ``source_results``, ``items`` (the stage 2 output),
    ``reports`` (keys ``stage0``/``stage1``/``stage2``) and ``artifacts``
    (the intermediate outputs of each stage).
    """
    configs = [
        config if isinstance(config, SourceConfig) else _source_config_from_dict(config)
        for config in source_configs
    ]
    source_results, stage0_report = collect_sources(configs, run_date, fetcher=fetcher)

    source_priorities = {config.name: config.priority for config in configs}
    normalized_items, stage1_report = normalize_items(
        source_results,
        run_date=run_date,
        source_priorities=source_priorities,
    )
    deduped_items, stage2_report = hard_dedup_items(normalized_items)

    artifacts = {
        "stage0_sources": source_results,
        "stage1_items": normalized_items,
        "stage2_items": deduped_items,
    }
    return {
        "source_results": source_results,
        "items": deduped_items,
        "reports": {
            "stage0": stage0_report,
            "stage1": stage1_report,
            "stage2": stage2_report,
        },
        "artifacts": artifacts,
    }


def run_stage0_to_stage2_5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
    """Run stages 0-2, then the optional cross-day dedup step (stage 2.5).

    When ``cross_day_dedup_enabled`` is false the stage 2 items pass through
    untouched and a synthetic report with ``"enabled": False`` is recorded so
    that ``reports["stage2_5"]`` is always present.
    """
    stage2_result = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)

    if cross_day_dedup_enabled:
        items, stage2_5_report = cross_day_dedup_items(
            stage2_result["items"],
            published_urls,
            run_date=run_date,
            max_age_days=cross_day_dedup_max_age_days,
        )
    else:
        items = stage2_result["items"]
        stage2_5_report = {
            "input_count": len(items),
            "output_count": len(items),
            "removed_count": 0,
            "removed": [],
            "enabled": False,
            "max_age_days": cross_day_dedup_max_age_days,
        }

    # Only fills the flag in when the dedup step did not report it itself.
    stage2_5_report.setdefault("enabled", cross_day_dedup_enabled)

    # Copy so the stage 2 result's own dicts are not modified.
    reports = dict(stage2_result["reports"])
    reports["stage2_5"] = stage2_5_report
    artifacts = dict(stage2_result.get("artifacts", {}))
    artifacts["stage2_5_items"] = items

    return {
        "source_results": stage2_result["source_results"],
        "items": items,
        "reports": reports,
        "artifacts": artifacts,
    }


def run_stage0_to_stage4(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-2.5, then candidate recall (2.8), semantic dedup (3) and rewrite (4).

    ``quality_gate_config`` is accepted but not read by this function.

    Returns the same dict shape as the earlier entry points, with ``items``
    set to the rewritten items and ``stage2_8``/``stage3``/``stage4`` added to
    ``reports`` and the matching intermediate outputs added to ``artifacts``.
    """
    stage2_5_result = run_stage0_to_stage2_5(
        source_configs,
        run_date,
        fetcher=fetcher,
        published_urls=published_urls,
        cross_day_dedup_enabled=cross_day_dedup_enabled,
        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
    )
    items = stage2_5_result["items"]

    # Drop stage 2 duplicate candidates that reference items removed since
    # (e.g. by the cross-day dedup step).
    remaining_ids = {item.id for item in items}
    candidates = [
        candidate
        for candidate in stage2_5_result["reports"]["stage2"].get("possible_duplicates", [])
        if set(candidate.get("item_ids", [])).issubset(remaining_ids)
    ]
    candidates, stage2_8_report = recall_semantic_candidates(
        items,
        existing_candidates=candidates,
        config=semantic_candidate_recall_config,
    )
    semantic_items, stage3_report = semantic_dedup_items(
        items,
        candidates,
        llm_call=semantic_llm_call,
        max_deletion_ratio=semantic_dedup_max_deletion_ratio,
    )
    rewritten_items, stage4_report = rewrite_items(
        semantic_items,
        llm_call=rewrite_llm_call,
        batch_size=rewrite_batch_size,
    )

    reports = dict(stage2_5_result["reports"])
    reports["stage2_8"] = stage2_8_report
    reports["stage3"] = stage3_report
    reports["stage4"] = stage4_report

    artifacts = dict(stage2_5_result.get("artifacts", {}))
    artifacts["stage2_8_candidates"] = candidates
    artifacts["stage3_items"] = semantic_items
    artifacts["stage4_items"] = rewritten_items

    return {
        "source_results": stage2_5_result["source_results"],
        "items": rewritten_items,
        "reports": reports,
        "artifacts": artifacts,
    }


def run_stage0_to_stage5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-4, then classify and order the items (stage 5).

    ``quality_gate_config`` is accepted but not read by this function.

    Returns the stage 4 result shape with ``items`` replaced by the classified
    items and ``reports["stage5"]`` added; ``artifacts`` is passed through.
    """
    stage4_result = run_stage0_to_stage4(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        published_urls=published_urls,
        cross_day_dedup_enabled=cross_day_dedup_enabled,
        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
        semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
        rewrite_batch_size=rewrite_batch_size,
        semantic_candidate_recall_config=semantic_candidate_recall_config,
    )
    classified_items, stage5_report = classify_and_order_items(stage4_result["items"])

    reports = dict(stage4_result["reports"])
    reports["stage5"] = stage5_report

    return {
        "source_results": stage4_result["source_results"],
        "items": classified_items,
        "reports": reports,
        "artifacts": stage4_result.get("artifacts", {}),
    }


def run_stage0_to_stage6(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-5, then generate the guide (stage 6).

    Returns the stage 5 result shape plus a ``guide`` entry and
    ``reports["stage6"]``; ``items`` and ``artifacts`` are passed through.
    """
    stage5_result = run_stage0_to_stage5(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        published_urls=published_urls,
        cross_day_dedup_enabled=cross_day_dedup_enabled,
        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
        semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
        rewrite_batch_size=rewrite_batch_size,
        semantic_candidate_recall_config=semantic_candidate_recall_config,
    )
    guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call)

    reports = dict(stage5_result["reports"])
    reports["stage6"] = stage6_report

    return {
        "source_results": stage5_result["source_results"],
        "items": stage5_result["items"],
        "guide": guide,
        "reports": reports,
        "artifacts": stage5_result.get("artifacts", {}),
    }


def run_stage0_to_stage7(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-6, assemble the markdown (stage 7) and evaluate the quality gate.

    ``blocking_errors`` found in the stage 3-6 reports and in the quality gate
    report are appended to the stage 7 report's ``blocking_errors``.

    Returns the stage 6 result shape plus ``markdown``, with
    ``reports["quality_gate"]``, ``reports["stage7"]`` and
    ``artifacts["quality_gate"]`` added.
    """
    stage6_result = run_stage0_to_stage6(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
        published_urls=published_urls,
        cross_day_dedup_enabled=cross_day_dedup_enabled,
        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
        semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
        rewrite_batch_size=rewrite_batch_size,
        semantic_candidate_recall_config=semantic_candidate_recall_config,
    )
    markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"])

    # Surface blocking errors raised by earlier stages on the stage 7 report.
    upstream_blocking_errors: list[str] = []
    for stage_name in ("stage3", "stage4", "stage5", "stage6"):
        stage_errors = stage6_result["reports"].get(stage_name, {}).get("blocking_errors", []) or []
        upstream_blocking_errors.extend(str(error) for error in stage_errors)
    if upstream_blocking_errors:
        existing_errors = list(stage7_report.get("blocking_errors", []) or [])
        stage7_report["blocking_errors"] = existing_errors + upstream_blocking_errors

    reports = dict(stage6_result["reports"])
    quality_gate_report = evaluate_quality_gate(
        stage6_result["items"],
        source_results=stage6_result["source_results"],
        reports=reports,
        config=quality_gate_config,
    )
    if quality_gate_report.get("blocking_errors"):
        existing_errors = list(stage7_report.get("blocking_errors", []) or [])
        stage7_report["blocking_errors"] = existing_errors + list(quality_gate_report["blocking_errors"])

    reports["quality_gate"] = quality_gate_report
    reports["stage7"] = stage7_report

    artifacts = dict(stage6_result.get("artifacts", {}))
    artifacts["quality_gate"] = quality_gate_report

    return {
        "source_results": stage6_result["source_results"],
        "items": stage6_result["items"],
        "guide": stage6_result["guide"],
        "markdown": markdown,
        "reports": reports,
        "artifacts": artifacts,
    }


def run_stage0_to_stage8(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    mode: str,
    base_url: str,
    client: BlogClient | None,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
    publish_idempotency_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-7, then publish the assembled markdown (stage 8).

    The requested ``mode`` may be downgraded to ``"draft"`` or ``"dry-run"``
    when the quality gate reports required-source failures and its policy asks
    for it; a requested ``"dry-run"`` is never escalated.  Both the requested
    and the mode reported by the publish result are recorded in
    ``reports["stage8"]``.

    Returns the stage 7 result shape plus a ``publish`` entry holding the
    publish result.
    """
    stage7_result = run_stage0_to_stage7(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
        published_urls=published_urls,
        cross_day_dedup_enabled=cross_day_dedup_enabled,
        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
        semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
        rewrite_batch_size=rewrite_batch_size,
        semantic_candidate_recall_config=semantic_candidate_recall_config,
        quality_gate_config=quality_gate_config,
    )

    slug = f"ai-{run_date}"
    quality_gate_report = stage7_result["reports"].get("quality_gate", {}) or {}
    effective_mode = _effective_publish_mode(mode, quality_gate_report)

    publish_result = publish_markdown(
        title=f"AI日报 · {run_date}",
        markdown=stage7_result["markdown"],
        tags=["AI日报", "AI资讯", "人工智能"],
        slug=slug,
        base_url=base_url,
        mode=effective_mode,
        markdown_report=stage7_result["reports"]["stage7"],
        client=client,
        idempotency_config=publish_idempotency_config,
    )

    reports = dict(stage7_result["reports"])
    reports["stage8"] = {
        "requested_mode": mode,
        "mode": publish_result.mode,
        "status": publish_result.status,
        "slug": publish_result.slug,
        "blog_url": publish_result.blog_url,
        "public_ok": publish_result.public_ok,
        "error": publish_result.error,
    }

    return {
        "source_results": stage7_result["source_results"],
        "items": stage7_result["items"],
        "guide": stage7_result["guide"],
        "markdown": stage7_result["markdown"],
        "publish": publish_result,
        "reports": reports,
        "artifacts": stage7_result.get("artifacts", {}),
    }