387 lines
14 KiB
Python
387 lines
14 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import Any
|
|
|
|
from .assemble import assemble_markdown
|
|
from .candidate_recall import recall_semantic_candidates
|
|
from .classify import classify_and_order_items
|
|
from .collect import Fetcher, collect_sources
|
|
from .dedupe import cross_day_dedup_items, hard_dedup_items
|
|
from .guide import GuideLlmCall, generate_guide
|
|
from .models import PublishedUrls, SourceConfig
|
|
from .normalize import normalize_items
|
|
from .publish import BlogClient, publish_markdown
|
|
from .quality_gate import evaluate_quality_gate
|
|
from .rewrite import RewriteLlmCall, rewrite_items
|
|
from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
|
|
|
|
|
|
def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
    """Build a ``SourceConfig`` from a plain mapping, filling in defaults.

    ``name`` and ``type`` are mandatory keys (a missing one raises
    ``KeyError``). Every other field falls back to a default when absent:
    role ``"supplement"``, priority ``100``, required ``False``, enabled
    ``True``, timeout_seconds ``25``, retries ``0``, min_items ``0`` and
    url ``""``. ``max_item_age_days`` is converted with ``int`` only when
    it is present and not ``None``. When ``failure_policy`` is missing or
    falsy it becomes ``"block"`` for required sources and ``"warn"``
    otherwise.
    """
    is_required = bool(value.get("required", False))
    raw_max_age = value.get("max_item_age_days")
    # Policy used only when the mapping carries no explicit failure_policy.
    fallback_policy = "block" if is_required else "warn"
    return SourceConfig(
        name=value["name"],
        type=value["type"],
        role=value.get("role", "supplement"),
        priority=int(value.get("priority", 100)),
        required=is_required,
        enabled=bool(value.get("enabled", True)),
        timeout_seconds=int(value.get("timeout_seconds", 25)),
        retries=int(value.get("retries", 0)),
        min_items=int(value.get("min_items", 0)),
        url=value.get("url", ""),
        max_item_age_days=None if raw_max_age is None else int(raw_max_age),
        failure_policy=str(value.get("failure_policy") or fallback_policy),
    )
|
|
|
|
|
|
def run_stage0_to_stage2(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
) -> dict[str, Any]:
    """Run source collection, normalization and hard de-duplication.

    Args:
        source_configs: Source definitions; plain dicts are converted with
            ``_source_config_from_dict``, ``SourceConfig`` instances are
            used as they are.
        run_date: Passed through to ``collect_sources`` and
            ``normalize_items``.
        fetcher: Passed to ``collect_sources``.

    Returns:
        A dict with ``source_results`` (collection output), ``items``
        (the hard-deduplicated items), ``reports`` (keyed ``stage0``,
        ``stage1``, ``stage2``) and ``artifacts`` (the intermediate
        outputs of each of the three stages).
    """
    resolved_configs: list[SourceConfig] = []
    for entry in source_configs:
        if isinstance(entry, SourceConfig):
            resolved_configs.append(entry)
        else:
            resolved_configs.append(_source_config_from_dict(entry))

    collected, collect_report = collect_sources(
        resolved_configs, run_date, fetcher=fetcher
    )

    # Normalization receives each source's priority, keyed by source name.
    priority_by_source = {cfg.name: cfg.priority for cfg in resolved_configs}
    normalized, normalize_report = normalize_items(
        collected,
        run_date=run_date,
        source_priorities=priority_by_source,
    )

    unique_items, dedup_report = hard_dedup_items(normalized)

    return {
        "source_results": collected,
        "items": unique_items,
        "reports": {
            "stage0": collect_report,
            "stage1": normalize_report,
            "stage2": dedup_report,
        },
        "artifacts": {
            "stage0_sources": collected,
            "stage1_items": normalized,
            "stage2_items": unique_items,
        },
    }
|
|
|
|
|
|
def run_stage0_to_stage2_5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
    """Run stages 0-2 and then the optional cross-day de-duplication.

    When ``cross_day_dedup_enabled`` is truthy the stage 2 items are passed
    to ``cross_day_dedup_items`` together with ``published_urls``,
    ``run_date`` and ``cross_day_dedup_max_age_days``. Otherwise the items
    pass through unchanged and a report describing a disabled no-op stage
    is synthesized.

    Returns:
        The stage 2 result shape with ``items`` replaced by the stage 2.5
        output, a ``stage2_5`` entry added to ``reports`` and a
        ``stage2_5_items`` entry added to ``artifacts``.
    """
    upstream = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
    stage2_items = upstream["items"]

    if not cross_day_dedup_enabled:
        items = stage2_items
        passthrough_count = len(items)
        stage2_5_report = {
            "input_count": passthrough_count,
            "output_count": passthrough_count,
            "removed_count": 0,
            "removed": [],
            "enabled": False,
            "max_age_days": cross_day_dedup_max_age_days,
        }
    else:
        items, stage2_5_report = cross_day_dedup_items(
            stage2_items,
            published_urls,
            run_date=run_date,
            max_age_days=cross_day_dedup_max_age_days,
        )
        # Only fill in the flag if the dedup step did not report it itself.
        stage2_5_report.setdefault("enabled", cross_day_dedup_enabled)

    # Shallow copies so the stage 2 result's own dicts are not modified.
    reports = dict(upstream["reports"])
    reports["stage2_5"] = stage2_5_report
    artifacts = dict(upstream.get("artifacts", {}))
    artifacts["stage2_5_items"] = items

    return {
        "source_results": upstream["source_results"],
        "items": items,
        "reports": reports,
        "artifacts": artifacts,
    }
|
|
|
|
|
|
def run_stage0_to_stage4(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-2.5, then candidate recall, semantic dedup and rewrite.

    The ``possible_duplicates`` entries of the stage 2 report seed the
    semantic candidates, restricted to those whose ``item_ids`` all still
    exist after stage 2.5. ``recall_semantic_candidates`` then extends that
    list, ``semantic_dedup_items`` de-duplicates with ``semantic_llm_call``
    and ``rewrite_items`` rewrites the survivors with ``rewrite_llm_call``.

    ``quality_gate_config`` is accepted but not read by this function.

    Returns:
        The stage 2.5 result shape with ``items`` replaced by the rewritten
        items, ``stage2_8``/``stage3``/``stage4`` added to ``reports`` and
        ``stage2_8_candidates``/``stage3_items``/``stage4_items`` added to
        ``artifacts``.
    """
    upstream = run_stage0_to_stage2_5(
        source_configs,
        run_date,
        fetcher=fetcher,
        published_urls=published_urls,
        cross_day_dedup_enabled=cross_day_dedup_enabled,
        cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
    )
    items = upstream["items"]

    # Drop stage 2 duplicate hints that mention an item removed since then.
    surviving_ids = {item.id for item in items}
    stage2_hints = upstream["reports"]["stage2"].get("possible_duplicates", [])
    seed_candidates = []
    for hint in stage2_hints:
        if set(hint.get("item_ids", [])) <= surviving_ids:
            seed_candidates.append(hint)

    candidates, stage2_8_report = recall_semantic_candidates(
        items,
        existing_candidates=seed_candidates,
        config=semantic_candidate_recall_config,
    )
    semantic_items, stage3_report = semantic_dedup_items(
        items,
        candidates,
        llm_call=semantic_llm_call,
        max_deletion_ratio=semantic_dedup_max_deletion_ratio,
    )
    rewritten_items, stage4_report = rewrite_items(
        semantic_items,
        llm_call=rewrite_llm_call,
        batch_size=rewrite_batch_size,
    )

    reports = dict(upstream["reports"])
    reports.update(
        stage2_8=stage2_8_report,
        stage3=stage3_report,
        stage4=stage4_report,
    )
    artifacts = dict(upstream.get("artifacts", {}))
    artifacts.update(
        stage2_8_candidates=candidates,
        stage3_items=semantic_items,
        stage4_items=rewritten_items,
    )

    return {
        "source_results": upstream["source_results"],
        "items": rewritten_items,
        "reports": reports,
        "artifacts": artifacts,
    }
|
|
|
|
|
|
def run_stage0_to_stage5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-4 and then classify and order the rewritten items.

    All options except ``quality_gate_config`` are forwarded unchanged to
    ``run_stage0_to_stage4``; ``quality_gate_config`` is accepted but not
    read by this function.

    Returns:
        The stage 4 result shape with ``items`` replaced by the output of
        ``classify_and_order_items`` and a ``stage5`` entry added to
        ``reports``. ``artifacts`` is the stage 4 artifacts mapping itself.
    """
    upstream_options = {
        "fetcher": fetcher,
        "semantic_llm_call": semantic_llm_call,
        "rewrite_llm_call": rewrite_llm_call,
        "published_urls": published_urls,
        "cross_day_dedup_enabled": cross_day_dedup_enabled,
        "cross_day_dedup_max_age_days": cross_day_dedup_max_age_days,
        "semantic_dedup_max_deletion_ratio": semantic_dedup_max_deletion_ratio,
        "rewrite_batch_size": rewrite_batch_size,
        "semantic_candidate_recall_config": semantic_candidate_recall_config,
    }
    upstream = run_stage0_to_stage4(source_configs, run_date, **upstream_options)

    ordered_items, stage5_report = classify_and_order_items(upstream["items"])

    # Copy before adding the new entry so the stage 4 reports stay untouched.
    reports = dict(upstream["reports"])
    reports["stage5"] = stage5_report

    return {
        "source_results": upstream["source_results"],
        "items": ordered_items,
        "reports": reports,
        "artifacts": upstream.get("artifacts", {}),
    }
|
|
|
|
|
|
def run_stage0_to_stage6(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-5 and then generate the guide for the classified items.

    Every option other than ``guide_llm_call`` is forwarded unchanged to
    ``run_stage0_to_stage5``; ``guide_llm_call`` is handed to
    ``generate_guide``.

    Returns:
        The stage 5 result shape (``items`` unchanged) plus a ``guide``
        entry and a ``stage6`` entry in ``reports``. ``artifacts`` is the
        stage 5 artifacts mapping itself.
    """
    upstream_options = {
        "fetcher": fetcher,
        "semantic_llm_call": semantic_llm_call,
        "rewrite_llm_call": rewrite_llm_call,
        "published_urls": published_urls,
        "cross_day_dedup_enabled": cross_day_dedup_enabled,
        "cross_day_dedup_max_age_days": cross_day_dedup_max_age_days,
        "semantic_dedup_max_deletion_ratio": semantic_dedup_max_deletion_ratio,
        "rewrite_batch_size": rewrite_batch_size,
        "semantic_candidate_recall_config": semantic_candidate_recall_config,
    }
    upstream = run_stage0_to_stage5(source_configs, run_date, **upstream_options)
    classified_items = upstream["items"]

    guide, stage6_report = generate_guide(classified_items, llm_call=guide_llm_call)

    # Copy before adding the new entry so the stage 5 reports stay untouched.
    reports = dict(upstream["reports"])
    reports["stage6"] = stage6_report

    return {
        "source_results": upstream["source_results"],
        "items": classified_items,
        "guide": guide,
        "reports": reports,
        "artifacts": upstream.get("artifacts", {}),
    }
|
|
|
|
|
|
def run_stage0_to_stage7(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-6, assemble the markdown and evaluate the quality gate.

    The stage 7 report returned by ``assemble_markdown`` has its
    ``blocking_errors`` extended twice: first with the stringified
    ``blocking_errors`` of the stage 3-6 reports, then with any
    ``blocking_errors`` reported by ``evaluate_quality_gate``.

    Returns:
        The stage 6 result shape plus ``markdown``, with ``quality_gate``
        and ``stage7`` added to ``reports`` and ``quality_gate`` added to
        ``artifacts``.
    """
    upstream_options = {
        "fetcher": fetcher,
        "semantic_llm_call": semantic_llm_call,
        "rewrite_llm_call": rewrite_llm_call,
        "guide_llm_call": guide_llm_call,
        "published_urls": published_urls,
        "cross_day_dedup_enabled": cross_day_dedup_enabled,
        "cross_day_dedup_max_age_days": cross_day_dedup_max_age_days,
        "semantic_dedup_max_deletion_ratio": semantic_dedup_max_deletion_ratio,
        "rewrite_batch_size": rewrite_batch_size,
        "semantic_candidate_recall_config": semantic_candidate_recall_config,
    }
    upstream = run_stage0_to_stage6(source_configs, run_date, **upstream_options)
    upstream_reports = upstream["reports"]

    markdown, stage7_report = assemble_markdown(upstream["items"], upstream["guide"])

    # Carry blocking errors raised by earlier stages into the stage 7 report.
    upstream_blocking_errors: list[str] = [
        str(error)
        for stage_name in ("stage3", "stage4", "stage5", "stage6")
        for error in (upstream_reports.get(stage_name, {}).get("blocking_errors", []) or [])
    ]
    if upstream_blocking_errors:
        already_blocking = list(stage7_report.get("blocking_errors", []) or [])
        stage7_report["blocking_errors"] = already_blocking + upstream_blocking_errors

    # The gate is evaluated on a copy that does not yet hold stage7/quality_gate.
    reports = dict(upstream_reports)
    quality_gate_report = evaluate_quality_gate(
        upstream["items"],
        source_results=upstream["source_results"],
        reports=reports,
        config=quality_gate_config,
    )
    gate_errors = quality_gate_report.get("blocking_errors")
    if gate_errors:
        already_blocking = list(stage7_report.get("blocking_errors", []) or [])
        stage7_report["blocking_errors"] = already_blocking + list(gate_errors)

    reports["quality_gate"] = quality_gate_report
    reports["stage7"] = stage7_report

    artifacts = dict(upstream.get("artifacts", {}))
    artifacts["quality_gate"] = quality_gate_report

    return {
        "source_results": upstream["source_results"],
        "items": upstream["items"],
        "guide": upstream["guide"],
        "markdown": markdown,
        "reports": reports,
        "artifacts": artifacts,
    }
|
|
|
|
|
|
def run_stage0_to_stage8(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    mode: str,
    base_url: str,
    client: BlogClient | None,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
    semantic_dedup_max_deletion_ratio: float = 0.5,
    rewrite_batch_size: int = 30,
    semantic_candidate_recall_config: dict[str, Any] | None = None,
    quality_gate_config: dict[str, Any] | None = None,
    publish_idempotency_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run stages 0-7 and hand the assembled markdown to ``publish_markdown``.

    The requested ``mode`` is overridden when the quality gate report lists
    ``required_source_failures`` and its ``required_source_failure_policy``
    is ``"draft"`` (publish mode becomes ``"draft"``) or ``"dry_run"``
    (publish mode becomes ``"dry-run"``). A missing or falsy policy is
    treated as ``"block"``, which leaves the mode as requested.

    Returns:
        The stage 7 result shape plus ``publish`` (the object returned by
        ``publish_markdown``) and a ``stage8`` entry in ``reports`` that
        records the requested mode next to the mode, status, slug, blog
        URL, public check flag and error taken from the publish result.
    """
    upstream_options = {
        "fetcher": fetcher,
        "semantic_llm_call": semantic_llm_call,
        "rewrite_llm_call": rewrite_llm_call,
        "guide_llm_call": guide_llm_call,
        "published_urls": published_urls,
        "cross_day_dedup_enabled": cross_day_dedup_enabled,
        "cross_day_dedup_max_age_days": cross_day_dedup_max_age_days,
        "semantic_dedup_max_deletion_ratio": semantic_dedup_max_deletion_ratio,
        "rewrite_batch_size": rewrite_batch_size,
        "semantic_candidate_recall_config": semantic_candidate_recall_config,
        "quality_gate_config": quality_gate_config,
    }
    upstream = run_stage0_to_stage7(source_configs, run_date, **upstream_options)

    slug = f"ai-{run_date}"

    # Policy value -> publish mode it forces when a required source failed.
    forced_mode_by_policy = {"draft": "draft", "dry_run": "dry-run"}
    gate_report = upstream["reports"].get("quality_gate", {}) or {}
    required_policy = str(gate_report.get("required_source_failure_policy") or "block")
    effective_mode = mode
    if gate_report.get("required_source_failures") and required_policy in forced_mode_by_policy:
        effective_mode = forced_mode_by_policy[required_policy]

    publish_result = publish_markdown(
        title=f"AI日报 · {run_date}",
        markdown=upstream["markdown"],
        tags=["AI日报", "AI资讯", "人工智能"],
        slug=slug,
        base_url=base_url,
        mode=effective_mode,
        markdown_report=upstream["reports"]["stage7"],
        client=client,
        idempotency_config=publish_idempotency_config,
    )

    stage8_report = {
        "requested_mode": mode,
        "mode": publish_result.mode,
        "status": publish_result.status,
        "slug": publish_result.slug,
        "blog_url": publish_result.blog_url,
        "public_ok": publish_result.public_ok,
        "error": publish_result.error,
    }
    reports = dict(upstream["reports"])
    reports["stage8"] = stage8_report

    return {
        "source_results": upstream["source_results"],
        "items": upstream["items"],
        "guide": upstream["guide"],
        "markdown": upstream["markdown"],
        "publish": publish_result,
        "reports": reports,
        "artifacts": upstream.get("artifacts", {}),
    }
|