Add Stage 2.8 recall, quality gate, retries, and publish idempotency
This commit is contained in:
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
from typing import Any
|
||||
|
||||
from .assemble import assemble_markdown
|
||||
from .candidate_recall import recall_semantic_candidates
|
||||
from .classify import classify_and_order_items
|
||||
from .collect import Fetcher, collect_sources
|
||||
from .dedupe import cross_day_dedup_items, hard_dedup_items
|
||||
@@ -10,6 +11,7 @@ from .guide import GuideLlmCall, generate_guide
|
||||
from .models import PublishedUrls, SourceConfig
|
||||
from .normalize import normalize_items
|
||||
from .publish import BlogClient, publish_markdown
|
||||
from .quality_gate import evaluate_quality_gate
|
||||
from .rewrite import RewriteLlmCall, rewrite_items
|
||||
from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
|
||||
|
||||
@@ -49,6 +51,11 @@ def run_stage0_to_stage2(
|
||||
source_priorities=source_priorities,
|
||||
)
|
||||
deduped_items, stage2_report = hard_dedup_items(normalized_items)
|
||||
artifacts = {
|
||||
"stage0_sources": source_results,
|
||||
"stage1_items": normalized_items,
|
||||
"stage2_items": deduped_items,
|
||||
}
|
||||
return {
|
||||
"source_results": source_results,
|
||||
"items": deduped_items,
|
||||
@@ -57,6 +64,7 @@ def run_stage0_to_stage2(
|
||||
"stage1": stage1_report,
|
||||
"stage2": stage2_report,
|
||||
},
|
||||
"artifacts": artifacts,
|
||||
}
|
||||
|
||||
|
||||
@@ -90,10 +98,13 @@ def run_stage0_to_stage2_5(
|
||||
reports = dict(stage2_result["reports"])
|
||||
stage2_5_report.setdefault("enabled", cross_day_dedup_enabled)
|
||||
reports["stage2_5"] = stage2_5_report
|
||||
artifacts = dict(stage2_result.get("artifacts", {}))
|
||||
artifacts["stage2_5_items"] = items
|
||||
return {
|
||||
"source_results": stage2_result["source_results"],
|
||||
"items": items,
|
||||
"reports": reports,
|
||||
"artifacts": artifacts,
|
||||
}
|
||||
|
||||
|
||||
@@ -107,6 +118,10 @@ def run_stage0_to_stage4(
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
semantic_dedup_max_deletion_ratio: float = 0.5,
|
||||
rewrite_batch_size: int = 30,
|
||||
semantic_candidate_recall_config: dict[str, Any] | None = None,
|
||||
quality_gate_config: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
stage2_5_result = run_stage0_to_stage2_5(
|
||||
source_configs,
|
||||
@@ -123,22 +138,35 @@ def run_stage0_to_stage4(
|
||||
for candidate in stage2_5_result["reports"]["stage2"].get("possible_duplicates", [])
|
||||
if set(candidate.get("item_ids", [])).issubset(remaining_ids)
|
||||
]
|
||||
candidates, stage2_8_report = recall_semantic_candidates(
|
||||
items,
|
||||
existing_candidates=candidates,
|
||||
config=semantic_candidate_recall_config,
|
||||
)
|
||||
semantic_items, stage3_report = semantic_dedup_items(
|
||||
items,
|
||||
candidates,
|
||||
llm_call=semantic_llm_call,
|
||||
max_deletion_ratio=semantic_dedup_max_deletion_ratio,
|
||||
)
|
||||
rewritten_items, stage4_report = rewrite_items(
|
||||
semantic_items,
|
||||
llm_call=rewrite_llm_call,
|
||||
batch_size=rewrite_batch_size,
|
||||
)
|
||||
reports = dict(stage2_5_result["reports"])
|
||||
reports["stage2_8"] = stage2_8_report
|
||||
reports["stage3"] = stage3_report
|
||||
reports["stage4"] = stage4_report
|
||||
artifacts = dict(stage2_5_result.get("artifacts", {}))
|
||||
artifacts["stage2_8_candidates"] = candidates
|
||||
artifacts["stage3_items"] = semantic_items
|
||||
artifacts["stage4_items"] = rewritten_items
|
||||
return {
|
||||
"source_results": stage2_5_result["source_results"],
|
||||
"items": rewritten_items,
|
||||
"reports": reports,
|
||||
"artifacts": artifacts,
|
||||
}
|
||||
|
||||
|
||||
@@ -152,6 +180,10 @@ def run_stage0_to_stage5(
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
semantic_dedup_max_deletion_ratio: float = 0.5,
|
||||
rewrite_batch_size: int = 30,
|
||||
semantic_candidate_recall_config: dict[str, Any] | None = None,
|
||||
quality_gate_config: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
stage4_result = run_stage0_to_stage4(
|
||||
source_configs,
|
||||
@@ -162,6 +194,9 @@ def run_stage0_to_stage5(
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
|
||||
rewrite_batch_size=rewrite_batch_size,
|
||||
semantic_candidate_recall_config=semantic_candidate_recall_config,
|
||||
)
|
||||
classified_items, stage5_report = classify_and_order_items(stage4_result["items"])
|
||||
reports = dict(stage4_result["reports"])
|
||||
@@ -170,6 +205,7 @@ def run_stage0_to_stage5(
|
||||
"source_results": stage4_result["source_results"],
|
||||
"items": classified_items,
|
||||
"reports": reports,
|
||||
"artifacts": stage4_result.get("artifacts", {}),
|
||||
}
|
||||
|
||||
|
||||
@@ -184,6 +220,9 @@ def run_stage0_to_stage6(
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
semantic_dedup_max_deletion_ratio: float = 0.5,
|
||||
rewrite_batch_size: int = 30,
|
||||
semantic_candidate_recall_config: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
stage5_result = run_stage0_to_stage5(
|
||||
source_configs,
|
||||
@@ -194,6 +233,9 @@ def run_stage0_to_stage6(
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
|
||||
rewrite_batch_size=rewrite_batch_size,
|
||||
semantic_candidate_recall_config=semantic_candidate_recall_config,
|
||||
)
|
||||
guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call)
|
||||
reports = dict(stage5_result["reports"])
|
||||
@@ -203,6 +245,7 @@ def run_stage0_to_stage6(
|
||||
"items": stage5_result["items"],
|
||||
"guide": guide,
|
||||
"reports": reports,
|
||||
"artifacts": stage5_result.get("artifacts", {}),
|
||||
}
|
||||
|
||||
|
||||
@@ -217,6 +260,10 @@ def run_stage0_to_stage7(
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
semantic_dedup_max_deletion_ratio: float = 0.5,
|
||||
rewrite_batch_size: int = 30,
|
||||
semantic_candidate_recall_config: dict[str, Any] | None = None,
|
||||
quality_gate_config: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
stage6_result = run_stage0_to_stage6(
|
||||
source_configs,
|
||||
@@ -228,6 +275,9 @@ def run_stage0_to_stage7(
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
|
||||
rewrite_batch_size=rewrite_batch_size,
|
||||
semantic_candidate_recall_config=semantic_candidate_recall_config,
|
||||
)
|
||||
markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"])
|
||||
upstream_blocking_errors: list[str] = []
|
||||
@@ -238,13 +288,26 @@ def run_stage0_to_stage7(
|
||||
existing_errors = list(stage7_report.get("blocking_errors", []) or [])
|
||||
stage7_report["blocking_errors"] = existing_errors + upstream_blocking_errors
|
||||
reports = dict(stage6_result["reports"])
|
||||
quality_gate_report = evaluate_quality_gate(
|
||||
stage6_result["items"],
|
||||
source_results=stage6_result["source_results"],
|
||||
reports=reports,
|
||||
config=quality_gate_config,
|
||||
)
|
||||
if quality_gate_report.get("blocking_errors"):
|
||||
existing_errors = list(stage7_report.get("blocking_errors", []) or [])
|
||||
stage7_report["blocking_errors"] = existing_errors + list(quality_gate_report["blocking_errors"])
|
||||
reports["quality_gate"] = quality_gate_report
|
||||
reports["stage7"] = stage7_report
|
||||
artifacts = dict(stage6_result.get("artifacts", {}))
|
||||
artifacts["quality_gate"] = quality_gate_report
|
||||
return {
|
||||
"source_results": stage6_result["source_results"],
|
||||
"items": stage6_result["items"],
|
||||
"guide": stage6_result["guide"],
|
||||
"markdown": markdown,
|
||||
"reports": reports,
|
||||
"artifacts": artifacts,
|
||||
}
|
||||
|
||||
|
||||
@@ -262,6 +325,11 @@ def run_stage0_to_stage8(
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
semantic_dedup_max_deletion_ratio: float = 0.5,
|
||||
rewrite_batch_size: int = 30,
|
||||
semantic_candidate_recall_config: dict[str, Any] | None = None,
|
||||
quality_gate_config: dict[str, Any] | None = None,
|
||||
publish_idempotency_config: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
stage7_result = run_stage0_to_stage7(
|
||||
source_configs,
|
||||
@@ -273,6 +341,10 @@ def run_stage0_to_stage8(
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
|
||||
rewrite_batch_size=rewrite_batch_size,
|
||||
semantic_candidate_recall_config=semantic_candidate_recall_config,
|
||||
quality_gate_config=quality_gate_config,
|
||||
)
|
||||
slug = f"ai-{run_date}"
|
||||
publish_result = publish_markdown(
|
||||
@@ -284,6 +356,7 @@ def run_stage0_to_stage8(
|
||||
mode=mode,
|
||||
markdown_report=stage7_result["reports"]["stage7"],
|
||||
client=client,
|
||||
idempotency_config=publish_idempotency_config,
|
||||
)
|
||||
reports = dict(stage7_result["reports"])
|
||||
reports["stage8"] = {
|
||||
@@ -301,4 +374,5 @@ def run_stage0_to_stage8(
|
||||
"markdown": stage7_result["markdown"],
|
||||
"publish": publish_result,
|
||||
"reports": reports,
|
||||
"artifacts": stage7_result.get("artifacts", {}),
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user