Add Stage 2.8 recall, quality gate, retries, and publish idempotency

This commit is contained in:
Mimikko-zeus
2026-06-10 21:31:13 +08:00
parent 07786e3bc0
commit b46cef2c7b
16 changed files with 1253 additions and 6 deletions

View File

@@ -0,0 +1,91 @@
from __future__ import annotations
import difflib
from typing import Any
from .dedupe import _title_tokens
from .models import NewsItem, SourceResult
# Baseline quality-gate settings; _config() layers caller overrides on top.
DEFAULT_CONFIG: dict[str, Any] = {
    # Record a blocking error when a source named in "required_sources" fails.
    "block_on_required_source_failure": True,
    # Record a warning for every failed source that is not disabled.
    "warn_on_enabled_source_failure": True,
    # Warn when the item count exceeds this value while the stage3 report
    # shows zero candidate groups.
    "warn_when_stage3_candidates_zero_min_items": 30,
    # Similarity ratio at or above which two final titles trigger a warning;
    # a value <= 0 switches the check off.
    "warn_on_final_title_similarity": 0.55,
    # NOTE(review): not read by any code visible in this module; presumably a
    # repeated-entity threshold. Confirm where it is consumed.
    "warn_on_entity_frequency": 3,
    # Source names whose failure is treated as blocking.
    "required_sources": [],
}
def _config(config: dict[str, Any] | None) -> dict[str, Any]:
    """Return the effective settings: ``DEFAULT_CONFIG`` overlaid with *config*.

    A ``None`` or empty *config* yields a fresh copy of the defaults. The
    merge is shallow, so mutable default values (for example the
    ``required_sources`` list) are shared with ``DEFAULT_CONFIG``, not copied.
    """
    effective = dict(DEFAULT_CONFIG)
    if config:
        effective.update(config)
    return effective
def _source_failures(source_results: list[SourceResult]) -> list[dict[str, Any]]:
    """Summarise every source result that counts as a failure.

    A result is kept when its ``ok`` attribute is falsy and its ``status`` is
    not ``"disabled"``. Each kept result becomes a dict holding its
    ``source``, ``role``, ``status`` and ``error`` attributes, in input order.
    """
    return [
        {
            "source": entry.source,
            "role": entry.role,
            "status": entry.status,
            "error": entry.error,
        }
        for entry in source_results
        if not entry.ok and entry.status != "disabled"
    ]
def _similar_title_warnings(items: list[NewsItem], threshold: float) -> list[str]:
    """Return a warning for every pair of items whose titles look alike.

    Each item's title is ``item.title``, falling back to ``item.title_raw``
    when the former is falsy. Items whose title yields fewer than two tokens
    from ``_title_tokens`` take part in no comparison. Every remaining pair
    (in input order, each pair once) is compared case-insensitively with
    ``difflib.SequenceMatcher``; a ratio at or above *threshold* produces
    ``"final_title_similarity:<left id>:<right id>:<ratio>"`` with the ratio
    formatted to three decimals.

    Args:
        items: Items to compare pairwise.
        threshold: Minimum similarity ratio that triggers a warning.

    Returns:
        The warning strings, ordered by pair position in *items*.
    """
    # Fewer than two items means there is no pair to compare.
    if len(items) < 2:
        return []

    # Tokenise and lower-case each title once. Doing this inside the pair
    # loop repeats the same work for every partner an item is compared with.
    prepared: list[tuple[Any, str]] = []
    for item in items:
        title = item.title or item.title_raw
        if len(_title_tokens(title)) >= 2:
            prepared.append((item.id, title.lower()))

    warnings: list[str] = []
    for index, (left_id, left_title) in enumerate(prepared):
        for right_id, right_title in prepared[index + 1 :]:
            # Keep the earlier item as the first sequence: ratio() is not
            # guaranteed to be symmetric in its two arguments.
            ratio = difflib.SequenceMatcher(None, left_title, right_title).ratio()
            if ratio >= threshold:
                warnings.append(f"final_title_similarity:{left_id}:{right_id}:{ratio:.3f}")
    return warnings
def evaluate_quality_gate(
    items: list[NewsItem],
    *,
    source_results: list[SourceResult],
    reports: dict[str, Any],
    config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run the quality-gate checks and report warnings and blocking errors.

    Checks performed, in order:

    * ``stage3_candidates_zero`` warning when there are more items than
      ``warn_when_stage3_candidates_zero_min_items`` and the ``stage3`` report
      has a ``candidate_group_count`` of zero (or none at all).
    * One ``enabled_source_failed:<source>:<status>`` warning per failed
      source, when ``warn_on_enabled_source_failure`` is truthy.
    * One ``required_source_failed:<source>:<status>`` blocking error per
      failed source listed in ``required_sources``, when
      ``block_on_required_source_failure`` is truthy.
    * Title-similarity warnings, when ``warn_on_final_title_similarity`` is
      greater than zero.

    Args:
        items: Final items being evaluated.
        source_results: Per-source fetch results used to detect failures.
        reports: Stage reports; only the ``"stage3"`` entry is read here.
        config: Optional overrides merged over ``DEFAULT_CONFIG``.

    Returns:
        A dict with ``input_count``, ``warnings``, ``blocking_errors``,
        ``source_failures`` and ``quality_gate_failed`` (true when any
        blocking error was recorded).
    """
    settings = _config(config)
    warnings: list[str] = []
    blocking_errors: list[str] = []

    # A missing or falsy "stage3" entry is treated as an empty report.
    stage3 = reports.get("stage3", {}) or {}
    item_floor = int(settings["warn_when_stage3_candidates_zero_min_items"])
    group_count_is_zero = int(stage3.get("candidate_group_count", 0)) == 0 if len(items) > item_floor else False
    if group_count_is_zero:
        warnings.append("stage3_candidates_zero")

    failures = _source_failures(source_results)
    if settings["warn_on_enabled_source_failure"]:
        warnings.extend(
            f"enabled_source_failed:{failed['source']}:{failed['status']}"
            for failed in failures
        )

    required = set(settings.get("required_sources") or [])
    if settings["block_on_required_source_failure"]:
        blocking_errors.extend(
            f"required_source_failed:{failed['source']}:{failed['status']}"
            for failed in failures
            if failed["source"] in required
        )

    similarity_cutoff = float(settings["warn_on_final_title_similarity"])
    if similarity_cutoff > 0:
        warnings.extend(_similar_title_warnings(items, similarity_cutoff))

    return {
        "input_count": len(items),
        "warnings": warnings,
        "blocking_errors": blocking_errors,
        "source_failures": failures,
        "quality_gate_failed": bool(blocking_errors),
    }