99 lines
3.6 KiB
Python
99 lines
3.6 KiB
Python
from __future__ import annotations
|
|
|
|
import difflib
|
|
from typing import Any
|
|
|
|
from .dedupe import _title_tokens
|
|
from .models import NewsItem, SourceResult
|
|
|
|
|
|
# Baseline quality-gate settings.  A caller-supplied config is layered on top
# of these values, so every key listed here is always present after merging.
DEFAULT_CONFIG: dict[str, Any] = {
    # How a failed required source is treated: block | draft | dry_run | warn
    "required_source_failure_policy": "block",
    # Must be truthy (together with the "block" policy above) for a failed
    # required source to be recorded as a blocking error.
    "block_on_required_source_failure": True,
    # Emit one warning per enabled source that did not succeed.
    "warn_on_enabled_source_failure": True,
    # Warn when stage 3 reports zero candidate groups although more than this
    # many items were supplied.
    "warn_when_stage3_candidates_zero_min_items": 30,
    # Title similarity ratio at or above which a pair of final items is
    # flagged; a value <= 0 switches the check off.
    "warn_on_final_title_similarity": 0.55,
    # NOTE(review): not read anywhere in the visible part of this module;
    # presumably consumed by an entity-frequency check elsewhere - confirm.
    "warn_on_entity_frequency": 3,
    # Source names whose failure is treated as a required-source failure.
    "required_sources": [],
}
|
|
|
|
|
|
def _config(config: dict[str, Any] | None) -> dict[str, Any]:
    """Return the effective settings: defaults overlaid with *config*.

    Args:
        config: Caller overrides; ``None`` or an empty mapping means
            "use the defaults only".

    Returns:
        A new dict.  The merge is shallow, so nested values such as the
        ``required_sources`` list are shared with ``DEFAULT_CONFIG`` unless
        the caller overrides them.
    """
    if not config:
        # Nothing to overlay; still hand back a fresh dict, never the
        # module-level constant itself.
        return dict(DEFAULT_CONFIG)
    return {**DEFAULT_CONFIG, **config}
|
|
|
|
|
|
def _source_failures(source_results: list[SourceResult]) -> list[dict[str, Any]]:
|
|
failures: list[dict[str, Any]] = []
|
|
for result in source_results:
|
|
if result.ok or result.status == "disabled":
|
|
continue
|
|
failures.append(
|
|
{
|
|
"source": result.source,
|
|
"role": result.role,
|
|
"status": result.status,
|
|
"error": result.error,
|
|
}
|
|
)
|
|
return failures
|
|
|
|
|
|
def _similar_title_warnings(items: list[NewsItem], threshold: float) -> list[str]:
    """Flag pairs of items whose titles are suspiciously alike.

    Each item's title (``title``, falling back to ``title_raw``) is compared
    case-insensitively with the title of every later item using
    ``difflib.SequenceMatcher``.  Items without a title, or whose title yields
    fewer than two tokens from ``_title_tokens``, take no part in any
    comparison.

    Args:
        items: Items to compare pairwise; each must expose ``id``, ``title``
            and ``title_raw``.
        threshold: Inclusive similarity ratio at or above which a pair is
            reported.

    Returns:
        One ``final_title_similarity:<left id>:<right id>:<ratio>`` string per
        matching pair, ordered by the position of the left and then the right
        item in *items*; the ratio is formatted with three decimals.
    """
    # Tokenising and lower-casing depend on a single item only, so do them
    # once per item here rather than once per pair in the quadratic loop.
    comparable: list[tuple[NewsItem, str]] = []
    for item in items:
        title = item.title or item.title_raw
        # A missing title cannot be compared; skipping it also avoids calling
        # ``.lower()`` on ``None``.
        if not title or len(_title_tokens(title)) < 2:
            continue
        comparable.append((item, title.lower()))

    warnings: list[str] = []
    for index, (left, left_title) in enumerate(comparable):
        for right, right_title in comparable[index + 1 :]:
            # Argument order (left, right) is kept because the ratio is not
            # guaranteed to be symmetric.
            ratio = difflib.SequenceMatcher(None, left_title, right_title).ratio()
            if ratio >= threshold:
                warnings.append(f"final_title_similarity:{left.id}:{right.id}:{ratio:.3f}")
    return warnings
|
|
|
|
|
|
def evaluate_quality_gate(
    items: list[NewsItem],
    *,
    source_results: list[SourceResult],
    reports: dict[str, Any],
    config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run the quality checks over *items* and summarise the outcome.

    Args:
        items: Items to be checked.
        source_results: Per-source results used to detect failed sources.
        reports: Stage reports; only ``reports["stage3"]`` and its
            ``candidate_group_count`` entry are read here.
        config: Optional overrides layered over ``DEFAULT_CONFIG`` by
            ``_config``.

    Returns:
        A dict with the keys ``input_count``, ``warnings``,
        ``blocking_errors``, ``source_failures``, ``required_source_failures``,
        ``required_source_failure_policy`` and ``quality_gate_failed``.  The
        last is true exactly when at least one blocking error was recorded.
    """
    settings = _config(config)
    warnings: list[str] = []
    blocking_errors: list[str] = []

    # Stage 3 produced no candidate groups despite a sizeable input.
    stage3 = reports.get("stage3", {}) or {}
    item_floor = int(settings["warn_when_stage3_candidates_zero_min_items"])
    if len(items) > item_floor:
        if int(stage3.get("candidate_group_count", 0)) == 0:
            warnings.append("stage3_candidates_zero")

    # Any enabled source that failed is worth a warning (if configured).
    failures = _source_failures(source_results)
    if settings["warn_on_enabled_source_failure"]:
        warnings.extend(
            f"enabled_source_failed:{entry['source']}:{entry['status']}" for entry in failures
        )

    # Required sources: a failure blocks only when blocking is enabled AND
    # the policy is "block"; otherwise it is downgraded to a warning that
    # carries the policy name.
    required_names = set(settings.get("required_sources") or [])
    required_failures = [entry for entry in failures if entry["source"] in required_names]
    policy = str(settings.get("required_source_failure_policy") or "block")
    should_block = bool(settings["block_on_required_source_failure"]) and policy == "block"
    if should_block:
        blocking_errors.extend(
            f"required_source_failed:{entry['source']}:{entry['status']}"
            for entry in required_failures
        )
    else:
        warnings.extend(
            f"required_source_failed:{entry['source']}:{entry['status']}:{policy}"
            for entry in required_failures
        )

    # Near-duplicate titles; a non-positive threshold disables the check.
    similarity_floor = float(settings["warn_on_final_title_similarity"])
    if similarity_floor > 0:
        warnings += _similar_title_warnings(items, similarity_floor)

    return {
        "input_count": len(items),
        "warnings": warnings,
        "blocking_errors": blocking_errors,
        "source_failures": failures,
        "required_source_failures": required_failures,
        "required_source_failure_policy": policy,
        "quality_gate_failed": bool(blocking_errors),
    }
|