Add Stage 2.8 recall, quality gate, retries, and publish idempotency

2026-06-10 21:31:13 +08:00
parent 07786e3bc0
commit b46cef2c7b
16 changed files with 1253 additions and 6 deletions
--- a/ai_daily_report/candidate_recall.py
+++ b/ai_daily_report/candidate_recall.py
@@ -0,0 +1,162 @@
+from __future__ import annotations
+
+import difflib
+import re
+from collections import defaultdict
+from typing import Any
+
+from .dedupe import _jaccard_similarity, _title_tokens
+from .models import NewsItem
+
+
+DEFAULT_CONFIG = {  # fallback settings used for any key the caller's config omits
+    "enabled": True,  # when falsy, recall_semantic_candidates returns the existing candidates unchanged
+    "max_pairs": 80,  # overall cap on pairs recalled in one call
+    "max_pairs_per_item": 5,  # cap on how many recalled pairs one item id may take part in
+    "title_similarity_threshold": 0.45,  # minimum difflib.SequenceMatcher ratio between normalized titles
+    "title_jaccard_threshold": 0.25,  # minimum Jaccard similarity between title token sets
+    "summary_jaccard_threshold": 0.18,  # minimum Jaccard similarity between summary token sets (doubled in the title-similarity rule)
+    "strong_entity_overlap_threshold": 2,  # minimum number of shared entities for the strong-entity rule
+}
+
+STOP_ENTITIES = {  # names discarded by _entity_tokens, so sharing them never counts as entity overlap
+    "AI",
+    "API",
+    "CLI",
+    "LLM",
+    "Open Source",
+    "GitHub",
+    "Google",
+    "OpenAI",
+    "Anthropic",
+    "Microsoft",
+    "Meta",
+    "Amazon",
+    "NVIDIA",
+}
+
+
+def _config_value(config: dict[str, Any], name: str):
+    """Look up ``name`` in ``config``, falling back to ``DEFAULT_CONFIG``.
+
+    A falsy ``config`` (``None`` or empty) is treated as having no overrides.
+    ``name`` must be a key of ``DEFAULT_CONFIG``: the default is read with a
+    plain subscript, so an unknown name raises ``KeyError`` even when
+    ``config`` supplies a value for it.
+    """
+    overrides = config if config else {}
+    return overrides.get(name, DEFAULT_CONFIG[name])
+
+
+def _text_tokens(value: str) -> set[str]:
+    """Tokenize free text with the same tokenizer the dedupe module uses for titles."""
+    tokens = _title_tokens(value)
+    return tokens
+
+
+def _entity_tokens(value: str) -> set[str]:
+    """Extract entity-like strings from ``value``.
+
+    Two regex passes feed the result:
+
+    * runs of capitalized words, joined by a single space or hyphen when the
+      following word starts with an uppercase letter or a digit;
+    * tokens in which letters are directly followed by digits, optionally
+      preceded by CJK/alphanumeric characters and followed by alphanumerics
+      or hyphens.
+
+    Matches shorter than three characters after stripping, and anything listed
+    in ``STOP_ENTITIES``, are discarded. A falsy ``value`` yields an empty set.
+    """
+    source = value or ""
+    capitalized_runs = re.findall(r"\b[A-Z][A-Za-z0-9]*(?:[- ][A-Z0-9][A-Za-z0-9]*)*\b", source)
+    letter_digit_tokens = re.findall(r"[\u4e00-\u9fffA-Za-z0-9]*[A-Za-z]+[0-9]+[A-Za-z0-9-]*", source)
+
+    kept: set[str] = set()
+    for raw in (*capitalized_runs, *letter_digit_tokens):
+        name = raw.strip()
+        if len(name) >= 3 and name not in STOP_ENTITIES:
+            kept.add(name)
+    return kept
+
+
+def _pair_key(item_ids: list[str]) -> frozenset[str]:
+    """Build an order-insensitive, hashable key from a list of item ids."""
+    key: frozenset[str] = frozenset(item_ids)
+    return key
+
+
+def _candidate_score(left: NewsItem, right: NewsItem, config: dict[str, Any]) -> tuple[float, str, dict[str, Any]] | None:
+    """Score a pair of items and name the rule that makes it a recall candidate.
+
+    Three rules are tried in order and the first match wins:
+
+    1. ``strong_entity_overlap`` - enough shared entities plus any summary
+       token overlap.
+    2. ``title_similarity`` - title sequence ratio above its threshold, backed
+       by title Jaccard, doubled summary Jaccard, or at least one shared entity.
+    3. ``title_summary_jaccard`` - both title and summary Jaccard above their
+       thresholds.
+
+    Returns:
+        ``(score, reason, evidence)`` for the matching rule, or ``None`` when
+        no rule matches. ``evidence`` holds the similarity metrics rounded to
+        three decimals; the strong-entity rule also lists the shared entities.
+    """
+
+    def threshold(name: str) -> float:
+        # Resolved on demand so a setting is only read when its rule is evaluated.
+        return float(_config_value(config, name))
+
+    title_ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
+    title_jaccard = _jaccard_similarity(_text_tokens(left.title_norm), _text_tokens(right.title_norm))
+    summary_jaccard = _jaccard_similarity(_text_tokens(left.summary_raw), _text_tokens(right.summary_raw))
+
+    entities_left = _entity_tokens(f"{left.title_raw} {left.summary_raw}")
+    entities_right = _entity_tokens(f"{right.title_raw} {right.summary_raw}")
+    shared_entities = sorted(entities_left & entities_right)
+    min_shared_entities = int(_config_value(config, "strong_entity_overlap_threshold"))
+
+    # The same three metrics are reported by every rule.
+    metrics = {
+        "title_similarity": round(title_ratio, 3),
+        "title_jaccard": round(title_jaccard, 3),
+        "summary_jaccard": round(summary_jaccard, 3),
+    }
+
+    # Rule 1: strong entity overlap.
+    if len(shared_entities) >= min_shared_entities and summary_jaccard > 0:
+        entity_score = min(1.0, 0.55 + len(shared_entities) * 0.1 + summary_jaccard * 0.35)
+        return entity_score, "strong_entity_overlap", {"shared_entities": shared_entities, **metrics}
+
+    # Rule 2: similar titles corroborated by one further signal.
+    if title_ratio >= threshold("title_similarity_threshold"):
+        corroborated = (
+            title_jaccard >= threshold("title_jaccard_threshold")
+            or summary_jaccard >= threshold("summary_jaccard_threshold") * 2
+            or shared_entities
+        )
+        if corroborated:
+            return title_ratio, "title_similarity", metrics
+
+    # Rule 3: moderate token overlap in both title and summary.
+    title_overlap_ok = title_jaccard >= threshold("title_jaccard_threshold")
+    if title_overlap_ok and summary_jaccard >= threshold("summary_jaccard_threshold"):
+        return (title_jaccard + summary_jaccard) / 2, "title_summary_jaccard", metrics
+
+    return None
+
+
+def recall_semantic_candidates(
+    items: list[NewsItem],
+    *,
+    existing_candidates: list[dict[str, Any]] | None = None,
+    config: dict[str, Any] | None = None,
+) -> tuple[list[dict[str, Any]], dict[str, Any]]:
+    """Extend the existing candidate groups with heuristically similar item pairs.
+
+    Every unordered pair of ``items`` is visited in input order and scored with
+    ``_candidate_score``. A matching pair is appended unless its id set equals
+    the ``item_ids`` of an existing candidate, either item already takes part
+    in ``max_pairs_per_item`` recalled pairs, or ``max_pairs`` pairs have
+    already been recalled.
+
+    Args:
+        items: Items to compare pairwise.
+        existing_candidates: Candidate groups found earlier; they are copied
+            into the result ahead of the recalled pairs.
+        config: Overrides merged on top of ``DEFAULT_CONFIG``.
+
+    Returns:
+        ``(candidates, report)``. ``candidates`` is the existing groups
+        followed by the recalled pairs; ``report`` carries the counts and the
+        same candidate list. When ``enabled`` is falsy nothing is recalled.
+    """
+    config = {**DEFAULT_CONFIG, **(config or {})}
+    existing_candidates = list(existing_candidates or [])
+    if not bool(config.get("enabled", True)):
+        return existing_candidates, {
+            "enabled": False,
+            "input_count": len(items),
+            "existing_candidate_group_count": len(existing_candidates),
+            "added_candidate_group_count": 0,
+            "candidate_group_count": len(existing_candidates),
+            "candidates": existing_candidates,
+        }
+
+    # Both caps are loop invariants, so convert them once up front.
+    max_pairs = int(config["max_pairs"])
+    per_item_cap = int(config["max_pairs_per_item"])
+
+    # NOTE(review): only exact id-set matches are skipped; a pair contained in
+    # a larger existing group is still recalled - confirm that is intended.
+    existing_keys = {_pair_key(list(candidate.get("item_ids", []) or [])) for candidate in existing_candidates}
+    pair_counts: defaultdict[str, int] = defaultdict(int)
+    recalled: list[dict[str, Any]] = []
+
+    for index, left in enumerate(items):
+        if len(recalled) >= max_pairs:
+            break
+        for right in items[index + 1 :]:
+            # Test the global cap before scoring rather than after appending,
+            # so a cap of zero (or less) recalls nothing instead of one pair.
+            if len(recalled) >= max_pairs:
+                break
+            # A saturated left item cannot take any further partner.
+            if pair_counts[left.id] >= per_item_cap:
+                break
+            if pair_counts[right.id] >= per_item_cap:
+                continue
+            key = frozenset({left.id, right.id})
+            if key in existing_keys:
+                continue
+            scored = _candidate_score(left, right, config)
+            if scored is None:
+                continue
+            score, reason, evidence = scored
+            recalled.append(
+                {
+                    "item_ids": [left.id, right.id],
+                    "reason": reason,
+                    "score": round(score, 3),
+                    "confidence": "medium",
+                    **evidence,
+                }
+            )
+            pair_counts[left.id] += 1
+            pair_counts[right.id] += 1
+
+    candidates = existing_candidates + recalled
+    report = {
+        "enabled": True,
+        "input_count": len(items),
+        "existing_candidate_group_count": len(existing_candidates),
+        "added_candidate_group_count": len(recalled),
+        "candidate_group_count": len(candidates),
+        "candidates": candidates,
+    }
+    return candidates, report