Add Stage 2.8 recall, quality gate, retries, and publish idempotency
This commit is contained in:
162
ai_daily_report/candidate_recall.py
Normal file
162
ai_daily_report/candidate_recall.py
Normal file
@@ -0,0 +1,162 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
|
||||
from .dedupe import _jaccard_similarity, _title_tokens
|
||||
from .models import NewsItem
|
||||
|
||||
|
||||
# Default tuning knobs for semantic candidate recall. Callers may override any
# key through the ``config`` mapping; missing keys fall back to these values.
DEFAULT_CONFIG: dict[str, Any] = {
    # Master switch: when falsy, recall returns the existing candidates as-is.
    "enabled": True,
    # Cap on the number of pairs recalled in a single run.
    "max_pairs": 80,
    # Cap on how many recalled pairs any single item may take part in.
    "max_pairs_per_item": 5,
    # Minimum difflib SequenceMatcher ratio between normalized titles.
    "title_similarity_threshold": 0.45,
    # Minimum Jaccard similarity between the title token sets.
    "title_jaccard_threshold": 0.25,
    # Minimum Jaccard similarity between the summary token sets.
    "summary_jaccard_threshold": 0.18,
    # Number of shared entities required for the strong-entity-overlap rule.
    "strong_entity_overlap_threshold": 2,
}
|
||||
|
||||
# Entity strings that are too common to count as evidence that two items
# describe the same story. Matching is exact and case-sensitive.
STOP_ENTITIES: set[str] = {
    # Generic technology terms.
    "AI",
    "API",
    "CLI",
    "LLM",
    "Open Source",
    # Platforms and vendors.
    "Amazon",
    "Anthropic",
    "GitHub",
    "Google",
    "Meta",
    "Microsoft",
    "NVIDIA",
    "OpenAI",
}
|
||||
|
||||
|
||||
def _config_value(config: dict[str, Any], name: str):
    """Return the setting *name* from *config*, or its default when absent.

    The default is read from ``DEFAULT_CONFIG`` up front, so *name* must be a
    key of ``DEFAULT_CONFIG``; an unknown name raises ``KeyError`` even when
    *config* provides a value for it. A falsy *config* (``None`` or an empty
    mapping) yields the default.
    """
    fallback = DEFAULT_CONFIG[name]
    if not config:
        return fallback
    return config.get(name, fallback)
|
||||
|
||||
|
||||
def _text_tokens(value: str) -> set[str]:
    """Tokenize *value* with the tokenizer used for titles.

    This is a thin alias for ``_title_tokens`` so that titles and summaries
    are split into tokens the same way before their sets are compared.
    """
    tokens = _title_tokens(value)
    return tokens
|
||||
|
||||
|
||||
def _entity_tokens(value: str) -> set[str]:
    """Extract entity-like strings from *value* for overlap comparison.

    Two kinds of matches are collected:

    * runs of capitalized words joined by a single space or hyphen, and
    * tokens in which letters are immediately followed by digits, optionally
      preceded by CJK or alphanumeric characters.

    A match is kept only if, after stripping whitespace, it has at least three
    characters and is not listed in ``STOP_ENTITIES``. ``None`` or an empty
    string yields an empty set.
    """
    source = value or ""
    capitalized_runs = re.findall(r"\b[A-Z][A-Za-z0-9]*(?:[- ][A-Z0-9][A-Za-z0-9]*)*\b", source)
    letter_digit_tokens = re.findall(r"[\u4e00-\u9fffA-Za-z0-9]*[A-Za-z]+[0-9]+[A-Za-z0-9-]*", source)

    kept: set[str] = set()
    for raw in (*capitalized_runs, *letter_digit_tokens):
        entity = raw.strip()
        if len(entity) < 3 or entity in STOP_ENTITIES:
            continue
        kept.add(entity)
    return kept
|
||||
|
||||
|
||||
def _pair_key(item_ids: list[str]) -> frozenset[str]:
|
||||
return frozenset(item_ids)
|
||||
|
||||
|
||||
def _candidate_score(left: NewsItem, right: NewsItem, config: dict[str, Any]) -> tuple[float, str, dict[str, Any]] | None:
    """Decide whether *left* and *right* should be recalled as a candidate pair.

    Three rules are tried in order; the first one that matches determines the
    result:

    1. ``strong_entity_overlap``: the items share at least
       ``strong_entity_overlap_threshold`` entities and their summaries have
       any token overlap. The score grows with the number of shared entities
       and the summary Jaccard similarity, capped at 1.0.
    2. ``title_similarity``: the title sequence ratio reaches
       ``title_similarity_threshold`` and is backed by title Jaccard, by twice
       the summary Jaccard threshold, or by any shared entity. The score is
       the title ratio.
    3. ``title_summary_jaccard``: both the title and summary Jaccard
       similarities reach their thresholds. The score is their mean.

    Returns:
        ``(score, reason, evidence)`` for the first matching rule, or ``None``
        when no rule matches. ``evidence`` carries the rounded similarity
        metrics, plus ``shared_entities`` for the entity rule.
    """

    def threshold(name: str) -> float:
        # Read lazily so a threshold is only looked up when its rule is reached.
        return float(_config_value(config, name))

    title_ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
    title_jaccard = _jaccard_similarity(_text_tokens(left.title_norm), _text_tokens(right.title_norm))
    summary_jaccard = _jaccard_similarity(_text_tokens(left.summary_raw), _text_tokens(right.summary_raw))

    entities_left = _entity_tokens(f"{left.title_raw} {left.summary_raw}")
    entities_right = _entity_tokens(f"{right.title_raw} {right.summary_raw}")
    common_entities = sorted(entities_left & entities_right)
    min_shared = int(_config_value(config, "strong_entity_overlap_threshold"))

    # The similarity metrics reported as evidence are the same for every rule.
    metrics = {
        "title_similarity": round(title_ratio, 3),
        "title_jaccard": round(title_jaccard, 3),
        "summary_jaccard": round(summary_jaccard, 3),
    }

    # Rule 1: enough shared entities plus at least some summary overlap.
    if len(common_entities) >= min_shared and summary_jaccard > 0:
        entity_score = min(1.0, 0.55 + len(common_entities) * 0.1 + summary_jaccard * 0.35)
        return entity_score, "strong_entity_overlap", {"shared_entities": common_entities, **metrics}

    # Rule 2: similar titles corroborated by one additional signal.
    if title_ratio >= threshold("title_similarity_threshold"):
        corroborated = (
            title_jaccard >= threshold("title_jaccard_threshold")
            or summary_jaccard >= threshold("summary_jaccard_threshold") * 2
            or common_entities
        )
        if corroborated:
            return title_ratio, "title_similarity", metrics

    # Rule 3: moderate token overlap in both the title and the summary.
    if title_jaccard >= threshold("title_jaccard_threshold"):
        if summary_jaccard >= threshold("summary_jaccard_threshold"):
            mean_jaccard = (title_jaccard + summary_jaccard) / 2
            return mean_jaccard, "title_summary_jaccard", metrics

    return None
|
||||
|
||||
|
||||
def recall_semantic_candidates(
    items: list[NewsItem],
    *,
    existing_candidates: list[dict[str, Any]] | None = None,
    config: dict[str, Any] | None = None,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Recall additional candidate pairs of items that may cover the same story.

    Every unordered pair of *items* is scored with ``_candidate_score`` in
    input order. Matching pairs are appended after *existing_candidates* until
    ``max_pairs`` pairs have been recalled; no item takes part in more than
    ``max_pairs_per_item`` recalled pairs. Selection is first-come in input
    order, not best-score-first.

    Args:
        items: Items to compare; each must expose ``id`` plus the fields read
            by ``_candidate_score``.
        existing_candidates: Candidate groups produced earlier. Each is a dict
            whose ``item_ids`` lists the grouped ids. The input list is not
            mutated.
        config: Overrides for ``DEFAULT_CONFIG``.

    Returns:
        ``(candidates, report)`` where ``candidates`` is the existing groups
        followed by the recalled pairs, and ``report`` summarizes the counts
        and carries the same candidate list under ``"candidates"``.
    """
    config = {**DEFAULT_CONFIG, **(config or {})}
    existing_candidates = list(existing_candidates or [])

    def build_report(enabled: bool, added: int, candidates: list[dict[str, Any]]) -> dict[str, Any]:
        # Both the disabled and enabled paths report the same set of keys.
        return {
            "enabled": enabled,
            "input_count": len(items),
            "existing_candidate_group_count": len(existing_candidates),
            "added_candidate_group_count": added,
            "candidate_group_count": len(candidates),
            "candidates": candidates,
        }

    if not bool(config.get("enabled", True)):
        return existing_candidates, build_report(False, 0, existing_candidates)

    # Hoisted: these limits are loop-invariant.
    max_pairs = int(config["max_pairs"])
    per_item_cap = int(config["max_pairs_per_item"])

    # Every pair of ids that already co-occurs in an existing group counts as
    # covered. Comparing against whole-group keys only would miss pairs that
    # sit inside a group of three or more ids and recall them again.
    covered_pairs: set[frozenset[str]] = set()
    for candidate in existing_candidates:
        group_ids = list(candidate.get("item_ids", []) or [])
        covered_pairs.add(_pair_key(group_ids))
        for offset, first_id in enumerate(group_ids):
            for second_id in group_ids[offset + 1 :]:
                covered_pairs.add(_pair_key([first_id, second_id]))

    pair_counts: defaultdict[str, int] = defaultdict(int)
    recalled: list[dict[str, Any]] = []

    # Non-positive limits mean "recall nothing". Checking the global cap only
    # after an append would let one pair through when max_pairs <= 0.
    if max_pairs > 0 and per_item_cap > 0:
        for index, left in enumerate(items):
            if len(recalled) >= max_pairs:
                break
            for right in items[index + 1 :]:
                if pair_counts[left.id] >= per_item_cap:
                    # ``left`` is saturated; no later partner can be accepted.
                    break
                if pair_counts[right.id] >= per_item_cap:
                    continue
                if _pair_key([left.id, right.id]) in covered_pairs:
                    continue
                scored = _candidate_score(left, right, config)
                if scored is None:
                    continue
                score, reason, evidence = scored
                recalled.append(
                    {
                        "item_ids": [left.id, right.id],
                        "reason": reason,
                        "score": round(score, 3),
                        "confidence": "medium",
                        **evidence,
                    }
                )
                pair_counts[left.id] += 1
                pair_counts[right.id] += 1
                if len(recalled) >= max_pairs:
                    break

    candidates = existing_candidates + recalled
    return candidates, build_report(True, len(recalled), candidates)
|
||||
Reference in New Issue
Block a user