Add Stage 2.8 recall, quality gate, retries, and publish idempotency

This commit is contained in:
Mimikko-zeus
2026-06-10 21:31:13 +08:00
parent 07786e3bc0
commit b46cef2c7b
16 changed files with 1253 additions and 6 deletions

View File

@@ -0,0 +1,162 @@
from __future__ import annotations

import difflib
import itertools
import re
from collections import defaultdict
from typing import Any

from .dedupe import _jaccard_similarity, _title_tokens
from .models import NewsItem
# Built-in settings for semantic candidate recall. Caller-supplied config is
# merged over these key by key.
DEFAULT_CONFIG: dict[str, Any] = {
    # When falsy, recall adds nothing and hands back the existing candidates.
    "enabled": True,
    # Cap on the number of pairs recalled in a single run.
    "max_pairs": 80,
    # Cap on how many recalled pairs any single item may take part in.
    "max_pairs_per_item": 5,
    # Minimum SequenceMatcher ratio between normalised titles for the
    # "title_similarity" rule.
    "title_similarity_threshold": 0.45,
    # Minimum Jaccard similarity between title token sets.
    "title_jaccard_threshold": 0.25,
    # Minimum Jaccard similarity between summary token sets; the
    # "title_similarity" rule compares against twice this value.
    "summary_jaccard_threshold": 0.18,
    # Minimum number of shared entities for the "strong_entity_overlap" rule.
    "strong_entity_overlap_threshold": 2,
}
# Entity strings dropped by entity extraction. Matching is exact and
# case-sensitive, so only these precise spellings are filtered.
STOP_ENTITIES: set[str] = {
    # Generic technical terms.
    "AI",
    "API",
    "CLI",
    "LLM",
    "Open Source",
    # Company and platform names.
    "Amazon",
    "Anthropic",
    "GitHub",
    "Google",
    "Meta",
    "Microsoft",
    "NVIDIA",
    "OpenAI",
}
def _config_value(config: dict[str, Any], name: str):
    """Return the setting ``name`` from ``config``, else its built-in default.

    A falsy ``config`` (``None`` or empty) is treated as having no overrides.
    The default is read from ``DEFAULT_CONFIG`` before the lookup happens, so
    a ``name`` that is absent from ``DEFAULT_CONFIG`` raises ``KeyError`` even
    when ``config`` supplies a value for it.
    """
    overrides = config or {}
    return overrides.get(name, DEFAULT_CONFIG[name])
def _text_tokens(value: str) -> set[str]:
    """Tokenise ``value`` with the title tokenizer from the dedupe module.

    This is a thin alias so titles and summaries are tokenised the same way.
    ``value`` is forwarded unchanged.
    """
    tokens = _title_tokens(value)
    return tokens
def _entity_tokens(value: str) -> set[str]:
text = value or ""
entities = set(re.findall(r"\b[A-Z][A-Za-z0-9]*(?:[- ][A-Z0-9][A-Za-z0-9]*)*\b", text))
entities.update(re.findall(r"[\u4e00-\u9fffA-Za-z0-9]*[A-Za-z]+[0-9]+[A-Za-z0-9-]*", text))
cleaned = {entity.strip() for entity in entities if len(entity.strip()) >= 3}
return {entity for entity in cleaned if entity not in STOP_ENTITIES}
def _pair_key(item_ids: list[str]) -> frozenset[str]:
return frozenset(item_ids)
def _candidate_score(left: NewsItem, right: NewsItem, config: dict[str, Any]) -> tuple[float, str, dict[str, Any]] | None:
    """Score one pair of items as a possible semantic duplicate.

    Returns ``(score, reason, evidence)`` for the first rule that matches, or
    ``None`` when no rule does. Rules are tried in this order:

    1. ``strong_entity_overlap``: at least
       ``strong_entity_overlap_threshold`` shared entities and a non-zero
       summary Jaccard similarity.
    2. ``title_similarity``: the title ratio reaches
       ``title_similarity_threshold`` and is corroborated by title Jaccard,
       by summary Jaccard at twice its threshold, or by any shared entity.
       The score is the raw title ratio.
    3. ``title_summary_jaccard``: both Jaccard values reach their thresholds.
       The score is their mean.

    ``evidence`` always carries the three similarity figures rounded to three
    decimals; only the first rule also includes ``shared_entities``.
    """

    def threshold(name: str) -> float:
        # Read lazily so a setting is only touched by the rule that needs it.
        return float(_config_value(config, name))

    def metrics() -> dict[str, Any]:
        return {
            "title_similarity": round(title_ratio, 3),
            "title_jaccard": round(title_jaccard, 3),
            "summary_jaccard": round(summary_jaccard, 3),
        }

    title_ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
    title_jaccard = _jaccard_similarity(_text_tokens(left.title_norm), _text_tokens(right.title_norm))
    summary_jaccard = _jaccard_similarity(_text_tokens(left.summary_raw), _text_tokens(right.summary_raw))

    # Entities come from the raw title and summary of each item combined.
    entities_left = _entity_tokens(f"{left.title_raw} {left.summary_raw}")
    entities_right = _entity_tokens(f"{right.title_raw} {right.summary_raw}")
    shared_entities = sorted(entities_left & entities_right)

    # Rule 1: many shared entities plus at least some summary overlap.
    min_shared = int(_config_value(config, "strong_entity_overlap_threshold"))
    if len(shared_entities) >= min_shared and summary_jaccard > 0:
        boosted = 0.55 + len(shared_entities) * 0.1 + summary_jaccard * 0.35
        return min(1.0, boosted), "strong_entity_overlap", {
            "shared_entities": shared_entities,
            **metrics(),
        }

    # Rule 2: similar titles, backed by one further signal.
    if title_ratio >= threshold("title_similarity_threshold"):
        corroborated = (
            title_jaccard >= threshold("title_jaccard_threshold")
            or summary_jaccard >= threshold("summary_jaccard_threshold") * 2
            or bool(shared_entities)
        )
        if corroborated:
            return title_ratio, "title_similarity", metrics()

    # Rule 3: moderate token overlap in both title and summary.
    if (
        title_jaccard >= threshold("title_jaccard_threshold")
        and summary_jaccard >= threshold("summary_jaccard_threshold")
    ):
        return (title_jaccard + summary_jaccard) / 2, "title_summary_jaccard", metrics()

    return None
def _recall_new_pairs(
    items: list[NewsItem],
    covered_keys: set[frozenset[str]],
    config: dict[str, Any],
) -> list[dict[str, Any]]:
    """Scan item pairs in input order and return newly recalled candidates."""
    max_pairs = int(config["max_pairs"])
    max_pairs_per_item = int(config["max_pairs_per_item"])
    recalled: list[dict[str, Any]] = []
    # A non-positive cap means "recall nothing", not "recall one pair".
    if max_pairs <= 0:
        return recalled

    pair_counts: defaultdict[str, int] = defaultdict(int)
    for index, left in enumerate(items):
        for right in items[index + 1 :]:
            if pair_counts[left.id] >= max_pairs_per_item:
                # The left item's count never decreases, so no later partner
                # in this inner loop can be accepted either.
                break
            if pair_counts[right.id] >= max_pairs_per_item:
                continue
            if _pair_key([left.id, right.id]) in covered_keys:
                continue
            scored = _candidate_score(left, right, config)
            if scored is None:
                continue
            score, reason, evidence = scored
            recalled.append(
                {
                    "item_ids": [left.id, right.id],
                    "reason": reason,
                    "score": round(score, 3),
                    "confidence": "medium",
                    **evidence,
                }
            )
            pair_counts[left.id] += 1
            pair_counts[right.id] += 1
            if len(recalled) >= max_pairs:
                return recalled
    return recalled


def recall_semantic_candidates(
    items: list[NewsItem],
    *,
    existing_candidates: list[dict[str, Any]] | None = None,
    config: dict[str, Any] | None = None,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Extend ``existing_candidates`` with extra likely-duplicate item pairs.

    Every unordered pair of ``items`` is scored with ``_candidate_score`` in
    input order. A pair is recalled when it scores, is not already covered by
    an existing candidate, and neither item has reached
    ``max_pairs_per_item``. Scanning stops once ``max_pairs`` pairs have been
    recalled; a ``max_pairs`` of zero or less recalls nothing.

    Args:
        items: Items to compare pairwise.
        existing_candidates: Candidate dicts produced earlier; each may carry
            an ``item_ids`` list. The input list is copied, not mutated.
        config: Overrides merged on top of ``DEFAULT_CONFIG``.

    Returns:
        ``(candidates, report)`` where ``candidates`` is the existing
        candidates followed by the recalled pairs, and ``report`` summarises
        the counts and repeats the candidate list. When ``enabled`` is falsy
        the existing candidates are returned unchanged.
    """
    config = {**DEFAULT_CONFIG, **(config or {})}
    existing_candidates = list(existing_candidates or [])
    if not bool(config.get("enabled", True)):
        return existing_candidates, {
            "enabled": False,
            "input_count": len(items),
            "existing_candidate_group_count": len(existing_candidates),
            "added_candidate_group_count": 0,
            "candidate_group_count": len(existing_candidates),
            "candidates": existing_candidates,
        }

    # An existing candidate may group more than two items. Expand each group
    # into its pairs so that a pair already inside a larger group is treated
    # as covered; comparing against the whole-group key alone would never
    # match a two-item key.
    covered_keys: set[frozenset[str]] = set()
    for candidate in existing_candidates:
        group_ids = list(candidate.get("item_ids", []) or [])
        covered_keys.update(
            _pair_key(list(pair)) for pair in itertools.combinations(group_ids, 2)
        )

    recalled = _recall_new_pairs(items, covered_keys, config)
    candidates = existing_candidates + recalled
    report = {
        "enabled": True,
        "input_count": len(items),
        "existing_candidate_group_count": len(existing_candidates),
        "added_candidate_group_count": len(recalled),
        "candidate_group_count": len(candidates),
        "candidates": candidates,
    }
    return candidates, report