163 lines
5.8 KiB
Python
163 lines
5.8 KiB
Python
from __future__ import annotations
|
|
|
|
import difflib
|
|
import re
|
|
from collections import defaultdict
|
|
from typing import Any
|
|
|
|
from .dedupe import _jaccard_similarity, _title_tokens
|
|
from .models import NewsItem
|
|
|
|
|
|
# Default settings for semantic candidate recall. Caller-supplied config
# overrides these key by key.
DEFAULT_CONFIG = dict(
    # Master switch: when falsy, recall returns the existing candidates untouched.
    enabled=True,
    # Upper bound on the number of newly recalled pairs per run.
    max_pairs=80,
    # Upper bound on how many recalled pairs a single item id may take part in.
    max_pairs_per_item=5,
    # Minimum difflib ratio between normalized titles for the title rule.
    title_similarity_threshold=0.45,
    # Minimum Jaccard similarity between title token sets.
    title_jaccard_threshold=0.25,
    # Minimum Jaccard similarity between summary token sets.
    summary_jaccard_threshold=0.18,
    # Number of shared entities needed for the strong-entity-overlap rule.
    strong_entity_overlap_threshold=2,
)
|
|
|
|
# Entity strings that are discarded by _entity_tokens (exact, case-sensitive
# match), so they never count as shared entities between two items.
STOP_ENTITIES = {
    # Generic technology terms.
    "AI", "API", "CLI", "LLM", "Open Source",
    # Platform and company names.
    "GitHub", "Google", "OpenAI", "Anthropic",
    "Microsoft", "Meta", "Amazon", "NVIDIA",
}
|
|
|
|
|
|
def _config_value(config: dict[str, Any], name: str):
    """Look up setting ``name`` in ``config``, falling back to ``DEFAULT_CONFIG``.

    A falsy ``config`` (``None`` or empty) is treated as an empty mapping.
    The default is read from ``DEFAULT_CONFIG`` unconditionally, so ``name``
    must be one of its keys or a ``KeyError`` is raised.
    """
    settings = config if config else {}
    lookup = settings.get
    fallback = DEFAULT_CONFIG[name]
    return lookup(name, fallback)
|
|
|
|
|
|
def _text_tokens(value: str) -> set[str]:
    """Tokenize ``value`` with the same tokenizer that is used for titles.

    This is a thin alias over ``_title_tokens`` (imported from ``.dedupe``) so
    that titles and summaries are tokenized identically.
    """
    tokens = _title_tokens(value)
    return tokens
|
|
|
|
|
|
def _entity_tokens(value: str) -> set[str]:
|
|
text = value or ""
|
|
entities = set(re.findall(r"\b[A-Z][A-Za-z0-9]*(?:[- ][A-Z0-9][A-Za-z0-9]*)*\b", text))
|
|
entities.update(re.findall(r"[\u4e00-\u9fffA-Za-z0-9]*[A-Za-z]+[0-9]+[A-Za-z0-9-]*", text))
|
|
cleaned = {entity.strip() for entity in entities if len(entity.strip()) >= 3}
|
|
return {entity for entity in cleaned if entity not in STOP_ENTITIES}
|
|
|
|
|
|
def _pair_key(item_ids: list[str]) -> frozenset[str]:
|
|
return frozenset(item_ids)
|
|
|
|
|
|
def _candidate_score(left: NewsItem, right: NewsItem, config: dict[str, Any]) -> tuple[float, str, dict[str, Any]] | None:
    """Decide whether two items look like the same story and score the match.

    Three rules are tried in order; the first that matches wins:

    1. ``strong_entity_overlap``: at least ``strong_entity_overlap_threshold``
       shared entities and a non-zero summary Jaccard similarity.
    2. ``title_similarity``: the difflib title ratio reaches
       ``title_similarity_threshold`` and is corroborated by the title
       Jaccard, twice the summary Jaccard threshold, or any shared entity.
    3. ``title_summary_jaccard``: both the title and the summary Jaccard
       similarities reach their thresholds.

    Args:
        left: First item of the pair.
        right: Second item of the pair.
        config: Settings mapping; missing keys fall back to the defaults
            through ``_config_value``.

    Returns:
        ``(score, reason, evidence)`` for a match, otherwise ``None``.
        ``evidence`` always carries the three rounded similarity metrics, and
        ``shared_entities`` as well for the strong-entity rule.
    """
    title_ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
    title_jaccard = _jaccard_similarity(_text_tokens(left.title_norm), _text_tokens(right.title_norm))
    summary_jaccard = _jaccard_similarity(_text_tokens(left.summary_raw), _text_tokens(right.summary_raw))

    # Interpolating a missing field directly would put the literal text "None"
    # into the string, which the capitalised-word pattern then reports as an
    # entity common to every item lacking that field. Blank it out instead.
    # NOTE(review): whether NewsItem fields can be None is not visible here;
    # for non-empty strings this is identical to plain interpolation.
    left_entities = _entity_tokens(f"{left.title_raw or ''} {left.summary_raw or ''}")
    right_entities = _entity_tokens(f"{right.title_raw or ''} {right.summary_raw or ''}")
    shared_entities = sorted(left_entities & right_entities)

    # The similarity metrics are reported identically by every rule.
    metrics = {
        "title_similarity": round(title_ratio, 3),
        "title_jaccard": round(title_jaccard, 3),
        "summary_jaccard": round(summary_jaccard, 3),
    }

    strong_entity_threshold = int(_config_value(config, "strong_entity_overlap_threshold"))
    if len(shared_entities) >= strong_entity_threshold and summary_jaccard > 0:
        # Base score plus a bonus per shared entity and for summary overlap,
        # capped at 1.0.
        score = min(1.0, 0.55 + len(shared_entities) * 0.1 + summary_jaccard * 0.35)
        return score, "strong_entity_overlap", {"shared_entities": shared_entities, **metrics}

    title_ratio_min = float(_config_value(config, "title_similarity_threshold"))
    title_jaccard_min = float(_config_value(config, "title_jaccard_threshold"))
    summary_jaccard_min = float(_config_value(config, "summary_jaccard_threshold"))

    if title_ratio >= title_ratio_min and (
        title_jaccard >= title_jaccard_min
        # A summary-only corroboration has to clear twice the usual bar.
        or summary_jaccard >= summary_jaccard_min * 2
        or shared_entities
    ):
        return title_ratio, "title_similarity", metrics

    if title_jaccard >= title_jaccard_min and summary_jaccard >= summary_jaccard_min:
        score = (title_jaccard + summary_jaccard) / 2
        return score, "title_summary_jaccard", metrics

    return None
|
|
|
|
|
|
def _recall_new_pairs(
    items: list[NewsItem],
    existing_keys: set[frozenset[str]],
    config: dict[str, Any],
) -> list[dict[str, Any]]:
    """Score every unordered item pair and collect the matches, honouring both caps."""
    max_pairs = int(config["max_pairs"])
    per_item_cap = int(config["max_pairs_per_item"])
    recalled: list[dict[str, Any]] = []
    if max_pairs <= 0:
        # A non-positive budget means nothing may be recalled at all.
        return recalled

    pair_counts: defaultdict[str, int] = defaultdict(int)
    for index, left in enumerate(items):
        for right in items[index + 1 :]:
            if pair_counts[left.id] >= per_item_cap:
                # Counts only ever grow, so no later partner can be accepted
                # for this left item either.
                break
            if pair_counts[right.id] >= per_item_cap:
                continue
            if _pair_key([left.id, right.id]) in existing_keys:
                continue
            scored = _candidate_score(left, right, config)
            if scored is None:
                continue
            score, reason, evidence = scored
            recalled.append(
                {
                    "item_ids": [left.id, right.id],
                    "reason": reason,
                    "score": round(score, 3),
                    "confidence": "medium",
                    **evidence,
                }
            )
            pair_counts[left.id] += 1
            pair_counts[right.id] += 1
            if len(recalled) >= max_pairs:
                return recalled
    return recalled


def recall_semantic_candidates(
    items: list[NewsItem],
    *,
    existing_candidates: list[dict[str, Any]] | None = None,
    config: dict[str, Any] | None = None,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Extend the existing candidate groups with semantically similar item pairs.

    Every unordered pair of ``items`` is scored with ``_candidate_score``.
    Matching pairs are appended after the existing candidates, subject to the
    ``max_pairs`` and ``max_pairs_per_item`` limits. Pairs whose id set equals
    the id set of an existing candidate are skipped.

    Args:
        items: Items to compare pairwise, in priority order (earlier items
            claim the pair budget first).
        existing_candidates: Candidate groups found earlier; each is expected
            to carry an ``item_ids`` list. The input list is not mutated.
        config: Overrides merged on top of ``DEFAULT_CONFIG``.

    Returns:
        ``(candidates, report)``. ``candidates`` is the existing groups
        followed by the recalled pairs; ``report`` holds the counts, the
        ``enabled`` flag and the same candidate list. When recall is disabled,
        the existing candidates are returned with nothing added.
    """
    config = {**DEFAULT_CONFIG, **(config or {})}
    existing_candidates = list(existing_candidates or [])
    enabled = bool(config.get("enabled", True))

    recalled: list[dict[str, Any]] = []
    if enabled:
        # NOTE(review): a group with more than two ids yields a key that never
        # equals a two-id pair key, so pairs inside such a group are not
        # skipped here. Confirm whether upstream can emit larger groups.
        existing_keys = {
            _pair_key(list(candidate.get("item_ids", []) or []))
            for candidate in existing_candidates
        }
        recalled = _recall_new_pairs(items, existing_keys, config)

    candidates = existing_candidates + recalled
    report = {
        "enabled": enabled,
        "input_count": len(items),
        "existing_candidate_group_count": len(existing_candidates),
        "added_candidate_group_count": len(recalled),
        "candidate_group_count": len(candidates),
        "candidates": candidates,
    }
    return candidates, report
|