"""Recall extra duplicate-candidate pairs from title, summary and entity overlap."""

from __future__ import annotations

import difflib
import re
from collections import defaultdict
from typing import Any

from .dedupe import _jaccard_similarity, _title_tokens
from .models import NewsItem

# Default value for every tunable. Caller-supplied config overrides these per key.
DEFAULT_CONFIG = {
    "enabled": True,
    "max_pairs": 80,
    "max_pairs_per_item": 5,
    "title_similarity_threshold": 0.45,
    "title_jaccard_threshold": 0.25,
    "summary_jaccard_threshold": 0.18,
    "strong_entity_overlap_threshold": 2,
}

# Entities dropped by _entity_tokens, so they never count as shared evidence.
STOP_ENTITIES = {
    "AI",
    "API",
    "CLI",
    "LLM",
    "Open Source",
    "GitHub",
    "Google",
    "OpenAI",
    "Anthropic",
    "Microsoft",
    "Meta",
    "Amazon",
    "NVIDIA",
}

# Runs of capitalised words (or digit-led words) joined by a single space or hyphen.
_CAPITALIZED_ENTITY_RE = re.compile(r"\b[A-Z][A-Za-z0-9]*(?:[- ][A-Z0-9][A-Za-z0-9]*)*\b")
# Tokens that contain letters immediately followed by digits, optionally
# prefixed by CJK/alphanumeric characters.
_ALNUM_ENTITY_RE = re.compile(r"[\u4e00-\u9fffA-Za-z0-9]*[A-Za-z]+[0-9]+[A-Za-z0-9-]*")


def _config_value(config: dict[str, Any], name: str) -> Any:
    """Return ``config[name]`` when present, otherwise the default for ``name``.

    The default is only looked up when the caller did not supply the key, so a
    caller-provided value never triggers a ``KeyError`` for an unknown name.

    Raises:
        KeyError: if ``name`` is neither in ``config`` nor in ``DEFAULT_CONFIG``.
    """
    if config and name in config:
        return config[name]
    return DEFAULT_CONFIG[name]


def _text_tokens(value: str) -> set[str]:
    """Tokenise ``value`` with the shared ``_title_tokens`` helper.

    A falsy value is passed on as the empty string, mirroring ``_entity_tokens``.
    """
    return _title_tokens(value or "")


def _entity_tokens(value: str) -> set[str]:
    """Extract entity-like tokens from ``value``.

    Collects every match of the two module-level entity patterns, strips
    surrounding whitespace, drops anything shorter than three characters and
    anything listed in ``STOP_ENTITIES``.
    """
    text = value or ""
    found = set(_CAPITALIZED_ENTITY_RE.findall(text))
    found.update(_ALNUM_ENTITY_RE.findall(text))
    stripped = {entity.strip() for entity in found}
    return {entity for entity in stripped if len(entity) >= 3 and entity not in STOP_ENTITIES}


def _pair_key(item_ids: list[str]) -> frozenset[str]:
    """Return an order-independent, hashable key for a collection of item ids."""
    return frozenset(item_ids)


def _item_entities(item: NewsItem) -> set[str]:
    """Return the entity tokens found in an item's raw title and summary.

    Missing fields are coalesced to the empty string before joining. Formatting
    ``None`` into the text would otherwise make the literal word "None" look
    like an entity shared by every item with a missing field.
    NOTE(review): whether these fields can actually be None is not visible
    here - confirm against NewsItem.
    """
    return _entity_tokens(f"{item.title_raw or ''} {item.summary_raw or ''}")


def _candidate_score(
    left: NewsItem,
    right: NewsItem,
    config: dict[str, Any],
) -> tuple[float, str, dict[str, Any]] | None:
    """Score a pair of items as a possible duplicate candidate.

    Three rules are tried in order and the first match wins:

    1. ``strong_entity_overlap``: at least ``strong_entity_overlap_threshold``
       shared entities and a non-zero summary Jaccard similarity.
    2. ``title_similarity``: the title sequence ratio reaches
       ``title_similarity_threshold`` and is backed by title Jaccard, by twice
       the summary Jaccard threshold, or by any shared entity.
    3. ``title_summary_jaccard``: both the title and the summary Jaccard
       similarities reach their thresholds.

    Returns:
        ``(score, reason, evidence)`` for the first matching rule, or ``None``
        when no rule matches. ``evidence`` holds the similarity metrics rounded
        to three decimals (plus ``shared_entities`` for rule 1).
    """
    title_ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
    title_jaccard = _jaccard_similarity(_text_tokens(left.title_norm), _text_tokens(right.title_norm))
    summary_jaccard = _jaccard_similarity(_text_tokens(left.summary_raw), _text_tokens(right.summary_raw))
    shared_entities = sorted(_item_entities(left) & _item_entities(right))

    # The same rounded metrics are reported whichever rule fires.
    metrics = {
        "title_similarity": round(title_ratio, 3),
        "title_jaccard": round(title_jaccard, 3),
        "summary_jaccard": round(summary_jaccard, 3),
    }

    entity_threshold = int(_config_value(config, "strong_entity_overlap_threshold"))
    if len(shared_entities) >= entity_threshold and summary_jaccard > 0:
        score = min(1.0, 0.55 + len(shared_entities) * 0.1 + summary_jaccard * 0.35)
        return score, "strong_entity_overlap", {"shared_entities": shared_entities, **metrics}

    ratio_threshold = float(_config_value(config, "title_similarity_threshold"))
    title_threshold = float(_config_value(config, "title_jaccard_threshold"))
    summary_threshold = float(_config_value(config, "summary_jaccard_threshold"))

    has_supporting_signal = (
        title_jaccard >= title_threshold
        or summary_jaccard >= summary_threshold * 2
        or bool(shared_entities)
    )
    if title_ratio >= ratio_threshold and has_supporting_signal:
        return title_ratio, "title_similarity", metrics

    if title_jaccard >= title_threshold and summary_jaccard >= summary_threshold:
        return (title_jaccard + summary_jaccard) / 2, "title_summary_jaccard", metrics

    return None


def _recall_report(
    *,
    enabled: bool,
    input_count: int,
    existing_count: int,
    candidates: list[dict[str, Any]],
) -> dict[str, Any]:
    """Build the summary report returned alongside the candidate list."""
    return {
        "enabled": enabled,
        "input_count": input_count,
        "existing_candidate_group_count": existing_count,
        "added_candidate_group_count": len(candidates) - existing_count,
        "candidate_group_count": len(candidates),
        "candidates": candidates,
    }


def recall_semantic_candidates(
    items: list[NewsItem],
    *,
    existing_candidates: list[dict[str, Any]] | None = None,
    config: dict[str, Any] | None = None,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Append newly recalled candidate pairs to ``existing_candidates``.

    Every unordered pair of ``items`` is scored with ``_candidate_score`` unless
    it is already covered by an existing candidate group or one of its items has
    reached ``max_pairs_per_item``. Recall stops once ``max_pairs`` new pairs
    have been collected.

    Args:
        items: Items to compare pairwise.
        existing_candidates: Candidate groups found earlier. Each may carry an
            ``item_ids`` list, and pairs whose two ids both appear in one such
            list are not recalled again.
        config: Overrides for ``DEFAULT_CONFIG``.

    Returns:
        ``(candidates, report)``, where ``candidates`` is the existing groups
        followed by the recalled pairs and ``report`` summarises the counts.
    """
    config = {**DEFAULT_CONFIG, **(config or {})}
    existing_candidates = list(existing_candidates or [])
    existing_count = len(existing_candidates)

    if not config.get("enabled", True):
        return existing_candidates, _recall_report(
            enabled=False,
            input_count=len(items),
            existing_count=existing_count,
            candidates=existing_candidates,
        )

    max_pairs = int(config["max_pairs"])
    per_item_cap = int(config["max_pairs_per_item"])

    # Index existing groups by member id. A pair counts as already covered when
    # any single group contains both ids, including groups of three or more.
    groups_by_id: defaultdict[str, list[frozenset[str]]] = defaultdict(list)
    for candidate in existing_candidates:
        group = _pair_key(list(candidate.get("item_ids") or []))
        for item_id in group:
            groups_by_id[item_id].append(group)

    pair_counts: defaultdict[str, int] = defaultdict(int)
    recalled: list[dict[str, Any]] = []

    for index, left in enumerate(items):
        # Checked before scoring so that max_pairs <= 0 recalls nothing.
        if len(recalled) >= max_pairs:
            break
        for right in items[index + 1 :]:
            # Once the global cap or left's own cap is hit, no later partner
            # can be accepted for this left item, so stop scanning.
            if len(recalled) >= max_pairs or pair_counts[left.id] >= per_item_cap:
                break
            if pair_counts[right.id] >= per_item_cap:
                continue
            if any(right.id in group for group in groups_by_id.get(left.id, ())):
                continue

            scored = _candidate_score(left, right, config)
            if scored is None:
                continue

            score, reason, evidence = scored
            recalled.append(
                {
                    "item_ids": [left.id, right.id],
                    "reason": reason,
                    "score": round(score, 3),
                    "confidence": "medium",
                    **evidence,
                }
            )
            pair_counts[left.id] += 1
            pair_counts[right.id] += 1

    candidates = existing_candidates + recalled
    report = _recall_report(
        enabled=True,
        input_count=len(items),
        existing_count=existing_count,
        candidates=candidates,
    )
    return candidates, report