# ai-daily-report/ai_daily_report/candidate_recall.py

from __future__ import annotations

import difflib
import re
from collections import defaultdict
from typing import Any

from .dedupe import _jaccard_similarity, _title_tokens
from .models import NewsItem


# Default settings for candidate recall. A caller-supplied config is merged over
# these key by key in recall_semantic_candidates, and _config_value falls back to
# them for any key the caller did not provide.
DEFAULT_CONFIG = dict(
    # When false, recall_semantic_candidates returns the existing candidates untouched.
    enabled=True,
    # Upper bound on the number of newly recalled pairs per call.
    max_pairs=80,
    # Upper bound on how many recalled pairs a single item id may take part in.
    max_pairs_per_item=5,
    # Minimum difflib.SequenceMatcher ratio between normalized titles.
    title_similarity_threshold=0.45,
    # Minimum Jaccard similarity between title token sets.
    title_jaccard_threshold=0.25,
    # Minimum Jaccard similarity between summary token sets.
    summary_jaccard_threshold=0.18,
    # Minimum number of shared entities for the "strong_entity_overlap" rule.
    strong_entity_overlap_threshold=2,
)

# Entity strings that _entity_tokens discards after extraction, so they never
# count as shared entities between two items. Matching is exact and
# case-sensitive. ("AI" is also dropped by the 3-character minimum length.)
STOP_ENTITIES = {
    # Generic technology terms.
    "AI", "API", "CLI", "LLM", "Open Source",
    # Platform and vendor names.
    "GitHub", "Google", "OpenAI", "Anthropic",
    "Microsoft", "Meta", "Amazon", "NVIDIA",
}


def _config_value(config: dict[str, Any], name: str):
    """Return the setting ``name`` from ``config``, falling back to ``DEFAULT_CONFIG``.

    A falsy ``config`` (``None`` or an empty mapping) is treated as "no overrides".

    Raises:
        KeyError: if ``name`` is not a key of ``DEFAULT_CONFIG``. The default is
            looked up eagerly, so this happens even when ``config`` supplies ``name``.
    """
    overrides = config if config else {}
    return overrides.get(name, DEFAULT_CONFIG[name])


def _text_tokens(value: str) -> set[str]:
    """Tokenize ``value`` by delegating to the ``_title_tokens`` helper from ``.dedupe``."""
    tokens = _title_tokens(value)
    return tokens


def _entity_tokens(value: str) -> set[str]:
    """Extract entity-like strings from ``value``.

    Two patterns are applied to the text (a falsy ``value`` is treated as ""):

    * runs of capitalized ASCII words, optionally joined by a single space or
      hyphen (later words may also start with a digit);
    * tokens made of letters followed by digits, optionally preceded by CJK or
      alphanumeric characters and followed by alphanumerics or hyphens.

    Matches shorter than three characters after stripping, and matches listed in
    ``STOP_ENTITIES``, are discarded.
    """
    source = value if value else ""
    patterns = (
        r"\b[A-Z][A-Za-z0-9]*(?:[- ][A-Z0-9][A-Za-z0-9]*)*\b",
        r"[\u4e00-\u9fffA-Za-z0-9]*[A-Za-z]+[0-9]+[A-Za-z0-9-]*",
    )
    kept: set[str] = set()
    for pattern in patterns:
        for match in re.findall(pattern, source):
            token = match.strip()
            if len(token) < 3:
                continue
            if token in STOP_ENTITIES:
                continue
            kept.add(token)
    return kept


def _pair_key(item_ids: list[str]) -> frozenset[str]:
    return frozenset(item_ids)


def _candidate_score(left: NewsItem, right: NewsItem, config: dict[str, Any]) -> tuple[float, str, dict[str, Any]] | None:
    """Decide whether ``left`` and ``right`` form a recall candidate pair.

    Rules are tried in order and the first match wins:

    1. ``"strong_entity_overlap"``: the number of shared entities reaches
       ``strong_entity_overlap_threshold`` and the summary Jaccard is positive.
       Score is ``0.55 + 0.1 * shared_count + 0.35 * summary_jaccard``, capped at 1.0.
    2. ``"title_similarity"``: the title sequence ratio reaches
       ``title_similarity_threshold`` and is corroborated by the title Jaccard
       threshold, twice the summary Jaccard threshold, or any shared entity.
       Score is the title ratio.
    3. ``"title_summary_jaccard"``: both the title and summary Jaccard reach their
       thresholds. Score is their mean.

    Returns:
        ``(score, reason, evidence)`` for the first matching rule, where
        ``evidence`` holds the rounded similarity metrics (plus
        ``shared_entities`` for rule 1), or ``None`` when no rule matches.
    """

    def threshold(name: str) -> float:
        # Resolved on demand so a setting is only read when its rule is evaluated.
        return float(_config_value(config, name))

    matcher = difflib.SequenceMatcher(None, left.title_norm, right.title_norm)
    title_ratio = matcher.ratio()
    title_jaccard = _jaccard_similarity(_text_tokens(left.title_norm), _text_tokens(right.title_norm))
    summary_jaccard = _jaccard_similarity(_text_tokens(left.summary_raw), _text_tokens(right.summary_raw))
    entities_left = _entity_tokens(f"{left.title_raw} {left.summary_raw}")
    entities_right = _entity_tokens(f"{right.title_raw} {right.summary_raw}")
    shared_entities = sorted(entities_left.intersection(entities_right))
    min_shared = int(_config_value(config, "strong_entity_overlap_threshold"))

    def similarity_evidence() -> dict[str, Any]:
        # Metrics reported with every match, rounded to three decimals.
        return {
            "title_similarity": round(title_ratio, 3),
            "title_jaccard": round(title_jaccard, 3),
            "summary_jaccard": round(summary_jaccard, 3),
        }

    # Rule 1: enough shared entities plus at least some summary overlap.
    if len(shared_entities) >= min_shared and summary_jaccard > 0:
        raw_score = 0.55 + len(shared_entities) * 0.1 + summary_jaccard * 0.35
        evidence = {"shared_entities": shared_entities, **similarity_evidence()}
        return min(1.0, raw_score), "strong_entity_overlap", evidence

    # Rule 2: similar titles, corroborated by one additional signal.
    if title_ratio >= threshold("title_similarity_threshold"):
        corroborated = (
            title_jaccard >= threshold("title_jaccard_threshold")
            or summary_jaccard >= threshold("summary_jaccard_threshold") * 2
            or shared_entities
        )
        if corroborated:
            return title_ratio, "title_similarity", similarity_evidence()

    # Rule 3: token overlap in both title and summary.
    lexical_match = (
        title_jaccard >= threshold("title_jaccard_threshold")
        and summary_jaccard >= threshold("summary_jaccard_threshold")
    )
    if not lexical_match:
        return None
    mean_jaccard = (title_jaccard + summary_jaccard) / 2
    return mean_jaccard, "title_summary_jaccard", similarity_evidence()


def recall_semantic_candidates(
    items: list[NewsItem],
    *,
    existing_candidates: list[dict[str, Any]] | None = None,
    config: dict[str, Any] | None = None,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Recall additional candidate pairs among ``items`` and append them to the existing ones.

    Every unordered pair of items is scored with ``_candidate_score`` in input
    order; matching pairs are accepted greedily until a cap is reached.

    Args:
        items: Items to compare pairwise. Each must expose ``id`` plus the
            attributes read by ``_candidate_score``.
        existing_candidates: Already-known candidate groups, each a dict with an
            ``"item_ids"`` list. They are copied, never mutated, and always come
            first in the returned list.
        config: Overrides merged over ``DEFAULT_CONFIG``.

    Returns:
        ``(candidates, report)`` where ``candidates`` is the existing groups
        followed by the newly recalled pairs, and ``report`` summarizes the
        counts and carries the same candidate list under ``"candidates"``.

    Notes:
        * A pair is skipped when both ids already appear together in one
          existing group, including groups with more than two members.
        * ``max_pairs`` is enforced before the first pair is accepted, so a
          value of zero or less recalls nothing.
        * ``max_pairs_per_item`` limits how many recalled pairs any single id
          may appear in.
    """
    config = {**DEFAULT_CONFIG, **(config or {})}
    existing_candidates = list(existing_candidates or [])
    if not bool(config.get("enabled", True)):
        return existing_candidates, {
            "enabled": False,
            "input_count": len(items),
            "existing_candidate_group_count": len(existing_candidates),
            "added_candidate_group_count": 0,
            "candidate_group_count": len(existing_candidates),
            "candidates": existing_candidates,
        }

    # Parse the caps once instead of on every inner-loop iteration.
    max_pairs = int(config["max_pairs"])
    max_pairs_per_item = int(config["max_pairs_per_item"])

    # Map each item id to the indices of the existing groups that contain it.
    # Two ids sharing any group index are already candidates of each other.
    groups_by_item: defaultdict[str, set[int]] = defaultdict(set)
    for group_index, candidate in enumerate(existing_candidates):
        for item_id in _pair_key(list(candidate.get("item_ids", []) or [])):
            groups_by_item[item_id].add(group_index)
    no_groups: frozenset[int] = frozenset()

    pair_counts: defaultdict[str, int] = defaultdict(int)
    recalled: list[dict[str, Any]] = []

    for index, left in enumerate(items):
        # Checked before any pair is accepted so a non-positive cap yields nothing.
        if len(recalled) >= max_pairs:
            break
        left_groups = groups_by_item.get(left.id, no_groups)
        for right in items[index + 1 :]:
            if pair_counts[left.id] >= max_pairs_per_item:
                # The left item is saturated; no later partner can be accepted.
                break
            if pair_counts[right.id] >= max_pairs_per_item:
                continue
            if not left_groups.isdisjoint(groups_by_item.get(right.id, no_groups)):
                continue
            scored = _candidate_score(left, right, config)
            if scored is None:
                continue
            score, reason, evidence = scored
            recalled.append(
                {
                    "item_ids": [left.id, right.id],
                    "reason": reason,
                    "score": round(score, 3),
                    "confidence": "medium",
                    **evidence,
                }
            )
            pair_counts[left.id] += 1
            pair_counts[right.id] += 1
            if len(recalled) >= max_pairs:
                break

    candidates = existing_candidates + recalled
    report = {
        "enabled": True,
        "input_count": len(items),
        "existing_candidate_group_count": len(existing_candidates),
        "added_candidate_group_count": len(recalled),
        "candidate_group_count": len(candidates),
        "candidates": candidates,
    }
    return candidates, report