fix: add cross-day dedupe
This commit is contained in:
@@ -1,9 +1,16 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
import re
|
||||
from datetime import date, datetime
|
||||
from typing import Any
|
||||
|
||||
from .models import NewsItem
|
||||
from .models import NewsItem, PublishedUrlEntry, PublishedUrls
|
||||
|
||||
|
||||
TITLE_SIMILARITY_THRESHOLD = 0.50
|
||||
TOKEN_JACCARD_THRESHOLD = 0.40
|
||||
TOKEN_EDIT_DISTANCE_THRESHOLD = 0.40
|
||||
|
||||
|
||||
def _item_score(item: NewsItem) -> int:
|
||||
@@ -52,6 +59,18 @@ def _group_by_key(items: list[NewsItem], key_name: str) -> dict[str, list[NewsIt
|
||||
return {key: group for key, group in groups.items() if len(group) > 1}
|
||||
|
||||
|
||||
def _title_tokens(value: str) -> set[str]:
|
||||
if not value:
|
||||
return set()
|
||||
return set(re.findall(r"[a-z0-9]+|[\u4e00-\u9fff]", value.lower()))
|
||||
|
||||
|
||||
def _jaccard_similarity(left: set[str], right: set[str]) -> float:
|
||||
if not left or not right:
|
||||
return 0.0
|
||||
return len(left & right) / len(left | right)
|
||||
|
||||
|
||||
def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
|
||||
possible: list[dict[str, Any]] = []
|
||||
for index, left in enumerate(items):
|
||||
@@ -59,12 +78,16 @@ def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
|
||||
if not left.title_norm or not right.title_norm:
|
||||
continue
|
||||
ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
|
||||
if ratio >= 0.65:
|
||||
jaccard = _jaccard_similarity(_title_tokens(left.title_norm), _title_tokens(right.title_norm))
|
||||
if ratio >= TITLE_SIMILARITY_THRESHOLD or (
|
||||
ratio >= TOKEN_EDIT_DISTANCE_THRESHOLD and jaccard >= TOKEN_JACCARD_THRESHOLD
|
||||
):
|
||||
possible.append(
|
||||
{
|
||||
"item_ids": [left.id, right.id],
|
||||
"reason": "title_similarity",
|
||||
"similarity": round(ratio, 3),
|
||||
"token_jaccard": round(jaccard, 3),
|
||||
"confidence": "medium",
|
||||
}
|
||||
)
|
||||
@@ -98,3 +121,62 @@ def hard_dedup_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, A
|
||||
"possible_duplicates": _possible_duplicates(deduped),
|
||||
}
|
||||
return deduped, report
|
||||
|
||||
|
||||
def _parse_date(value: str | None) -> date | None:
|
||||
if not value:
|
||||
return None
|
||||
text = value.strip()
|
||||
try:
|
||||
return date.fromisoformat(text[:10])
|
||||
except ValueError:
|
||||
try:
|
||||
return datetime.fromisoformat(text).date()
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
|
||||
if max_age_days < 0:
|
||||
return True
|
||||
current = _parse_date(run_date)
|
||||
previous = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
|
||||
if current is None or previous is None:
|
||||
return True
|
||||
return (current - previous).days <= max_age_days
|
||||
|
||||
|
||||
def cross_day_dedup_items(
    items: list[NewsItem],
    published_urls: PublishedUrls | None,
    *,
    run_date: str,
    max_age_days: int = 7,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Drop items whose canonical URL is already in the publish history.

    An item is removed when its ``canonical_url`` is non-empty, has an
    entry in ``published_urls.urls``, and that entry passes
    ``_entry_within_window`` for *run_date* / *max_age_days*. All other
    items are kept in their original order.

    Args:
        items: Candidate items for the current run.
        published_urls: Publish history; a falsy value is replaced by a
            fresh ``PublishedUrls()``.
        run_date: Date of the current run, forwarded to the window check.
        max_age_days: Window size in days, forwarded to the window check.

    Returns:
        ``(kept_items, report)``. The report holds the input, output and
        removed counts, one record per removed item, and the
        ``max_age_days`` that was used.
    """
    history = published_urls or PublishedUrls()
    kept: list[NewsItem] = []
    dropped: list[dict[str, Any]] = []

    for candidate in items:
        # Items without a canonical URL cannot be matched against history.
        prior = history.urls.get(candidate.canonical_url) if candidate.canonical_url else None
        if not prior or not _entry_within_window(prior, run_date=run_date, max_age_days=max_age_days):
            kept.append(candidate)
            continue

        record = {
            "item_id": candidate.id,
            "canonical_url": candidate.canonical_url,
            "title": candidate.title or candidate.title_raw,
            "first_seen": prior.first_seen,
            "last_published": prior.last_published,
        }
        dropped.append(record)

    report = {
        "input_count": len(items),
        "output_count": len(kept),
        "removed_count": len(dropped),
        "removed": dropped,
        "max_age_days": max_age_days,
    }
    return kept, report
|
||||
|
||||
Reference in New Issue
Block a user