fix: add cross-day dedupe

This commit is contained in:
Mimikko-zeus
2026-06-08 12:05:45 +08:00
parent 2671aee850
commit 07786e3bc0
16 changed files with 671 additions and 21 deletions

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import xml.etree.ElementTree as ET
from datetime import date, datetime
from email.utils import parsedate_to_datetime
from typing import Any, Callable
@@ -20,16 +21,57 @@ def _parse_pubdate(value: str) -> str | None:
return None
def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]:
def _parse_run_date(value: str | None) -> date | None:
    """Convert a run-date string into a ``date``, or ``None`` if unusable.

    Only the first ten characters are examined, so a full timestamp such as
    ``2026-06-08T12:05:45`` gives the same result as ``2026-06-08``.

    Args:
        value: Run date as text, or ``None``.

    Returns:
        The parsed calendar date, or ``None`` when *value* is empty/``None``
        or its leading ten characters are not a valid ISO date.
    """
    if value:
        day_part = value[:10]
        try:
            parsed = date.fromisoformat(day_part)
        except ValueError:
            # Not an ISO calendar date; report it as "unknown".
            parsed = None
        return parsed
    return None
def _parse_iso_date(value: str | None) -> date | None:
    """Return the calendar date of an ISO-format timestamp string.

    The date is taken exactly as written in the timestamp; no timezone
    conversion is applied before the time portion is dropped.

    Args:
        value: Timestamp text accepted by ``datetime.fromisoformat``, or
            ``None``.

    Returns:
        The date component, or ``None`` when *value* is empty/``None`` or
        ``datetime.fromisoformat`` rejects it.
    """
    if not value:
        return None
    try:
        moment = datetime.fromisoformat(value)
    except ValueError:
        return None
    return moment.date()
def _within_max_item_age(published_at: str | None, *, run_date: str | None, max_item_age_days: int | None) -> bool:
    """Decide whether an item is recent enough to keep.

    The check is permissive: an item is only rejected when an age limit is
    configured AND both dates can be parsed AND the item is older than the
    limit. An item dated after the run date has a negative age and so passes
    for any non-negative limit.

    Args:
        published_at: Item publication timestamp as text, or ``None``.
        run_date: Date of the current run as text, or ``None``.
        max_item_age_days: Maximum allowed age in whole days; ``None``
            disables the check.

    Returns:
        ``False`` only if the item is demonstrably older than the limit,
        otherwise ``True``.
    """
    if max_item_age_days is not None:
        reference_day = _parse_run_date(run_date)
        item_day = _parse_iso_date(published_at)
        if item_day is not None and reference_day is not None:
            age = reference_day - item_day
            return age.days <= max_item_age_days
    # No limit configured, or not enough date information to judge.
    return True
def parse_rss_items(
config: SourceConfig,
xml_text: str,
*,
limit: int = 20,
run_date: str | None = None,
) -> list[dict[str, Any]]:
root = ET.fromstring(xml_text)
channel = root.find("channel")
raw_items = channel.findall("item") if channel is not None else []
items: list[dict[str, Any]] = []
for raw in raw_items[:limit]:
for raw in raw_items:
title = clean_text(raw.findtext("title") or "")
if not title:
continue
summary = clean_text(raw.findtext("description") or "")
published_at = _parse_pubdate(raw.findtext("pubDate") or "")
if not _within_max_item_age(
published_at,
run_date=run_date,
max_item_age_days=config.max_item_age_days,
):
continue
items.append(
{
"source_group": config.name,
@@ -37,15 +79,16 @@ def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) ->
"title_raw": title,
"summary_raw": summary,
"url": (raw.findtext("link") or "").strip(),
"published_at": _parse_pubdate(raw.findtext("pubDate") or ""),
"published_at": published_at,
"origin_type": "rss",
"section_hint": "",
"language_hint": "en" if title.encode("utf-8").isascii() else "zh",
}
)
if len(items) >= limit:
break
return items
def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download an RSS feed and parse it into item dictionaries.

    Args:
        config: Source settings; ``url`` and ``timeout_seconds`` are read
            here and the whole object is handed on to ``parse_rss_items``.
        run_date: Date of the current run, forwarded to the parser.
        fetch_text: Callable invoked as ``fetch_text(url, timeout_seconds)``
            whose result is passed to the parser as the feed XML.

    Returns:
        Whatever ``parse_rss_items`` produces for the fetched document.
    """
    xml_text = fetch_text(config.url, config.timeout_seconds)
    # run_date must be forwarded: the stale call that omitted it left the
    # parser without a reference date.
    return parse_rss_items(config, xml_text, run_date=run_date)