Refactor AI daily report pipeline
This commit is contained in:
51
ai_daily_report/sources/rss.py
Normal file
51
ai_daily_report/sources/rss.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
from email.utils import parsedate_to_datetime
|
||||
from typing import Any, Callable
|
||||
|
||||
from ai_daily_report.models import SourceConfig
|
||||
from ai_daily_report.normalize import clean_text
|
||||
|
||||
|
||||
# Signature of the injected downloader: it is called as
# fetch_text(url, timeout_seconds) and must return the response body as text.
FetchText = Callable[[str, int], str]
|
||||
|
||||
|
||||
def _parse_pubdate(value: str) -> str | None:
|
||||
if not value:
|
||||
return None
|
||||
try:
|
||||
return parsedate_to_datetime(value).isoformat()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _contains_han(text: str) -> bool:
    """Return True if *text* contains at least one Han (CJK) ideograph."""
    return any(
        "\u4e00" <= ch <= "\u9fff"  # CJK Unified Ideographs
        or "\u3400" <= ch <= "\u4dbf"  # CJK Unified Ideographs Extension A
        or "\uf900" <= ch <= "\ufaff"  # CJK Compatibility Ideographs
        or "\U00020000" <= ch <= "\U000323af"  # supplementary-plane extensions
        for ch in text
    )


def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]:
    """Parse an RSS 2.0 document into a list of raw item dictionaries.

    Only ``<item>`` elements directly under the root's ``<channel>`` child are
    read. Items whose cleaned title is empty are skipped.

    Args:
        config: Source configuration; ``config.name`` is used for both
            ``source_group`` and ``source_label``.
        xml_text: The RSS document as text.
        limit: Maximum number of ``<item>`` elements to inspect. The cap is
            applied before items with empty titles are dropped, so fewer than
            ``limit`` entries may be returned.

    Returns:
        One dict per accepted item with the keys ``source_group``,
        ``source_label``, ``title_raw``, ``summary_raw``, ``url``,
        ``published_at``, ``origin_type``, ``section_hint`` and
        ``language_hint``. An empty list is returned when the document has no
        ``<channel>`` element.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_text* is not well-formed XML.
    """
    # NOTE(security): ElementTree is not hardened against maliciously
    # constructed XML; feeds should come from trusted sources only.
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    if channel is None:
        return []

    items: list[dict[str, Any]] = []
    for raw in channel.findall("item")[:limit]:
        title = clean_text(raw.findtext("title") or "")
        if not title:
            continue
        summary = clean_text(raw.findtext("description") or "")
        items.append(
            {
                "source_group": config.name,
                "source_label": config.name,
                "title_raw": title,
                "summary_raw": summary,
                "url": (raw.findtext("link") or "").strip(),
                "published_at": _parse_pubdate(raw.findtext("pubDate") or ""),
                "origin_type": "rss",
                "section_hint": "",
                # Only flag a title as Chinese when it actually contains Han
                # characters; a plain "is it ASCII?" test mislabels English
                # titles that contain curly quotes, dashes, accents or emoji.
                "language_hint": "zh" if _contains_han(title) else "en",
            }
        )
    return items
|
||||
|
||||
|
||||
def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the feed described by *config* and parse it into raw items.

    Args:
        config: Source configuration supplying ``url`` and ``timeout_seconds``.
        run_date: Not used by this function.
        fetch_text: Callable invoked as ``fetch_text(url, timeout_seconds)``
            that returns the feed body as text.

    Returns:
        The items produced by ``parse_rss_items`` with its default limit.
    """
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_rss_items(config, feed_xml)
|
||||
|
||||
Reference in New Issue
Block a user