Files
ai-daily-report/ai_daily_report/sources/juya.py
2026-06-04 15:21:56 +08:00

59 lines
2.3 KiB
Python

from __future__ import annotations
import re
import xml.etree.ElementTree as ET
from typing import Any, Callable
from ai_daily_report.models import SourceConfig
from ai_daily_report.normalize import clean_text
from ai_daily_report.sources.labels import source_label_from_url
# Type of the fetcher callable passed to fetch_juya: it is invoked positionally
# as fetch_text(url, timeout_seconds) and its str result is parsed as the feed XML.
FetchText = Callable[[str, int], str]
# One numbered entry of an issue: an <h2> heading whose text may be wrapped in
# a link and which ends with a <code>#N</code> marker, followed by the entry
# body. The body stops before the next "<hr><h2", before the "提示" paragraph,
# or at the end of the text. The closing </a> is optional as a whole
# ("(?:</a>)?") so that headings without a link are matched as well.
JUYA_BLOCK_PATTERN = re.compile(
    r'<h2[^>]*>\s*(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?(?P<title_html>[^<]*?)(?:</a>)?\s*<code>#(?P<num>\d+)</code>\s*</h2>(?P<body>.*?)(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)',
    re.S | re.I,
)
# Captures the href value of every <a> tag inside an entry body.
JUYA_HREF_PATTERN = re.compile(r'<a[^>]*href="([^"]+)"[^>]*>', re.I)
# Any HTML tag; used to reduce an entry body to plain text.
JUYA_TAG_PATTERN = re.compile(r"<[^>]+>")
# Namespaced tag name of the RSS <content:encoded> element.
_CONTENT_ENCODED_TAG = "{http://purl.org/rss/1.0/modules/content/}encoded"


def _find_issue_html(xml_text: str, run_date: str) -> str:
    """Return the HTML of the first feed item titled *run_date*, or ``""``."""
    channel = ET.fromstring(xml_text).find("channel")
    if channel is None:
        return ""
    for raw in channel.findall("item"):
        if (raw.findtext("title") or "").strip() != run_date:
            continue
        # Only the first item with a matching title is considered, even if
        # its content turns out to be missing or empty.
        content_el = raw.find(_CONTENT_ENCODED_TAG)
        if content_el is not None and content_el.text:
            return content_el.text
        return ""
    return ""


def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]:
    """Extract the entries of the issue titled *run_date* from an RSS feed.

    The feed item whose ``<title>`` (stripped) equals ``run_date`` is located
    and its ``<content:encoded>`` HTML is split into numbered entries, one per
    ``<h2>`` heading that carries a ``<code>#N</code>`` marker.

    Args:
        config: Source configuration; ``config.name`` is used as the source
            group and as the fallback source label.
        xml_text: Raw RSS document.
        run_date: Issue title to look for.

    Returns:
        One dict per entry with a non-empty title. Returns an empty list when
        the feed has no channel, no item titled ``run_date``, or that item has
        no content.

    Raises:
        xml.etree.ElementTree.ParseError: if ``xml_text`` is not well-formed.
    """
    article_html = _find_issue_html(xml_text, run_date)
    if not article_html:
        return []

    items: list[dict[str, Any]] = []
    for match in JUYA_BLOCK_PATTERN.finditer(article_html):
        title = clean_text(match.group("title_html") or "")
        if not title:
            continue
        body_html = match.group("body") or ""
        links = JUYA_HREF_PATTERN.findall(body_html)
        # Prefer the first link in the body; fall back to the heading's own
        # link. Both come from HTML attributes, so both get the same
        # "&amp;" decoding and whitespace stripping.
        raw_url = links[0] if links else (match.group("title_url") or "")
        url = raw_url.replace("&amp;", "&").strip()
        summary = clean_text(JUYA_TAG_PATTERN.sub(" ", body_html))
        items.append(
            {
                "source_group": config.name,
                "source_label": source_label_from_url(url, fallback=config.name),
                "title_raw": title,
                "summary_raw": summary[:500],
                "url": url,
                "published_at": None,
                "origin_type": "juya_issue",
                "section_hint": "",
                "language_hint": "zh",
            }
        )
    return items
def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Fetch the feed at ``config.url`` and parse the issue for *run_date*.

    ``fetch_text`` is called positionally with ``config.url`` and
    ``config.timeout_seconds``; the text it returns is passed to
    ``parse_juya_rss`` together with ``config`` and ``run_date``.
    """
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_juya_rss(config, feed_xml, run_date)