59 lines
2.3 KiB
Python
59 lines
2.3 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
import xml.etree.ElementTree as ET
|
|
from typing import Any, Callable
|
|
|
|
from ai_daily_report.models import SourceConfig
|
|
from ai_daily_report.normalize import clean_text
|
|
from ai_daily_report.sources.labels import source_label_from_url
|
|
|
|
|
|
# Signature of the injected downloader: called as ``fetch_text(url, timeout_seconds)``
# and expected to return the response body as text (the result is parsed as RSS XML).
FetchText = Callable[[str, int], str]
|
|
|
|
|
|
def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]:
    """Extract the news entries of one daily issue from the Juya RSS feed.

    The feed is searched for the first ``<item>`` whose ``<title>`` (stripped)
    equals *run_date*; the HTML in its ``content:encoded`` element is then
    split into entries. Each entry starts with an ``<h2>`` heading that ends
    in ``<code>#N</code>`` and runs until the next ``<hr>``-separated heading,
    a ``<p><strong>提示</strong>`` paragraph, or the end of the document.

    Args:
        config: Source configuration; only ``config.name`` is read here, as
            the ``source_group`` and as the fallback for the source label.
        xml_text: Raw RSS document.
        run_date: Issue title to look for (compared for exact equality).

    Returns:
        One dict per entry with a non-empty title, in document order. An empty
        list when no item matches *run_date* or the matching item carries no
        ``content:encoded`` text.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_text* is not well-formed XML.
    """
    # Local import keeps this block self-contained (``html`` is not among the
    # module-level imports).
    import html

    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    raw_items = channel.findall("item") if channel is not None else []

    article_html = ""
    for raw in raw_items:
        if (raw.findtext("title") or "").strip() != run_date:
            continue
        content_el = raw.find("{http://purl.org/rss/1.0/modules/content/}encoded")
        article_html = content_el.text if content_el is not None and content_el.text else ""
        break  # only the first item titled run_date is considered
    if not article_html:
        return []

    block_pattern = re.compile(
        r'<h2[^>]*>\s*'
        # The heading text may or may not be wrapped in a link ...
        r'(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?'
        r'(?P<title_html>[^<]*?)'
        # ... so the closing tag must be optional as a whole. (`</a>?` would
        # only make the `>` optional and reject every unlinked heading.)
        r'(?:</a>)?'
        r'\s*<code>#(?P<num>\d+)</code>\s*</h2>'
        r'(?P<body>.*?)'
        r'(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)',
        re.S | re.I,
    )

    items: list[dict[str, Any]] = []
    for match in block_pattern.finditer(article_html):
        title = clean_text(match.group("title_html") or "")
        if not title:
            continue

        body_html = match.group("body") or ""
        links = re.findall(r'<a[^>]*href="([^"]+)"[^>]*>', body_html, re.I)
        # Prefer the first link in the entry body; fall back to the heading link.
        raw_url = links[0] if links else (match.group("title_url") or "")
        # href values are HTML attribute text, so entities such as `&amp;`
        # must be decoded to obtain the real URL.
        url = html.unescape(raw_url).strip()
        summary = clean_text(re.sub(r"<[^>]+>", " ", body_html))

        items.append(
            {
                "source_group": config.name,
                "source_label": source_label_from_url(url, fallback=config.name),
                "title_raw": title,
                "summary_raw": summary[:500],
                "url": url,
                "published_at": None,
                "origin_type": "juya_issue",
                "section_hint": "",
                "language_hint": "zh",
            }
        )
    return items
|
|
|
|
|
|
def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the Juya feed and return the entries of the *run_date* issue.

    The feed is fetched from ``config.url`` via *fetch_text*, passing
    ``config.timeout_seconds`` as its second argument, and the returned text
    is handed to ``parse_juya_rss`` unchanged.
    """
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_juya_rss(config, feed_xml, run_date)
|