from __future__ import annotations import re import xml.etree.ElementTree as ET from typing import Any, Callable from ai_daily_report.models import SourceConfig from ai_daily_report.normalize import clean_text from ai_daily_report.sources.labels import source_label_from_url FetchText = Callable[[str, int], str] def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]: root = ET.fromstring(xml_text) channel = root.find("channel") raw_items = channel.findall("item") if channel is not None else [] article_html = "" for raw in raw_items: if (raw.findtext("title") or "").strip() != run_date: continue content_el = raw.find("{http://purl.org/rss/1.0/modules/content/}encoded") article_html = content_el.text if content_el is not None and content_el.text else "" break if not article_html: return [] block_pattern = re.compile( r']*>\s*(?:]*href="(?P[^"]+)"[^>]*>)?(?P[^<]*?)?\s*#(?P\d+)\s*(?P.*?)(?=\s*提示|$)', re.S | re.I, ) items: list[dict[str, Any]] = [] for match in block_pattern.finditer(article_html): title = clean_text(match.group("title_html") or "") body_html = match.group("body") or "" links = re.findall(r']*href="([^"]+)"[^>]*>', body_html, re.I) url = links[0].replace("&", "&").strip() if links else (match.group("title_url") or "") summary = clean_text(re.sub(r"<[^>]+>", " ", body_html)) if title: items.append( { "source_group": config.name, "source_label": source_label_from_url(url, fallback=config.name), "title_raw": title, "summary_raw": summary[:500], "url": url, "published_at": None, "origin_type": "juya_issue", "section_hint": "", "language_hint": "zh", } ) return items def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]: return parse_juya_rss(config, fetch_text(config.url, config.timeout_seconds), run_date)