# ai-daily-report/ai_daily_report/sources/juya.py

from __future__ import annotations

import re
import xml.etree.ElementTree as ET
from typing import Any, Callable

from ai_daily_report.models import SourceConfig
from ai_daily_report.normalize import clean_text
from ai_daily_report.sources.labels import source_label_from_url


# Signature of the injected fetcher used by ``fetch_juya``: it is called as
# ``fetch_text(url, timeout_seconds)`` and its return value is handed to the
# parser as the feed's XML text.
FetchText = Callable[[str, int], str]


# Clark-notation name of the RSS ``content:encoded`` element.
_CONTENT_ENCODED_TAG = "{http://purl.org/rss/1.0/modules/content/}encoded"

# One entry of the issue HTML: an ``<h2>`` heading that ends with
# ``<code>#N</code>``, followed by the entry body.
_ENTRY_BLOCK_RE = re.compile(
    r"<h2[^>]*>\s*"
    # Optional link wrapping the title.
    r'(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?'
    r"(?P<title_html>[^<]*?)"
    # The closing tag must be optional as a whole; ``</a>?`` would only make
    # the ``>`` optional and reject every heading without a title link.
    r"(?:</a>)?\s*"
    r"<code>#(?P<num>\d+)</code>\s*</h2>"
    r"(?P<body>.*?)"
    # The body runs until the next entry, the trailing notice, or the end.
    r"(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)",
    re.S | re.I,
)
_HREF_RE = re.compile(r'<a[^>]*href="([^"]+)"[^>]*>', re.I)
_TAG_RE = re.compile(r"<[^>]+>")


def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]:
    """Extract the entries of the issue titled ``run_date`` from an RSS feed.

    The first ``<item>`` under ``<channel>`` whose stripped ``<title>`` equals
    ``run_date`` is selected, and the HTML in its ``content:encoded`` element
    is split into entries, one per ``<h2>…<code>#N</code></h2>`` heading.

    Args:
        config: Source configuration; ``config.name`` is used as the
            ``source_group`` and as the fallback for ``source_label``.
        xml_text: The RSS document as text.
        run_date: Title of the feed item to select (compared exactly after
            stripping whitespace from the item title).

    Returns:
        One dict per entry with a non-empty title. An empty list is returned
        when there is no ``<channel>``, no item matches ``run_date``, or the
        matching item has no ``content:encoded`` text.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed.
    """
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    if channel is None:
        return []

    article_html = ""
    for raw in channel.findall("item"):
        if (raw.findtext("title") or "").strip() != run_date:
            continue
        # The first item with a matching title wins, even if it has no content.
        article_html = raw.findtext(_CONTENT_ENCODED_TAG) or ""
        break
    if not article_html:
        return []

    items: list[dict[str, Any]] = []
    for match in _ENTRY_BLOCK_RE.finditer(article_html):
        title = clean_text(match.group("title_html") or "")
        if not title:
            continue
        body_html = match.group("body") or ""
        links = _HREF_RE.findall(body_html)
        # Prefer the first link in the body; fall back to the heading link.
        raw_url = links[0] if links else (match.group("title_url") or "")
        # Both kinds of href come from HTML attributes, so normalise them alike.
        url = raw_url.replace("&amp;", "&").strip()
        summary = clean_text(_TAG_RE.sub(" ", body_html))
        items.append(
            {
                "source_group": config.name,
                "source_label": source_label_from_url(url, fallback=config.name),
                "title_raw": title,
                "summary_raw": summary[:500],
                "url": url,
                "published_at": None,
                "origin_type": "juya_issue",
                "section_hint": "",
                "language_hint": "zh",
            }
        )
    return items


def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Fetch the feed at ``config.url`` and parse the issue for ``run_date``.

    ``fetch_text`` is called with ``config.url`` and
    ``config.timeout_seconds``; the text it returns is passed to
    ``parse_juya_rss`` together with ``config`` and ``run_date``, and that
    function's result is returned unchanged.
    """
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_juya_rss(config, feed_xml, run_date)