Refactor AI daily report pipeline
This commit is contained in:
58
ai_daily_report/sources/juya.py
Normal file
58
ai_daily_report/sources/juya.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import Any, Callable
|
||||
|
||||
from ai_daily_report.models import SourceConfig
|
||||
from ai_daily_report.normalize import clean_text
|
||||
from ai_daily_report.sources.labels import source_label_from_url
|
||||
|
||||
|
||||
# Fetcher callable injected into fetch_juya: invoked as
# fetch_text(url, timeout_seconds) and expected to return the response text.
FetchText = Callable[[str, int], str]
|
||||
|
||||
|
||||
# Namespaced tag of the RSS ``content:encoded`` element holding the issue HTML.
_JUYA_CONTENT_TAG = "{http://purl.org/rss/1.0/modules/content/}encoded"

# One numbered entry of an issue: an <h2> heading whose title may or may not be
# wrapped in a link, ending in <code>#N</code>, followed by the entry body. The
# body runs up to the next "<hr><h2", the trailing "提示" notice, or the end of
# the document. The closing </a> is optional as a whole, mirroring the optional
# opening <a ...>, so headings without a link are matched too.
_JUYA_BLOCK_RE = re.compile(
    r'<h2[^>]*>\s*(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?(?P<title_html>[^<]*?)(?:</a>)?\s*<code>#(?P<num>\d+)</code>\s*</h2>(?P<body>.*?)(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)',
    re.S | re.I,
)
_JUYA_HREF_RE = re.compile(r'<a[^>]*href="([^"]+)"[^>]*>', re.I)
_JUYA_TAG_RE = re.compile(r"<[^>]+>")


def _juya_issue_html(xml_text: str, run_date: str) -> str:
    """Return the ``content:encoded`` HTML of the first item titled *run_date*, or ``""``."""
    channel = ET.fromstring(xml_text).find("channel")
    if channel is None:
        return ""
    for item in channel.findall("item"):
        if (item.findtext("title") or "").strip() == run_date:
            # Only the first item with a matching title is considered, even
            # when its content turns out to be missing or empty.
            return item.findtext(_JUYA_CONTENT_TAG) or ""
    return ""


def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]:
    """Extract the numbered entries of the Juya issue published as *run_date*.

    The feed item whose ``<title>`` equals *run_date* (after stripping
    whitespace) is located, and its ``content:encoded`` HTML is split into
    entries, one per ``<h2>`` heading that ends in ``<code>#N</code>``.

    Args:
        config: Source configuration; ``config.name`` is used as the
            ``source_group`` and as the fallback source label.
        xml_text: Raw RSS document.
        run_date: Title of the feed item to parse.

    Returns:
        One dict per entry with a non-empty title. ``url`` is the first link
        in the entry body, falling back to the heading link (or ``""``);
        ``summary_raw`` is the tag-stripped body truncated to 500 characters.
        An empty list is returned when no item matches or it has no content.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_text* is not well-formed XML.
    """
    article_html = _juya_issue_html(xml_text, run_date)
    if not article_html:
        return []

    items: list[dict[str, Any]] = []
    for match in _JUYA_BLOCK_RE.finditer(article_html):
        title = clean_text(match.group("title_html") or "")
        if not title:
            continue

        body_html = match.group("body") or ""
        links = _JUYA_HREF_RE.findall(body_html)
        raw_url = links[0] if links else (match.group("title_url") or "")
        # href values come straight out of HTML, where "&" is written as the
        # entity "&amp;"; decode it so the URL is usable as-is.
        url = raw_url.replace("&amp;", "&").strip()
        summary = clean_text(_JUYA_TAG_RE.sub(" ", body_html))

        items.append(
            {
                "source_group": config.name,
                "source_label": source_label_from_url(url, fallback=config.name),
                "title_raw": title,
                "summary_raw": summary[:500],
                "url": url,
                "published_at": None,
                "origin_type": "juya_issue",
                "section_hint": "",
                "language_hint": "zh",
            }
        )
    return items
|
||||
|
||||
|
||||
def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the Juya feed and return the parsed entries for *run_date*.

    ``fetch_text`` is called with ``config.url`` and ``config.timeout_seconds``;
    the text it returns is handed to ``parse_juya_rss`` together with *config*
    and *run_date*.
    """
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_juya_rss(config, feed_xml, run_date)
|
||||
Reference in New Issue
Block a user