Refactor AI daily report pipeline

This commit is contained in:
Mimikko-zeus
2026-06-04 15:21:56 +08:00
parent 94e18ce22d
commit 5a98696255
64 changed files with 4778 additions and 1316 deletions

View File

@@ -0,0 +1,2 @@
"""Source adapters for the AI daily report pipeline."""

View File

@@ -0,0 +1,32 @@
from __future__ import annotations
import json
from typing import Any, Callable
from ai_daily_report.models import SourceConfig
# Signature of the injected downloader: fetch_aihot calls it with a URL and
# config.timeout_seconds and feeds the returned text to json.loads.
FetchText = Callable[[str, int], str]
def fetch_aihot(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the AIHOT daily digest for *run_date* and flatten it into item dicts.

    Args:
        config: Source settings; ``name`` and ``timeout_seconds`` are read.
        run_date: Date string appended to the public daily endpoint URL.
        fetch_text: Callable invoked as ``fetch_text(url, timeout)`` that
            returns the JSON response body as text.

    Returns:
        One dict per entry of every section, in document order. Missing
        string fields become ``""``; ``published_at`` is the digest-level
        ``generatedAt`` value (or ``None`` when absent).
    """
    endpoint = f"https://aihot.virxact.com/api/public/daily/{run_date}"
    payload = json.loads(fetch_text(endpoint, config.timeout_seconds))
    stamp = payload.get("generatedAt")
    # A null/missing "sections" or "items" value is treated as an empty list.
    return [
        {
            "source_group": config.name,
            "source_label": entry.get("sourceName") or config.name,
            "title_raw": entry.get("title") or "",
            "summary_raw": entry.get("summary") or "",
            "url": entry.get("sourceUrl") or "",
            "published_at": stamp,
            "origin_type": "aihot_json",
            "section_hint": section.get("label") or "",
            "language_hint": "zh",
        }
        for section in (payload.get("sections") or [])
        for entry in (section.get("items") or [])
    ]

View File

@@ -0,0 +1,58 @@
from __future__ import annotations
import re
import xml.etree.ElementTree as ET
from typing import Any, Callable
from ai_daily_report.models import SourceConfig
from ai_daily_report.normalize import clean_text
from ai_daily_report.sources.labels import source_label_from_url
# Signature of the injected downloader: fetch_juya calls it with config.url and
# config.timeout_seconds and passes the returned text on as RSS XML.
FetchText = Callable[[str, int], str]
def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]:
    """Extract the numbered news entries from the Juya RSS issue titled *run_date*.

    The feed is searched for the first ``<item>`` whose ``<title>`` equals
    ``run_date``; its ``content:encoded`` HTML is then split into entries.
    An entry starts at an ``<h2>`` heading that ends with ``<code>#N</code>``
    (the heading text may or may not be wrapped in a link) and runs until the
    next ``<hr>``-separated heading, the trailing "提示" paragraph, or the end
    of the HTML.

    Args:
        config: Source settings; only ``name`` is read here.
        xml_text: Raw RSS document.
        run_date: Issue title to look for (compared after stripping whitespace).

    Returns:
        One dict per entry with a non-empty title. ``url`` is the first link in
        the entry body, falling back to the heading link; ``summary_raw`` is the
        tag-stripped body truncated to 500 characters. Returns ``[]`` when no
        matching issue (or no HTML content) is found.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed XML.
    """
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    raw_items = channel.findall("item") if channel is not None else []

    article_html = ""
    for raw in raw_items:
        if (raw.findtext("title") or "").strip() != run_date:
            continue
        content_el = raw.find("{http://purl.org/rss/1.0/modules/content/}encoded")
        article_html = content_el.text if content_el is not None and content_el.text else ""
        break  # only the first issue with a matching title is considered
    if not article_html:
        return []

    block_pattern = re.compile(
        r'<h2[^>]*>\s*'
        r'(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?'
        r'(?P<title_html>[^<]*?)'
        # The closing tag must be optional as a whole. The previous `</a>?`
        # only made the ">" optional, so headings without a link never matched.
        r'(?:</a>)?\s*'
        r'<code>#(?P<num>\d+)</code>\s*</h2>'
        r'(?P<body>.*?)'
        r'(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)',
        re.S | re.I,
    )
    link_pattern = re.compile(r'<a[^>]*href="([^"]+)"[^>]*>', re.I)

    items: list[dict[str, Any]] = []
    for match in block_pattern.finditer(article_html):
        title = clean_text(match.group("title_html") or "")
        if not title:
            continue
        body_html = match.group("body") or ""
        links = link_pattern.findall(body_html)
        # Prefer the first link in the body; fall back to the heading link.
        raw_url = links[0] if links else (match.group("title_url") or "")
        # Both candidates come from HTML attributes, so normalise "&amp;" for
        # either one (previously only body links were normalised).
        url = raw_url.replace("&amp;", "&").strip()
        summary = clean_text(re.sub(r"<[^>]+>", " ", body_html))
        items.append(
            {
                "source_group": config.name,
                "source_label": source_label_from_url(url, fallback=config.name),
                "title_raw": title,
                "summary_raw": summary[:500],
                "url": url,
                "published_at": None,
                "origin_type": "juya_issue",
                "section_hint": "",
                "language_hint": "zh",
            }
        )
    return items
def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the Juya feed at ``config.url`` and parse the issue for *run_date*.

    The feed text is obtained via ``fetch_text(config.url, config.timeout_seconds)``
    and handed to :func:`parse_juya_rss`, whose result is returned unchanged.
    """
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_juya_rss(config, feed_xml, run_date)

View File

@@ -0,0 +1,78 @@
from __future__ import annotations
from urllib.parse import urlparse
# Display label for each known domain. _domain_label() returns the label when
# the URL host equals the key or is a subdomain of it; hosts not covered here
# are returned as-is.
DOMAIN_LABELS = {
"anthropic.com": "Anthropic",
"arxiv.org": "arXiv",
"bloomberg.com": "Bloomberg",
"deepseek.com": "DeepSeek",
"github.blog": "GitHub Blog",
"github.com": "GitHub",
"huggingface.co": "Hugging Face",
"infoq.com": "InfoQ",
"mp.weixin.qq.com": "微信公众号",
"openai.com": "OpenAI",
"platform.minimaxi.com": "MiniMaxDocs",
"qbitai.com": "量子位",
"techcrunch.com": "TechCrunch",
"technologyreview.com": "MIT科技评论AI",
"theverge.com": "The Verge",
# x.com and twitter.com share the same label.
"x.com": "X",
"twitter.com": "X",
}
# Human-readable names for X account handles (the first path segment of an
# x.com / twitter.com URL). source_label_from_url() falls back to the handle
# itself when it is not listed here.
X_DISPLAY_NAMES = {
"MiniMax_AI": "MiniMax",
"OpenAIDevs": "OpenAI Developers",
"openai": "OpenAI",
"openclaw": "OpenClaw",
"xai": "xAI",
"krea_ai": "Krea AI",
"nvidia": "NVIDIA",
"NVIDIAAI": "NVIDIA AI",
"alibaba_cloud": "阿里云 / Alibaba Cloud",
"cb_doge": "cb_doge",
}
def _host(url: str) -> str:
host = (urlparse(url).netloc or "").lower()
return host[4:] if host.startswith("www.") else host
def _domain_label(host: str) -> str:
    """Map *host* to its display label from ``DOMAIN_LABELS``.

    The first table entry whose domain equals *host*, or of which *host* is a
    subdomain, wins. A host matching no entry is returned unchanged.
    """
    matches = (
        label
        for domain, label in DOMAIN_LABELS.items()
        if host == domain or host.endswith("." + domain)
    )
    return next(matches, host)
def _x_handle(url: str) -> str:
parts = [part for part in urlparse(url).path.split("/") if part]
if not parts:
return ""
handle = parts[0]
if handle in {"i", "search", "explore", "settings", "notifications", "home", "compose"}:
return ""
return handle
def source_label_from_url(url: str, *, fallback: str = "来源") -> str:
    """Derive a human-readable source label from an article URL.

    Args:
        url: Link to the article or post.
        fallback: Label returned when *url* is empty or yields no host.

    Returns:
        * For x.com / twitter.com links with an account handle:
          ``"X<display name> (@<handle>)"``, where the display name comes from
          ``X_DISPLAY_NAMES`` (or is the handle itself when unlisted); plain
          ``"X"`` when the URL carries no handle.
        * Otherwise the ``DOMAIN_LABELS`` label (or the bare host), with
          ``"Blog"`` appended when the host contains ``blog`` or the path
          contains ``/blog`` or ``/research``.
        * *fallback* when nothing usable can be derived.
    """
    if not url:
        return fallback

    host = _host(url)
    if host in {"x.com", "twitter.com"}:
        handle = _x_handle(url)
        if not handle:
            return "X"
        display = X_DISPLAY_NAMES.get(handle)
        if display is None:
            # X handles are not case-sensitive, so retry the lookup ignoring
            # case before falling back to the raw handle.
            folded = handle.casefold()
            display = next(
                (name for key, name in X_DISPLAY_NAMES.items() if key.casefold() == folded),
                handle,
            )
        return f"X{display} (@{handle})"

    label = _domain_label(host)
    path = (urlparse(url).path or "").lower()
    looks_like_blog = "blog" in host or "/blog" in path or "/research" in path
    # Skip the suffix when the label already ends with it; otherwise
    # github.blog ("GitHub Blog") would become "GitHub BlogBlog".
    if label and looks_like_blog and not label.endswith("Blog"):
        return f"{label}Blog"
    return label or fallback

View File

@@ -0,0 +1,24 @@
from __future__ import annotations
from typing import Callable
from ai_daily_report.models import SourceConfig
from ai_daily_report.sources.aihot import fetch_aihot
from ai_daily_report.sources.juya import fetch_juya
from ai_daily_report.sources.rss import fetch_rss
# A source fetcher takes (source config, run date string, text downloader) and
# returns the list of item dicts it produced.
SourceFetcher = Callable[[SourceConfig, str, Callable[[str, int], str]], list[dict]]
# Registry mapping a source type name to the fetcher that handles it; consulted
# by get_source_fetcher().
SOURCE_FETCHERS: dict[str, SourceFetcher] = {
"aihot": fetch_aihot,
"rss": fetch_rss,
"juya_rss": fetch_juya,
}
def get_source_fetcher(source_type: str) -> SourceFetcher:
    """Look up the fetcher registered for *source_type*.

    Raises:
        KeyError: If *source_type* is not a key of ``SOURCE_FETCHERS``; the
            message names the offending type.
    """
    if source_type in SOURCE_FETCHERS:
        return SOURCE_FETCHERS[source_type]
    raise KeyError(f"Unknown source type: {source_type}")

View File

@@ -0,0 +1,51 @@
from __future__ import annotations
import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime
from typing import Any, Callable
from ai_daily_report.models import SourceConfig
from ai_daily_report.normalize import clean_text
# Signature of the injected downloader: fetch_rss calls it with config.url and
# config.timeout_seconds and passes the returned text on as RSS XML.
FetchText = Callable[[str, int], str]
def _parse_pubdate(value: str) -> str | None:
if not value:
return None
try:
return parsedate_to_datetime(value).isoformat()
except Exception:
return None
def _language_hint(title: str) -> str:
    """Return ``"zh"`` when *title* contains a CJK ideograph, otherwise ``"en"``."""
    has_cjk = any(
        "\u4e00" <= ch <= "\u9fff" or "\u3400" <= ch <= "\u4dbf" for ch in title
    )
    return "zh" if has_cjk else "en"


def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]:
    """Parse an RSS 2.0 document into normalised item dicts.

    Args:
        config: Source settings; ``name`` is used as both group and label.
        xml_text: Raw RSS document.
        limit: Only the first *limit* ``<item>`` elements of the channel are
            examined (items without a title among them are skipped, so fewer
            than *limit* results may be returned).

    Returns:
        One dict per titled item with cleaned title and description, the
        stripped ``<link>``, and ``<pubDate>`` converted to ISO 8601 (``None``
        when missing or unparseable). Returns ``[]`` when the document has no
        ``<channel>``.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed XML.
    """
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    raw_items = channel.findall("item") if channel is not None else []
    items: list[dict[str, Any]] = []
    for raw in raw_items[:limit]:
        title = clean_text(raw.findtext("title") or "")
        if not title:
            continue
        summary = clean_text(raw.findtext("description") or "")
        items.append(
            {
                "source_group": config.name,
                "source_label": config.name,
                "title_raw": title,
                "summary_raw": summary,
                "url": (raw.findtext("link") or "").strip(),
                "published_at": _parse_pubdate(raw.findtext("pubDate") or ""),
                "origin_type": "rss",
                "section_hint": "",
                # Detect Chinese by the presence of CJK ideographs. Treating
                # every non-ASCII character as Chinese mislabelled English
                # titles that contain curly quotes, em dashes or accents.
                "language_hint": _language_hint(title),
            }
        )
    return items
def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the feed at ``config.url`` and parse it with :func:`parse_rss_items`.

    ``run_date`` is accepted so the signature matches the other source
    fetchers; it is not used here.
    """
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_rss_items(config, feed_xml)