Refactor AI daily report pipeline
This commit is contained in:
2
ai_daily_report/sources/__init__.py
Normal file
2
ai_daily_report/sources/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
"""Source adapters for the AI daily report pipeline."""
|
||||
|
||||
32
ai_daily_report/sources/aihot.py
Normal file
32
ai_daily_report/sources/aihot.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Callable
|
||||
|
||||
from ai_daily_report.models import SourceConfig
|
||||
|
||||
|
||||
FetchText = Callable[[str, int], str]
|
||||
|
||||
|
||||
def fetch_aihot(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Fetch the AIHot daily digest for ``run_date`` and flatten it into items.

    Args:
        config: Source settings. ``config.name`` labels every item and
            ``config.timeout_seconds`` is forwarded to ``fetch_text``.
        run_date: Date string interpolated into the API path.
        fetch_text: Callable ``(url, timeout) -> str`` returning the body.

    Returns:
        One dict per entry found under ``sections[*].items[*]``. An empty
        list is returned when the payload is not a JSON object; sections or
        entries that are not JSON objects are skipped.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    url = f"https://aihot.virxact.com/api/public/daily/{run_date}"
    data = json.loads(fetch_text(url, config.timeout_seconds))
    # The endpoint may answer with `null` or a bare list; treat that as "no items"
    # instead of failing on `.get`, matching the tolerance for null sections below.
    if not isinstance(data, dict):
        return []

    generated = data.get("generatedAt")
    items: list[dict[str, Any]] = []
    for section in data.get("sections") or []:
        if not isinstance(section, dict):
            continue
        section_hint = section.get("label") or ""
        for raw in section.get("items") or []:
            if not isinstance(raw, dict):
                continue
            items.append(
                {
                    "source_group": config.name,
                    "source_label": raw.get("sourceName") or config.name,
                    "title_raw": raw.get("title") or "",
                    "summary_raw": raw.get("summary") or "",
                    "url": raw.get("sourceUrl") or "",
                    # Items are stamped with the digest-level timestamp.
                    "published_at": generated,
                    "origin_type": "aihot_json",
                    "section_hint": section_hint,
                    "language_hint": "zh",
                }
            )
    return items
|
||||
|
||||
58
ai_daily_report/sources/juya.py
Normal file
58
ai_daily_report/sources/juya.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import Any, Callable
|
||||
|
||||
from ai_daily_report.models import SourceConfig
|
||||
from ai_daily_report.normalize import clean_text
|
||||
from ai_daily_report.sources.labels import source_label_from_url
|
||||
|
||||
|
||||
FetchText = Callable[[str, int], str]
|
||||
|
||||
|
||||
# One numbered story inside an issue: an <h2> heading whose title may be wrapped
# in a link and which ends in "<code>#N</code>", followed by the body up to the
# next "<hr><h2", the closing "提示" notice, or the end of the article.
# The closing "</a>" is optional as a whole so unlinked headings also match.
_JUYA_BLOCK_PATTERN = re.compile(
    r'<h2[^>]*>\s*'
    r'(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?'
    r'(?P<title_html>[^<]*?)'
    r'(?:</a>)?\s*'
    r'<code>#(?P<num>\d+)</code>\s*</h2>'
    r'(?P<body>.*?)'
    r'(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)',
    re.S | re.I,
)
_JUYA_HREF_PATTERN = re.compile(r'<a[^>]*href="([^"]+)"[^>]*>', re.I)
_JUYA_TAG_PATTERN = re.compile(r"<[^>]+>")
_JUYA_CONTENT_TAG = "{http://purl.org/rss/1.0/modules/content/}encoded"


def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]:
    """Extract the stories of the Juya issue whose RSS title equals ``run_date``.

    The feed is scanned for the first ``<item>`` whose stripped ``<title>`` is
    exactly ``run_date``; its ``content:encoded`` HTML is then split into
    numbered stories.

    Args:
        config: Source settings; ``config.name`` is used as the source group
            and as the fallback source label.
        xml_text: Raw RSS document.
        run_date: Issue title to look for.

    Returns:
        One dict per story with a non-empty title, in document order. Empty
        when the feed has no ``<channel>``, no matching issue, or the issue
        has no HTML content.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed.
    """
    # NOTE(review): xml.etree is not hardened against maliciously constructed
    # XML; confirm the feed is a trusted source.
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    if channel is None:
        return []

    article_html = ""
    for entry in channel.findall("item"):
        if (entry.findtext("title") or "").strip() == run_date:
            # findtext yields "" for an element without text and None when absent.
            article_html = entry.findtext(_JUYA_CONTENT_TAG) or ""
            break
    if not article_html:
        return []

    items: list[dict[str, Any]] = []
    for match in _JUYA_BLOCK_PATTERN.finditer(article_html):
        title = clean_text(match.group("title_html") or "")
        if not title:
            continue
        body_html = match.group("body") or ""
        links = _JUYA_HREF_PATTERN.findall(body_html)
        # Prefer the first link in the body; fall back to the heading link.
        raw_url = links[0] if links else (match.group("title_url") or "")
        # href values are HTML-escaped, so "&amp;" must become "&" to get a usable URL.
        url = raw_url.replace("&amp;", "&").strip()
        summary = clean_text(_JUYA_TAG_PATTERN.sub(" ", body_html))
        items.append(
            {
                "source_group": config.name,
                "source_label": source_label_from_url(url, fallback=config.name),
                "title_raw": title,
                "summary_raw": summary[:500],
                "url": url,
                "published_at": None,
                "origin_type": "juya_issue",
                "section_hint": "",
                "language_hint": "zh",
            }
        )
    return items
|
||||
|
||||
|
||||
def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the feed at ``config.url`` and parse the issue titled ``run_date``.

    ``fetch_text`` is called with ``config.url`` and ``config.timeout_seconds``;
    its result is handed to ``parse_juya_rss`` unchanged.
    """
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_juya_rss(config, feed_xml, run_date)
|
||||
78
ai_daily_report/sources/labels.py
Normal file
78
ai_daily_report/sources/labels.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
# Registered domain -> human-readable source label.
# Lookups walk this table in insertion order and also accept subdomains,
# so keep more specific hosts ahead of any broader suffix they share.
DOMAIN_LABELS: dict[str, str] = {
    "anthropic.com": "Anthropic",
    "arxiv.org": "arXiv",
    "bloomberg.com": "Bloomberg",
    "deepseek.com": "DeepSeek",
    "github.blog": "GitHub Blog",
    "github.com": "GitHub",
    "huggingface.co": "Hugging Face",
    "infoq.com": "InfoQ",
    "mp.weixin.qq.com": "微信公众号",
    "openai.com": "OpenAI",
    "platform.minimaxi.com": "MiniMax:Docs",
    "qbitai.com": "量子位",
    "techcrunch.com": "TechCrunch",
    "technologyreview.com": "MIT科技评论AI",
    "theverge.com": "The Verge",
    "x.com": "X",
    "twitter.com": "X",
}
|
||||
|
||||
# X (Twitter) handle -> display name shown in source labels.
# Handles without an entry are displayed as the handle itself.
X_DISPLAY_NAMES: dict[str, str] = {
    "MiniMax_AI": "MiniMax",
    "OpenAIDevs": "OpenAI Developers",
    "openai": "OpenAI",
    "openclaw": "OpenClaw",
    "xai": "xAI",
    "krea_ai": "Krea AI",
    "nvidia": "NVIDIA",
    "NVIDIAAI": "NVIDIA AI",
    "alibaba_cloud": "阿里云 / Alibaba Cloud",
    "cb_doge": "cb_doge",
}
|
||||
|
||||
|
||||
def _host(url: str) -> str:
    """Return the lower-cased hostname of ``url`` without a leading ``www.``.

    ``hostname`` is used instead of ``netloc`` so that an explicit port or
    userinfo (``https://user@x.com:443/...``) does not end up in the result
    and break domain matching. Returns ``""`` when the URL carries no network
    location, e.g. an empty or scheme-less string.
    """
    # `hostname` is already lower-cased by urllib and is None when absent.
    host = urlparse(url).hostname or ""
    return host[4:] if host.startswith("www.") else host
|
||||
|
||||
|
||||
def _domain_label(host: str) -> str:
    """Map ``host`` to its label in ``DOMAIN_LABELS``.

    The first entry, in table order, whose domain equals ``host`` or is a
    parent domain of it wins. When nothing matches, ``host`` itself is
    returned unchanged.
    """
    candidates = (
        label
        for domain, label in DOMAIN_LABELS.items()
        if host == domain or host.endswith("." + domain)
    )
    return next(candidates, host)
|
||||
|
||||
|
||||
def _x_handle(url: str) -> str:
    """Return the first path segment of ``url`` as an X handle.

    Yields ``""`` when the path has no non-empty segment or when the first
    segment is one of the listed non-profile routes (``i``, ``search``, ...).
    The comparison against those routes is case-sensitive.
    """
    non_profile_routes = ("i", "search", "explore", "settings", "notifications", "home", "compose")
    segments = urlparse(url).path.split("/")
    # Empty segments come from leading, trailing or doubled slashes.
    first = next((segment for segment in segments if segment), "")
    return "" if first in non_profile_routes else first
|
||||
|
||||
|
||||
def source_label_from_url(url: str, *, fallback: str = "来源") -> str:
    """Derive a human-readable source label from ``url``.

    Args:
        url: Link to the original content.
        fallback: Label returned when ``url`` is empty or yields no host.

    Returns:
        * ``"X:<display> (@<handle>)"`` for x.com / twitter.com profile URLs,
          or ``"X"`` when no handle can be read from the path.
        * ``"<label>:Blog"`` when the host contains ``blog`` or the path
          contains ``/blog`` or ``/research``.
        * Otherwise the domain label (the bare host for unknown domains),
          or ``fallback`` when that is empty.
    """
    if not url:
        return fallback

    host = _host(url)
    if host in {"x.com", "twitter.com"}:
        handle = _x_handle(url)
        if not handle:
            return "X"
        display = X_DISPLAY_NAMES.get(handle)
        if display is None:
            # X handles are case-insensitive, so "/minimax_ai" must resolve
            # to the same display name as "/MiniMax_AI".
            folded = handle.lower()
            display = next(
                (name for key, name in X_DISPLAY_NAMES.items() if key.lower() == folded),
                handle,
            )
        return f"X:{display} (@{handle})"

    label = _domain_label(host)
    path = (urlparse(url).path or "").lower()
    # NOTE(review): a host such as github.blog already maps to "GitHub Blog",
    # so this yields "GitHub Blog:Blog" - confirm whether that is intended.
    if label and ("blog" in host or "/blog" in path or "/research" in path):
        return f"{label}:Blog"
    return label or fallback
|
||||
24
ai_daily_report/sources/registry.py
Normal file
24
ai_daily_report/sources/registry.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Callable
|
||||
|
||||
from ai_daily_report.models import SourceConfig
|
||||
from ai_daily_report.sources.aihot import fetch_aihot
|
||||
from ai_daily_report.sources.juya import fetch_juya
|
||||
from ai_daily_report.sources.rss import fetch_rss
|
||||
|
||||
|
||||
# Call signature of a source adapter: (config, run_date, fetch_text) -> items,
# where fetch_text is itself a (url, timeout) -> str callable.
SourceFetcher = Callable[[SourceConfig, str, Callable[[str, int], str]], list[dict]]

# Source-type key -> adapter used by get_source_fetcher().
SOURCE_FETCHERS: dict[str, SourceFetcher] = {
    "aihot": fetch_aihot,
    "rss": fetch_rss,
    "juya_rss": fetch_juya,
}
|
||||
|
||||
|
||||
def get_source_fetcher(source_type: str) -> SourceFetcher:
    """Return the adapter registered in ``SOURCE_FETCHERS`` for ``source_type``.

    Raises:
        KeyError: If ``source_type`` has no registered adapter; the message
            names the offending type.
    """
    fetcher = SOURCE_FETCHERS.get(source_type)
    if fetcher is None:
        raise KeyError(f"Unknown source type: {source_type}")
    return fetcher
|
||||
|
||||
51
ai_daily_report/sources/rss.py
Normal file
51
ai_daily_report/sources/rss.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
from email.utils import parsedate_to_datetime
|
||||
from typing import Any, Callable
|
||||
|
||||
from ai_daily_report.models import SourceConfig
|
||||
from ai_daily_report.normalize import clean_text
|
||||
|
||||
|
||||
FetchText = Callable[[str, int], str]
|
||||
|
||||
|
||||
def _parse_pubdate(value: str) -> str | None:
    """Convert an RFC 2822 style date string to ISO 8601 text.

    Returns ``None`` for an empty value or for anything
    ``parsedate_to_datetime`` cannot handle; parsing is best-effort and never
    raises.
    """
    iso_text = None
    if value:
        try:
            iso_text = parsedate_to_datetime(value).isoformat()
        except Exception:
            # An unparseable date is deliberately treated as "no date".
            iso_text = None
    return iso_text
|
||||
|
||||
|
||||
def _has_cjk(text: str) -> bool:
    """Return True when ``text`` contains at least one CJK unified ideograph."""
    return any(
        "\u4e00" <= char <= "\u9fff" or "\u3400" <= char <= "\u4dbf"
        for char in text
    )


def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]:
    """Parse an RSS 2.0 document into pipeline items.

    Args:
        config: Source settings; ``config.name`` is used for both the source
            group and the source label.
        xml_text: Raw RSS document.
        limit: Number of leading ``<item>`` elements to consider. Untitled
            entries inside that window are dropped, so fewer than ``limit``
            items may be returned.

    Returns:
        One dict per titled entry, in feed order. Empty when the document has
        no ``<channel>`` element.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed.
    """
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    if channel is None:
        return []

    items: list[dict[str, Any]] = []
    for raw in channel.findall("item")[:limit]:
        title = clean_text(raw.findtext("title") or "")
        if not title:
            continue
        summary = clean_text(raw.findtext("description") or "")
        items.append(
            {
                "source_group": config.name,
                "source_label": config.name,
                "title_raw": title,
                "summary_raw": summary,
                "url": (raw.findtext("link") or "").strip(),
                "published_at": _parse_pubdate(raw.findtext("pubDate") or ""),
                "origin_type": "rss",
                "section_hint": "",
                # Decide on CJK ideographs rather than "any non-ASCII", so an
                # English title with curly quotes, dashes or accents stays "en".
                "language_hint": "zh" if _has_cjk(title) else "en",
            }
        )
    return items
|
||||
|
||||
|
||||
def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the feed at ``config.url`` and parse it with the default item limit.

    ``run_date`` is part of the signature but is not used by this function.
    """
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_rss_items(config, feed_xml)
|
||||
|
||||
Reference in New Issue
Block a user