Refactor AI daily report pipeline

2026-06-04 15:21:56 +08:00
parent 94e18ce22d
commit 5a98696255
64 changed files with 4778 additions and 1316 deletions
--- a/ai_daily_report/sources/__init__.py
+++ b/ai_daily_report/sources/__init__.py
@@ -0,0 +1,2 @@
+"""Source adapters for the AI daily report pipeline."""
+
--- a/ai_daily_report/sources/aihot.py
+++ b/ai_daily_report/sources/aihot.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import json
+from typing import Any, Callable
+
+from ai_daily_report.models import SourceConfig
+
+
+# Signature of the injected downloader: called as fetch_text(url, timeout_seconds)
+# and expected to return the response body as text.
+FetchText = Callable[[str, int], str]
+
+
+def fetch_aihot(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
+    data = json.loads(fetch_text(f"https://aihot.virxact.com/api/public/daily/{run_date}", config.timeout_seconds))
+    items: list[dict[str, Any]] = []
+    generated = data.get("generatedAt")
+    for section in data.get("sections", []) or []:
+        for raw in section.get("items", []) or []:
+            items.append(
+                {
+                    "source_group": config.name,
+                    "source_label": raw.get("sourceName") or config.name,
+                    "title_raw": raw.get("title") or "",
+                    "summary_raw": raw.get("summary") or "",
+                    "url": raw.get("sourceUrl") or "",
+                    "published_at": generated,
+                    "origin_type": "aihot_json",
+                    "section_hint": section.get("label") or "",
+                    "language_hint": "zh",
+                }
+            )
+    return items
+
--- a/ai_daily_report/sources/juya.py
+++ b/ai_daily_report/sources/juya.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import re
+import xml.etree.ElementTree as ET
+from typing import Any, Callable
+
+from ai_daily_report.models import SourceConfig
+from ai_daily_report.normalize import clean_text
+from ai_daily_report.sources.labels import source_label_from_url
+
+
+# Signature of the injected downloader: called as fetch_text(url, timeout_seconds)
+# and expected to return the response body as text.
+FetchText = Callable[[str, int], str]
+
+
+def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]:
+    root = ET.fromstring(xml_text)
+    channel = root.find("channel")
+    raw_items = channel.findall("item") if channel is not None else []
+    article_html = ""
+    for raw in raw_items:
+        if (raw.findtext("title") or "").strip() != run_date:
+            continue
+        content_el = raw.find("{http://purl.org/rss/1.0/modules/content/}encoded")
+        article_html = content_el.text if content_el is not None and content_el.text else ""
+        break
+    if not article_html:
+        return []
+
+    block_pattern = re.compile(
+        r'<h2[^>]*>\s*(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?(?P<title_html>[^<]*?)</a>?\s*<code>#(?P<num>\d+)</code>\s*</h2>(?P<body>.*?)(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)',
+        re.S | re.I,
+    )
+    items: list[dict[str, Any]] = []
+    for match in block_pattern.finditer(article_html):
+        title = clean_text(match.group("title_html") or "")
+        body_html = match.group("body") or ""
+        links = re.findall(r'<a[^>]*href="([^"]+)"[^>]*>', body_html, re.I)
+        url = links[0].replace("&amp;", "&").strip() if links else (match.group("title_url") or "")
+        summary = clean_text(re.sub(r"<[^>]+>", " ", body_html))
+        if title:
+            items.append(
+                {
+                    "source_group": config.name,
+                    "source_label": source_label_from_url(url, fallback=config.name),
+                    "title_raw": title,
+                    "summary_raw": summary[:500],
+                    "url": url,
+                    "published_at": None,
+                    "origin_type": "juya_issue",
+                    "section_hint": "",
+                    "language_hint": "zh",
+                }
+            )
+    return items
+
+
+def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
+    return parse_juya_rss(config, fetch_text(config.url, config.timeout_seconds), run_date)
--- a/ai_daily_report/sources/labels.py
+++ b/ai_daily_report/sources/labels.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+from urllib.parse import urlparse
+
+
+# Display label per known domain. ``_domain_label`` matches a host against a key
+# either exactly or as a subdomain, and returns the first match in insertion
+# order, so more specific hosts must not be shadowed by an earlier entry.
+DOMAIN_LABELS = {
+    "anthropic.com": "Anthropic",
+    "arxiv.org": "arXiv",
+    "bloomberg.com": "Bloomberg",
+    "deepseek.com": "DeepSeek",
+    "github.blog": "GitHub Blog",
+    "github.com": "GitHub",
+    "huggingface.co": "Hugging Face",
+    "infoq.com": "InfoQ",
+    "mp.weixin.qq.com": "微信公众号",
+    "openai.com": "OpenAI",
+    "platform.minimaxi.com": "MiniMax：Docs",
+    "qbitai.com": "量子位",
+    "techcrunch.com": "TechCrunch",
+    "technologyreview.com": "MIT科技评论AI",
+    "theverge.com": "The Verge",
+    # Both the current and the legacy domain map to the same label.
+    "x.com": "X",
+    "twitter.com": "X",
+}
+
+# Human-readable names for X accounts, keyed by the handle taken from the first
+# path segment of the post URL. Handles not listed here are shown as-is.
+X_DISPLAY_NAMES = {
+    "MiniMax_AI": "MiniMax",
+    "OpenAIDevs": "OpenAI Developers",
+    "openai": "OpenAI",
+    "openclaw": "OpenClaw",
+    "xai": "xAI",
+    "krea_ai": "Krea AI",
+    "nvidia": "NVIDIA",
+    "NVIDIAAI": "NVIDIA AI",
+    "alibaba_cloud": "阿里云 / Alibaba Cloud",
+    "cb_doge": "cb_doge",
+}
+
+
+def _host(url: str) -> str:
+    host = (urlparse(url).netloc or "").lower()
+    return host[4:] if host.startswith("www.") else host
+
+
+def _domain_label(host: str) -> str:
+    for domain, label in DOMAIN_LABELS.items():
+        if host == domain or host.endswith("." + domain):
+            return label
+    return host
+
+
+def _x_handle(url: str) -> str:
+    parts = [part for part in urlparse(url).path.split("/") if part]
+    if not parts:
+        return ""
+    handle = parts[0]
+    if handle in {"i", "search", "explore", "settings", "notifications", "home", "compose"}:
+        return ""
+    return handle
+
+
+def source_label_from_url(url: str, *, fallback: str = "来源") -> str:
+    if not url:
+        return fallback
+    host = _host(url)
+    if host in {"x.com", "twitter.com"}:
+        handle = _x_handle(url)
+        if handle:
+            display = X_DISPLAY_NAMES.get(handle, handle)
+            return f"X：{display} (@{handle})"
+        return "X"
+
+    label = _domain_label(host)
+    parsed = urlparse(url)
+    path = (parsed.path or "").lower()
+    if label and ("blog" in host or "/blog" in path or "/research" in path):
+        return f"{label}：Blog"
+    return label or fallback
--- a/ai_daily_report/sources/registry.py
+++ b/ai_daily_report/sources/registry.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from typing import Callable
+
+from ai_daily_report.models import SourceConfig
+from ai_daily_report.sources.aihot import fetch_aihot
+from ai_daily_report.sources.juya import fetch_juya
+from ai_daily_report.sources.rss import fetch_rss
+
+
+# Common fetcher signature: (config, run_date, fetch_text) -> list of item dicts,
+# where fetch_text is the injected (url, timeout_seconds) -> text downloader.
+SourceFetcher = Callable[[SourceConfig, str, Callable[[str, int], str]], list[dict]]
+
+# Registry mapping a source type string to the fetcher that handles it; looked
+# up through ``get_source_fetcher``.
+SOURCE_FETCHERS: dict[str, SourceFetcher] = {
+    "aihot": fetch_aihot,
+    "rss": fetch_rss,
+    "juya_rss": fetch_juya,
+}
+
+
+def get_source_fetcher(source_type: str) -> SourceFetcher:
+    if source_type not in SOURCE_FETCHERS:
+        raise KeyError(f"Unknown source type: {source_type}")
+    return SOURCE_FETCHERS[source_type]
+
--- a/ai_daily_report/sources/rss.py
+++ b/ai_daily_report/sources/rss.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+import xml.etree.ElementTree as ET
+from email.utils import parsedate_to_datetime
+from typing import Any, Callable
+
+from ai_daily_report.models import SourceConfig
+from ai_daily_report.normalize import clean_text
+
+
+# Signature of the injected downloader: called as fetch_text(url, timeout_seconds)
+# and expected to return the response body as text.
+FetchText = Callable[[str, int], str]
+
+
+def _parse_pubdate(value: str) -> str | None:
+    if not value:
+        return None
+    try:
+        return parsedate_to_datetime(value).isoformat()
+    except Exception:
+        return None
+
+
+def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]:
+    root = ET.fromstring(xml_text)
+    channel = root.find("channel")
+    raw_items = channel.findall("item") if channel is not None else []
+    items: list[dict[str, Any]] = []
+    for raw in raw_items[:limit]:
+        title = clean_text(raw.findtext("title") or "")
+        if not title:
+            continue
+        summary = clean_text(raw.findtext("description") or "")
+        items.append(
+            {
+                "source_group": config.name,
+                "source_label": config.name,
+                "title_raw": title,
+                "summary_raw": summary,
+                "url": (raw.findtext("link") or "").strip(),
+                "published_at": _parse_pubdate(raw.findtext("pubDate") or ""),
+                "origin_type": "rss",
+                "section_hint": "",
+                "language_hint": "en" if title.encode("utf-8").isascii() else "zh",
+            }
+        )
+    return items
+
+
+def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
+    return parse_rss_items(config, fetch_text(config.url, config.timeout_seconds))
+
				`@@ -0,0 +1,2 @@`
				`"""Source adapters for the AI daily report pipeline."""`