Refactor AI daily report pipeline

This commit is contained in:
Mimikko-zeus
2026-06-04 15:21:56 +08:00
parent 94e18ce22d
commit 5a98696255
64 changed files with 4778 additions and 1316 deletions

View File

@@ -0,0 +1,78 @@
from __future__ import annotations
from urllib.parse import urlparse
# Display label for each known source domain.
#
# Keys are lowercase host names without a leading "www."; values are the
# labels shown to readers. `_domain_label` walks this table in definition
# order and matches a host that equals a key or is a subdomain of it, so a
# single entry also covers e.g. "blog.<domain>".
DOMAIN_LABELS = {
    "anthropic.com": "Anthropic",
    "arxiv.org": "arXiv",
    "bloomberg.com": "Bloomberg",
    "deepseek.com": "DeepSeek",
    "github.blog": "GitHub Blog",
    "github.com": "GitHub",
    "huggingface.co": "Hugging Face",
    "infoq.com": "InfoQ",
    "mp.weixin.qq.com": "微信公众号",
    "openai.com": "OpenAI",
    "platform.minimaxi.com": "MiniMaxDocs",
    "qbitai.com": "量子位",
    "techcrunch.com": "TechCrunch",
    "technologyreview.com": "MIT科技评论AI",
    "theverge.com": "The Verge",
    # Both the current and the legacy host resolve to the same label.
    "x.com": "X",
    "twitter.com": "X",
}
# Display name for selected X accounts, keyed by the account handle as it
# appears as the first path segment of a profile/post URL.
#
# `source_label_from_url` looks handles up here; a handle that is not listed
# is shown as-is.
X_DISPLAY_NAMES = {
    "MiniMax_AI": "MiniMax",
    "OpenAIDevs": "OpenAI Developers",
    "openai": "OpenAI",
    "openclaw": "OpenClaw",
    "xai": "xAI",
    "krea_ai": "Krea AI",
    "nvidia": "NVIDIA",
    "NVIDIAAI": "NVIDIA AI",
    "alibaba_cloud": "阿里云 / Alibaba Cloud",
    "cb_doge": "cb_doge",
}
def _host(url: str) -> str:
host = (urlparse(url).netloc or "").lower()
return host[4:] if host.startswith("www.") else host
def _domain_label(host: str) -> str:
    """Translate *host* into its configured display label.

    ``DOMAIN_LABELS`` is scanned in definition order; the first entry whose
    domain equals *host*, or of which *host* is a subdomain, supplies the
    label. A host with no matching entry is returned unchanged.
    """
    candidates = (
        name
        for known, name in DOMAIN_LABELS.items()
        if host == known or host.endswith(f".{known}")
    )
    return next(candidates, host)
def _x_handle(url: str) -> str:
parts = [part for part in urlparse(url).path.split("/") if part]
if not parts:
return ""
handle = parts[0]
if handle in {"i", "search", "explore", "settings", "notifications", "home", "compose"}:
return ""
return handle
def source_label_from_url(url: str, *, fallback: str = "来源") -> str:
    """Build a human-readable source label for *url*.

    Resolution order:

    1. Empty *url* -> *fallback*.
    2. ``x.com`` / ``twitter.com`` links -> ``"X<display> (@<handle>)"``,
       where the display name comes from ``X_DISPLAY_NAMES`` (or is the
       handle itself when unlisted); plain ``"X"`` when the path names no
       account.
    3. Any other host -> its ``DOMAIN_LABELS`` label (or the bare host),
       with a ``"Blog"`` suffix when the host or path looks like a blog or
       research page and the label does not already end in "Blog".
    4. No usable host -> *fallback*.

    Args:
        url: Link whose origin should be described.
        fallback: Label returned when nothing can be derived from *url*.

    Returns:
        The label string.
    """
    if not url:
        return fallback

    host = _host(url)
    if host in {"x.com", "twitter.com"}:
        handle = _x_handle(url)
        if not handle:
            return "X"
        display = X_DISPLAY_NAMES.get(handle)
        if display is None:
            # Links do not always spell the handle with the same casing as
            # the table key, so retry the lookup ignoring case before
            # falling back to the raw handle.
            folded = handle.casefold()
            display = next(
                (
                    name
                    for key, name in X_DISPLAY_NAMES.items()
                    if key.casefold() == folded
                ),
                handle,
            )
        return f"X{display} (@{handle})"

    label = _domain_label(host)
    if not label:
        return fallback

    path = (urlparse(url).path or "").lower()
    looks_like_blog = "blog" in host or "/blog" in path or "/research" in path
    # A label such as "GitHub Blog" already carries the word; appending the
    # suffix again would produce "GitHub BlogBlog".
    if looks_like_blog and not label.lower().endswith("blog"):
        return f"{label}Blog"
    return label