Refactor AI daily report pipeline
This commit is contained in:
78
ai_daily_report/sources/labels.py
Normal file
78
ai_daily_report/sources/labels.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
# Display label for each known source domain.
#
# Keys are lowercase host names without a leading "www."; _domain_label()
# applies an entry to the domain itself and to any of its subdomains.
# Entries are tried in insertion order and the first match wins, so keep a
# more specific domain ahead of any broader domain it is a subdomain of.
DOMAIN_LABELS = {
    "anthropic.com": "Anthropic",
    "arxiv.org": "arXiv",
    "bloomberg.com": "Bloomberg",
    "deepseek.com": "DeepSeek",
    "github.blog": "GitHub Blog",
    "github.com": "GitHub",
    "huggingface.co": "Hugging Face",
    "infoq.com": "InfoQ",
    "mp.weixin.qq.com": "微信公众号",
    "openai.com": "OpenAI",
    "platform.minimaxi.com": "MiniMax:Docs",
    "qbitai.com": "量子位",
    "techcrunch.com": "TechCrunch",
    "technologyreview.com": "MIT科技评论AI",
    "theverge.com": "The Verge",
    # Both the current and the legacy domain map to the same label.
    "x.com": "X",
    "twitter.com": "X",
}
|
||||
|
||||
# Friendly display name for selected X accounts, keyed by account handle
# (the first path segment of an x.com / twitter.com URL, without "@").
# source_label_from_url() shows a handle that is not listed here as-is.
X_DISPLAY_NAMES = {
    "MiniMax_AI": "MiniMax",
    "OpenAIDevs": "OpenAI Developers",
    "openai": "OpenAI",
    "openclaw": "OpenClaw",
    "xai": "xAI",
    "krea_ai": "Krea AI",
    "nvidia": "NVIDIA",
    "NVIDIAAI": "NVIDIA AI",
    "alibaba_cloud": "阿里云 / Alibaba Cloud",
    "cb_doge": "cb_doge",
}
|
||||
|
||||
|
||||
def _host(url: str) -> str:
|
||||
host = (urlparse(url).netloc or "").lower()
|
||||
return host[4:] if host.startswith("www.") else host
|
||||
|
||||
|
||||
def _domain_label(host: str) -> str:
    """Map *host* to its configured display label.

    A ``DOMAIN_LABELS`` entry applies when *host* is that domain itself or
    one of its subdomains. Entries are tried in the table's insertion order
    and the first hit wins; with no hit, *host* is returned unchanged.
    """
    matching_labels = (
        name
        for known, name in DOMAIN_LABELS.items()
        if host == known or host.endswith(f".{known}")
    )
    return next(matching_labels, host)
|
||||
|
||||
|
||||
def _x_handle(url: str) -> str:
|
||||
parts = [part for part in urlparse(url).path.split("/") if part]
|
||||
if not parts:
|
||||
return ""
|
||||
handle = parts[0]
|
||||
if handle in {"i", "search", "explore", "settings", "notifications", "home", "compose"}:
|
||||
return ""
|
||||
return handle
|
||||
|
||||
|
||||
def source_label_from_url(url: str, *, fallback: str = "来源") -> str:
    """Build a short, human-readable source label for *url*.

    Args:
        url: The URL of the source item.
        fallback: Label returned when *url* is empty or no host can be
            extracted from it.

    Returns:
        * ``"X:<display name> (@<handle>)"`` for x.com / twitter.com account
          URLs, or plain ``"X"`` when the URL does not point at an account.
        * ``"<label>:Blog"`` when the host contains ``blog`` or the path
          contains ``/blog`` or ``/research``.
        * Otherwise the configured domain label, the bare host for unknown
          domains, or *fallback* when there is no host.
    """
    if not url:
        return fallback

    host = _host(url)
    if host in {"x.com", "twitter.com"}:
        handle = _x_handle(url)
        if not handle:
            return "X"
        display = X_DISPLAY_NAMES.get(handle)
        if display is None:
            # X handles are case-insensitive, so a URL written as
            # ".../nvidiaai" must still find the "NVIDIAAI" entry. An exact
            # match is tried first; unknown handles are displayed as-is.
            wanted = handle.lower()
            display = next(
                (
                    name
                    for key, name in X_DISPLAY_NAMES.items()
                    if key.lower() == wanted
                ),
                handle,
            )
        return f"X:{display} (@{handle})"

    label = _domain_label(host)
    path = (urlparse(url).path or "").lower()
    # NOTE(review): a host whose label already says "Blog" (github.blog ->
    # "GitHub Blog") also receives the ":Blog" suffix -- confirm intended.
    if label and ("blog" in host or "/blog" in path or "/research" in path):
        return f"{label}:Blog"
    return label or fallback
|
||||
Reference in New Issue
Block a user