From 94e18ce22daf8eb6fbd7d02acec7640c94bc9a3b Mon Sep 17 00:00:00 2001
From: Elaina
Date: Thu, 4 Jun 2026 10:38:44 +0800
Subject: [PATCH] =?UTF-8?q?init:=20AI=E6=97=A5=E6=8A=A5=20pipeline=20?=
=?UTF-8?q?=E5=AE=8C=E6=95=B4=E4=BB=A3=E7=A0=81=20+=20=E6=8A=80=E8=83=BD?=
=?UTF-8?q?=E6=96=87=E6=A1=A3=20+=20=E8=BF=90=E8=A1=8C=E8=AE=B0=E5=BD=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
README.md | 67 ++
cron/config.json | 14 +
script/ai_daily_blog_pipeline.py | 1104 ++++++++++++++++++++
script/blog_markdown.md | 198 ++++
script/run_meta.json | 35 +
skill/SKILL.md | 127 +++
skill/references/llm-config-auto-follow.md | 29 +
skill/references/mimo-api-performance.md | 55 +
skill/references/rendering-guide.md | 65 ++
skill/references/timeout-config.md | 34 +
10 files changed, 1728 insertions(+)
create mode 100644 README.md
create mode 100644 cron/config.json
create mode 100644 script/ai_daily_blog_pipeline.py
create mode 100644 script/blog_markdown.md
create mode 100644 script/run_meta.json
create mode 100644 skill/SKILL.md
create mode 100644 skill/references/llm-config-auto-follow.md
create mode 100644 skill/references/mimo-api-performance.md
create mode 100644 skill/references/rendering-guide.md
create mode 100644 skill/references/timeout-config.md
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..02bd2a8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,67 @@
+# AI 日报 Pipeline
+
+每日自动抓取 AI 行业动态,去重→归类→改写→发布到 [blog.ephron.ren](https://blog.ephron.ren)。
+
+## 仓库结构
+
+```
+ai-daily-report/
+├── README.md ← 本文件
+├── script/
+│ ├── ai_daily_blog_pipeline.py ← 主脚本(~1100 行,纯 Python)
+│ ├── run_meta.json ← 最近一次运行元数据
+│ └── blog_markdown.md ← 最近一次发布的博文
+├── skill/
+│ ├── SKILL.md ← Hermes Agent 技能文档
+│ └── references/
+│ ├── llm-config-auto-follow.md ← LLM 配置自动跟随机制
+│ ├── mimo-api-performance.md ← MiMo API 性能基准测试
+│ ├── rendering-guide.md ← 博文渲染 & 导览格式
+│ └── timeout-config.md ← 超时配置参考
+└── cron/
+ └── config.json ← Cron 作业设置
+```
+
+## Pipeline 流程(4 阶段)
+
+| 阶段 | 做什么 | 是否调 LLM |
+|------|--------|-----------|
+| Stage 0 | 脚本去重(difflib,纯 Python) | ❌ |
+| Stage 1 | LLM 语义去重 | ✅ 1 次调用 |
+| Stage 2 | 并行改写摘要 + 分类 | ✅ 2 次并发调用 |
+| Stage 3 | LLM 生成今日导览 | ✅ 1 次调用 |
+| Stage 4 | 组装 + 发布到博客 | ❌ |
+
+## 数据源
+
+| 来源 | 类型 | 备注 |
+|------|------|------|
+| AI HOT | API | 主要来源,分类齐全 |
+| 橘鸦 AI 早报 | RSS(content:encoded) | 每日 ~09:34 发布 |
+| InfoQ AI | RSS | 英文技术管理 |
+| 量子位 | RSS | 中文 AI 新闻 |
+| MIT 科技评论 AI | RSS | 英文前沿报道 |
+
+## 关键配置
+
+- **Cron**: 每天 10:00 CST 执行,`no_agent` 模式
+- **超时**: 脚本 600s,LLM 600s,RSS 25s,橘鸦 45s
+- **LLM**: 自动跟随 Hermes 主模型配置
+- **去重**: 只去重不精选,保留全部非重复条目
+- **风格**: 无 emoji、无参考编号、无建议/评论、大白话
+
+## 上次运行
+
+来自 `run_meta.json`:
+- 日期:2026-06-04
+- 原始条目:39 | 去重后:38
+- AI HOT:32 | InfoQ:2 | 量子位:5 | 橘鸦:0(超时)
+- 已发布:https://blog.ephron.ren/posts/ai-2026-06-04
+
+## 2026-06-04 修复
+
+修复了橘鸦源长期不工作的问题:
+1. 解析从 RSS `content:encoded` 获取内容,消除第二次 HTTP 请求
+2. 修复正则 `\\s*` → `\s*`(白字符类错误)
+3. 修复 `.*?` → `[^<]*?`(防止概览节渗漏)
+4. 橘鸦超时从 25s 提升至 45s
diff --git a/cron/config.json b/cron/config.json
new file mode 100644
index 0000000..fca69c1
--- /dev/null
+++ b/cron/config.json
@@ -0,0 +1,14 @@
+{
+ "job_id": "76297415d88d",
+ "name": "AI日报单任务09:30",
+ "schedule": "0 10 * * *",
+ "mode": "no_agent",
+ "script": "ai_daily_blog_pipeline.py",
+ "deliver": "origin",
+ "enabled_toolsets": [
+ "terminal"
+ ],
+ "timeout_seconds": 600,
+ "last_run": "2026-06-04T10:04:02+08:00",
+ "last_status": "ok (with degraded sources)"
+}
\ No newline at end of file
diff --git a/script/ai_daily_blog_pipeline.py b/script/ai_daily_blog_pipeline.py
new file mode 100644
index 0000000..2700366
--- /dev/null
+++ b/script/ai_daily_blog_pipeline.py
@@ -0,0 +1,1104 @@
+#!/usr/bin/env python3
+import difflib
+import json
+import os
+import re
+import sys
+import time
+import urllib.request
+import urllib.error
+import xml.etree.ElementTree as ET
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timedelta, timezone
+from email.utils import parsedate_to_datetime
+from pathlib import Path
+from urllib.parse import urlparse
+
+
+UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
+CST = timezone(timedelta(hours=8))
+NOW = datetime.now(CST)
+TODAY = NOW.date().isoformat()
+SINCE = NOW - timedelta(hours=30)
+SCRIPT_DIR = Path.home() / '.hermes' / 'scripts'
+OUT_DIR = SCRIPT_DIR / 'ai_morning_out'
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+RSS_FEEDS = {
+ 'InfoQ AI': 'https://feed.infoq.com/ai-ml-data-eng/',
+ 'MIT科技评论AI': 'https://www.technologyreview.com/topic/artificial-intelligence/feed',
+ '量子位': 'https://www.qbitai.com/feed',
+}
+JUYA_RSS = 'https://imjuya.github.io/juya-ai-daily/rss.xml'
+SECTION_ORDER = ['模型发布/更新', '产品与工具', '开发与工程', '行业与公司', '论文与研究', '人物与花絮', '观点与教程']
+
+
+# ─── Data collection (unchanged) ────────────────────────────────────────────
+
+def fetch_text(url: str) -> str:
+ req = urllib.request.Request(url, headers={'User-Agent': UA})
+ with urllib.request.urlopen(req, timeout=25) as r:
+ return r.read().decode('utf-8', 'ignore')
+
+
+def parse_pubdate(text: str):
+ if not text:
+ return None
+ try:
+ dt = parsedate_to_datetime(text)
+ if dt.tzinfo is None:
+ dt = dt.replace(tzinfo=timezone.utc)
+ return dt.astimezone(CST)
+ except Exception:
+ return None
+
+
+def clean_text(s: str) -> str:
+ s = re.sub(r'<[^>]+>', ' ', s or '')
+ s = s.replace(' ', ' ').replace('&', '&')
+ s = re.sub(r'\s+', ' ', s).strip()
+ return s
+
+
+def source_name_from_url(url: str, fallback: str = '来源') -> str:
+ if not url:
+ return fallback
+ host = (urlparse(url).netloc or '').lower()
+ if host.startswith('www.'):
+ host = host[4:]
+ mapping = {
+ 'x.com': 'X', 'twitter.com': 'X', 'github.com': 'GitHub', 'github.blog': 'GitHub Blog',
+ 'openrouter.ai': 'OpenRouter', 'anthropic.com': 'Anthropic', 'cursor.com': 'Cursor',
+ 'technologyreview.com': 'MIT科技评论AI', 'the-decoder.com': 'The Decoder', 'xiaohongshu.com': '小红书',
+ 'mp.weixin.qq.com': '微信文章', 'qbitai.com': '量子位', 'ithome.com': 'IT之家', 'browse.sh': 'Browse.sh',
+ 'huggingface.co': 'Hugging Face', 'openai.com': 'OpenAI', 'claude.com': 'Claude',
+ 'theverge.com': 'The Verge', 'infoq.com': 'InfoQ', 'research.google': 'Google Research',
+ 'simonwillison.net': 'Simon Willison', 'runwayml.com': 'Runway', 'perplexity.ai': 'Perplexity',
+ 'venturebeat.com': 'VentureBeat', 'arxiv.org': 'arXiv', 'reuters.com': '路透社',
+ 'bloomberg.com': 'Bloomberg', 'techcrunch.com': 'TechCrunch', 'wired.com': 'Wired',
+ 'deepseek.com': 'DeepSeek', 'baidu.com': '百度', 'alibaba.com': '阿里',
+ }
+ for domain, name in mapping.items():
+ if host == domain or host.endswith('.' + domain):
+ return name
+ return host or fallback
+
+
+def x_username_from_url(url: str) -> str:
+ """Extract X/Twitter username from URL like https://x.com/OpenAIDevs/status/..."""
+ if not url:
+ return ''
+ host = (urlparse(url).netloc or '').lower()
+ if host.startswith('www.'):
+ host = host[4:]
+ if host not in ('x.com', 'twitter.com'):
+ return ''
+ parts = [p for p in urlparse(url).path.split('/') if p]
+ if len(parts) >= 1 and parts[0] not in ('i', 'search', 'explore', 'settings', 'notifications', 'home', 'compose'):
+ return parts[0]
+ return ''
+
+
+def smart_source_label(url: str, api_source_name: str = '') -> str:
+ """Generate a descriptive source label from URL, preferring specific names over generic API labels."""
+ x_user = x_username_from_url(url)
+ if x_user:
+ return f'X:@{x_user}'
+ url_name = source_name_from_url(url, '')
+ if url_name and url_name not in ('来源', ''):
+ host = (urlparse(url).netloc or '').lower()
+ path = (urlparse(url).path or '').lower()
+ if 'blog' in host or '/blog' in path or '/research' in path:
+ return f'{url_name}:Blog'
+ if '/index' in path or path.rstrip('/') in ('', '/about', '/products'):
+ return f'{url_name}:官网动态'
+ return url_name
+ if api_source_name and api_source_name not in ('AI HOT', '社交媒体/博客', '科技媒体', '公司官网', '公司博客', '社区/博客', '个人博客', '技术媒体'):
+ return api_source_name
+ return api_source_name or 'AI HOT'
+
+
+def parse_aihot(today: str):
+ url = f'https://aihot.virxact.com/api/public/daily/{today}'
+ data = json.loads(fetch_text(url))
+ items = []
+ generated = data.get('generatedAt')
+ for sec in data.get('sections', []):
+ for it in sec.get('items', []):
+ item_url = (it.get('sourceUrl') or '').strip()
+ api_src = clean_text(it.get('sourceName', '')) or ''
+ items.append({
+ 'source_group': 'AI HOT',
+ 'source_label': smart_source_label(item_url, api_src),
+ 'title_raw': clean_text(it.get('title', '')),
+ 'summary_raw': clean_text(it.get('summary', '')),
+ 'url': item_url,
+ 'published_at': generated,
+ 'origin_type': 'aihot_json',
+ 'section_hint': sec.get('label') or '',
+ 'language_hint': 'zh',
+ })
+ for flash in data.get('flashes', []) or []:
+ flash_url = (flash.get('sourceUrl') or '').strip()
+ api_src = clean_text(flash.get('sourceName', '')) or ''
+ items.append({
+ 'source_group': 'AI HOT',
+ 'source_label': smart_source_label(flash_url, api_src),
+ 'title_raw': clean_text(flash.get('title', '')),
+ 'summary_raw': clean_text(flash.get('summary', '')),
+ 'url': flash_url,
+ 'published_at': generated,
+ 'origin_type': 'aihot_flash',
+ 'section_hint': '快讯',
+ 'language_hint': 'zh',
+ })
+ return items, data
+
+
+def parse_rss(name: str, url: str):
+ xml = fetch_text(url)
+ root = ET.fromstring(xml)
+ channel = root.find('channel')
+ items = channel.findall('item') if channel is not None else []
+ out = []
+ for it in items[:20]:
+ pub = parse_pubdate(it.findtext('pubDate') or '')
+ if pub and pub < SINCE:
+ continue
+ link = (it.findtext('link') or '').strip()
+ title = clean_text(it.findtext('title') or '')
+ summary = clean_text(it.findtext('description') or '')
+ if not title:
+ continue
+ out.append({
+ 'source_group': name,
+ 'source_label': name,
+ 'title_raw': title,
+ 'summary_raw': summary,
+ 'url': link,
+ 'published_at': pub.isoformat() if pub else None,
+ 'origin_type': 'rss',
+ 'section_hint': '',
+ 'language_hint': 'en' if len(re.findall(r'[A-Za-z]', title + ' ' + summary)) > len(re.findall(r'[\u4e00-\u9fff]', title + ' ' + summary)) else 'zh',
+ })
+ return out
+
+
+def fetch_juya_rss(today: str):
+ """Fetch 橘鸦 RSS and return (target_url, pub_date, html_content).
+ html_content is from content:encoded if available, else None.
+ Uses a longer timeout (45s) since GitHub Pages can be slow."""
+ req = urllib.request.Request(JUYA_RSS, headers={'User-Agent': UA})
+ with urllib.request.urlopen(req, timeout=45) as r:
+ xml = r.read().decode('utf-8', 'ignore')
+ root = ET.fromstring(xml)
+ channel = root.find('channel')
+ items = channel.findall('item') if channel is not None else []
+ target = None
+ pub = None
+ html_content = None
+ for it in items:
+ title = (it.findtext('title') or '').strip()
+ if title == today:
+ target = (it.findtext('link') or '').strip()
+ pub = parse_pubdate(it.findtext('pubDate') or '')
+ # Parse from RSS content:encoded to avoid a second HTTP request
+ ns = {'content': 'http://purl.org/rss/1.0/modules/content/'}
+ content_el = it.find('content:encoded', ns)
+ if content_el is not None and content_el.text:
+ html_content = content_el.text
+ break
+ return target, pub, html_content
+
+
+def parse_juya(today: str):
+ target, pub, html_content = fetch_juya_rss(today)
+ if not target:
+ return []
+
+ # Try RSS content:encoded first; fall back to fetching the article page
+ if html_content is None:
+ try:
+ req = urllib.request.Request(target, headers={'User-Agent': UA})
+ with urllib.request.urlopen(req, timeout=45) as r:
+ html = r.read().decode('utf-8', 'ignore')
+ except Exception:
+ return []
+ m = re.search(r']*>(.*?)', html, re.S | re.I)
+ if not m:
+ return []
+ article_html = m.group(1)
+ else:
+ article_html = html_content
+
+ block_pattern = re.compile(
+ r'(?P.*?)(?=
\s*
|||
', '\n', body_text, flags=re.I)
+ body_text = re.sub(r']*>', '', body_text, flags=re.I)
+ body_text = re.sub(r']+>.*?', ' ', body_text, flags=re.S | re.I)
+ body_text = re.sub(r'
]*>', ' ', body_text, flags=re.I)
+ body_text = re.sub(r'<[^>]+>', ' ', body_text)
+ lines = [clean_text(x) for x in body_text.split('\n') if clean_text(x)]
+ summary_lines = []
+ for line in lines:
+ if line.startswith('相关链接'):
+ break
+ if line == title:
+ continue
+ summary_lines.append(line)
+ summary = ' '.join(summary_lines[:4]).strip()
+ if not title:
+ continue
+ results.append({
+ 'source_group': '橘鸦AI早报',
+ 'source_label': source_name_from_url(url, '橘鸦AI早报') if url and 'imjuya.github.io/juya-ai-daily' not in url else '橘鸦AI早报',
+ 'title_raw': title,
+ 'summary_raw': summary,
+ 'url': url,
+ 'published_at': pub.isoformat() if pub else None,
+ 'origin_type': 'juya_issue',
+ 'section_hint': '',
+ 'language_hint': 'zh',
+ })
+ return results
+
+
+# ─── LLM infrastructure (unchanged) ─────────────────────────────────────────
+
+def load_env():
+ env = {}
+ env_path = Path.home() / '.hermes' / '.env'
+ if env_path.exists():
+ text = env_path.read_text(errors='ignore')
+ for line in text.splitlines():
+ if '=' in line and not line.strip().startswith('#'):
+ k, v = line.split('=', 1)
+ env[k.strip()] = v.strip()
+ env.update({k: v for k, v in os.environ.items() if v})
+ return env
+
+
+def resolve_llm_config(env: dict):
+ """Read Hermes config to get the active provider's API key, base_url, and model.
+
+ Priority:
+ 1) Explicit environment overrides for this pipeline (SUB2API / LLM_* / XIAOMI_* / XIAOMI_MIMO_*)
+ 2) Hermes model config (config.yaml)
+ 3) auth.json credential pool
+ 4) Legacy env fallbacks
+ """
+ import yaml
+
+ hermes_dir = Path.home() / '.hermes'
+
+ def first_env(*names: str) -> str:
+ for name in names:
+ val = (env.get(name) or '').strip()
+ if val:
+ return val
+ return ''
+
+ # Allow this script to be pinned to the current Hermes model config.
+ cfg_path = hermes_dir / 'config.yaml'
+ cfg = {}
+ if cfg_path.exists():
+ with open(cfg_path) as f:
+ cfg = yaml.safe_load(f) or {}
+
+ model_cfg = cfg.get('model', {}) or {}
+ provider = (model_cfg.get('provider') or '').strip()
+ base_url = (model_cfg.get('base_url') or '').rstrip('/')
+ model_name = (model_cfg.get('default') or '').strip()
+
+ # 1) Explicit overrides for this pipeline take precedence, but keep endpoint/key/model
+ # from the same provider family. Mixing SUB2API_API_KEY with XIAOMI_BASE_URL causes
+ # 401 after switching Hermes to a Sub2API model.
+ explicit_api_key = first_env('LLM_API_KEY')
+ explicit_base_url = first_env('LLM_BASE_URL')
+ explicit_model = first_env('LLM_MODEL')
+
+ if not explicit_api_key:
+ if provider == 'sub2api' or first_env('SUB2API_API_KEY', 'SUB2API_BASE_URL', 'SUB2API_MODEL'):
+ explicit_api_key = first_env('SUB2API_API_KEY')
+ explicit_base_url = first_env('SUB2API_BASE_URL') or base_url
+ explicit_model = first_env('SUB2API_MODEL') or model_name
+ elif first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL', 'XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL'):
+ explicit_api_key = first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY')
+ explicit_base_url = first_env('XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL')
+ explicit_model = first_env('XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL')
+
+ if explicit_base_url:
+ base_url = explicit_base_url.rstrip('/')
+ if explicit_model:
+ model_name = explicit_model
+
+ provider_def = (cfg.get('providers', {}) or {}).get(provider, {}) or {}
+ if not base_url and provider_def.get('base_url'):
+ base_url = str(provider_def.get('base_url')).rstrip('/')
+ if not explicit_api_key and provider_def.get('key_env'):
+ explicit_api_key = first_env(str(provider_def.get('key_env')))
+
+ # Fast fallback chain: if the active provider has no credentials, use a known-good
+ # provider/model from auth.json so the daily cron keeps publishing.
+ fallback_provider = first_env('LLM_FALLBACK_PROVIDER', 'XIAOMI_FALLBACK_PROVIDER') or 'openrouter'
+
+ api_key = explicit_api_key
+ auth_path = hermes_dir / 'auth.json'
+ if not api_key and auth_path.exists():
+ with open(auth_path) as f:
+ auth = json.load(f)
+ pool = auth.get('credential_pool', {}) or {}
+ provider_keys = []
+ if provider:
+ provider_keys.extend([provider, provider.replace('-', '_')])
+ # Known aliases for this environment.
+ provider_keys.extend(['sub2api', 'xiaomi', 'xiaomi_mimo', 'sensenova'])
+ for pkey in provider_keys:
+ creds = pool.get(pkey, [])
+ if creds:
+ cred = creds[0]
+ source = cred.get('source', '')
+ if source.startswith('env:'):
+ env_var = source[4:]
+ api_key = env.get(env_var, '') or api_key
+ if not api_key:
+ api_key = cred.get('access_token', '') or api_key
+ if not base_url:
+ base_url = (cred.get('base_url') or '').rstrip('/')
+ if not model_name:
+ model_name = cred.get('model', '') or model_name
+ break
+
+ # 3) Legacy env fallbacks.
+ if not api_key:
+ api_key = first_env('LLM_API_KEY', 'XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'OPENROUTER_API_KEY')
+ if not base_url:
+ base_url = first_env('LLM_BASE_URL', 'XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL', 'OPENROUTER_BASE_URL').rstrip('/')
+ if not model_name:
+ model_name = first_env('LLM_MODEL') or 'mimo-v2.5-pro'
+
+ if not api_key and fallback_provider and auth_path.exists():
+ with open(auth_path) as f:
+ auth = json.load(f)
+ pool = auth.get('credential_pool', {}) or {}
+ for pkey in [fallback_provider, fallback_provider.replace('-', '_')]:
+ creds = pool.get(pkey, [])
+ if creds:
+ cred = creds[0]
+ source = cred.get('source', '')
+ if source.startswith('env:'):
+ env_var = source[4:]
+ api_key = env.get(env_var, '') or api_key
+ if not api_key:
+ api_key = cred.get('access_token', '') or api_key
+ if not base_url:
+ base_url = (cred.get('base_url') or '').rstrip('/')
+ if not model_name:
+ model_name = cred.get('model', '') or model_name
+ provider = fallback_provider
+ break
+
+ if not api_key:
+ raise RuntimeError(
+ f'No API key found for provider "{provider}" or fallback "{fallback_provider}". '
+ 'Set SUB2API_API_KEY / XIAOMI_API_KEY / LLM_API_KEY or fix ~/.hermes/auth.json'
+ )
+ if not base_url:
+ raise RuntimeError(
+ f'No base_url found for provider "{provider}" or fallback "{fallback_provider}". '
+ 'Set SUB2API_BASE_URL / XIAOMI_BASE_URL / LLM_BASE_URL or fix ~/.hermes/auth.json'
+ )
+
+ return api_key, base_url, model_name
+
+
+def _try_llm_request(base_url: str, api_key: str, model: str, prompt_text: str, auth_mode: str, api_key_header: str = 'Authorization'):
+ payload = json.dumps({
+ 'model': model,
+ 'messages': [{'role': 'user', 'content': prompt_text}],
+ 'temperature': 0.2,
+ 'max_tokens': 8000,
+ }, ensure_ascii=False).encode('utf-8')
+ headers = {'Content-Type': 'application/json'}
+ if api_key_header == 'Authorization':
+ headers[api_key_header] = f'Bearer {api_key}' if auth_mode == 'bearer' else api_key
+ else:
+ headers[api_key_header] = api_key
+ req = urllib.request.Request(f'{base_url}/chat/completions', data=payload, headers=headers)
+ with urllib.request.urlopen(req, timeout=600) as r:
+ resp = json.loads(r.read().decode('utf-8'))
+ return resp['choices'][0]['message']['content'].strip()
+
+
+def llm_call(prompt_text: str, env: dict) -> str:
+ api_key, base_url, model = resolve_llm_config(env)
+
+ # Use a single, explicit path so cron behavior is easy to debug.
+ # The earlier auth-matrix/fallback logic was making failures harder to reason about.
+ payload = json.dumps({
+ 'model': model,
+ 'messages': [{'role': 'user', 'content': prompt_text}],
+ 'temperature': 0.2,
+ 'max_tokens': 8000,
+ }, ensure_ascii=False).encode('utf-8')
+
+ req = urllib.request.Request(
+ f'{base_url}/chat/completions',
+ data=payload,
+ headers={'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'},
+ )
+ print(f'llm_call request: base_url={base_url}; model={model}', file=sys.stderr)
+ try:
+ with urllib.request.urlopen(req, timeout=600) as r:
+ resp = json.loads(r.read().decode('utf-8'))
+ return resp['choices'][0]['message']['content'].strip()
+ except urllib.error.HTTPError as e:
+ body = ''
+ try:
+ body = e.read().decode('utf-8', 'ignore')
+ except Exception:
+ pass
+ print(f'llm_call failed: HTTP {e.code} {e.reason}; base_url={base_url}; model={model}; body={body[:500]}', file=sys.stderr)
+ raise
+
+
+def _parse_json_from_llm(text: str):
+ """Strip markdown code blocks and extract a JSON object from LLM output."""
+ text = re.sub(r'^```(?:json)?\s*\n?', '', text)
+ text = re.sub(r'\n?```\s*$', '', text)
+ text = text.strip()
+ m = re.search(r'\{.*\}\s*$', text, re.S)
+ if not m:
+ raise ValueError('LLM 输出中未找到 JSON 对象')
+ raw_json = m.group(0)
+ raw_json = re.sub(r',\s*([}\]])', r'\1', raw_json)
+ return json.loads(raw_json)
+
+
+def _normalize_title(title: str) -> str:
+ """Normalize a title for dedup comparison: strip non-alphanumeric, lowercase."""
+ return re.sub(r'[^\w\u4e00-\u9fff]+', '', (title or '').lower())
+
+
+# ─── Stage 0: Script dedup (no LLM) ────────────────────────────────────────
+
+def stage0_script_dedup(raw_items: list) -> list:
+ """Deduplicate using difflib.SequenceMatcher on normalized titles.
+ Similarity > 0.7 means same event; keep the one with longer summary."""
+ if not raw_items:
+ return []
+
+ # Build list of (normalized_title, item)
+ normed = []
+ for item in raw_items:
+ nt = _normalize_title(item.get('title_raw', ''))
+ if nt and len(nt) >= 3:
+ normed.append((nt, item))
+
+ keep = [] # list of (nt, item) to keep
+ for nt, item in normed:
+ merged = False
+ for i, (knt, kitem) in enumerate(keep):
+ ratio = difflib.SequenceMatcher(None, nt, knt).ratio()
+ if ratio > 0.7:
+ # Same event — keep the one with longer summary
+ if len(item.get('summary_raw', '')) > len(kitem.get('summary_raw', '')):
+ keep[i] = (nt, item)
+ merged = True
+ break
+ if not merged:
+ keep.append((nt, item))
+
+ return [item for _, item in keep]
+
+
+# ─── Stage 1: LLM semantic dedup ───────────────────────────────────────────
+
+def stage1_llm_dedup(items: list, env: dict):
+ """Use LLM to identify semantic duplicates. Returns (filtered_items, error)."""
+ if not items:
+ return items, None
+
+ indexed = []
+ for i, item in enumerate(items):
+ indexed.append({
+ 'index': i,
+ 'title': item.get('title_raw', '')[:80],
+ 'summary': item.get('summary_raw', '')[:120],
+ })
+
+ prompt = (
+ '以下是AI领域的新闻条目。有些条目虽然措辞不同,但描述的是同一个事件。'
+ '请识别重复项,输出要保留的条目索引列表。只有描述完全相同的具体事件才视为重复。\n\n'
+ f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
+ '请严格按以下JSON格式输出,不要包含任何其他内容:\n'
+ '{"keep_indices": [0, 1, 3, 5]}'
+ )
+
+ try:
+ raw = llm_call(prompt, env)
+ obj = _parse_json_from_llm(raw)
+ indices = obj.get('keep_indices', [])
+ if not isinstance(indices, list):
+ raise ValueError('keep_indices is not a list')
+ # Filter valid indices
+ valid = sorted(set(i for i in indices if isinstance(i, int) and 0 <= i < len(items)))
+ if not valid:
+ raise ValueError('No valid indices in keep_indices')
+ return [items[i] for i in valid], None
+ except Exception as e:
+ err = f'stage1_llm_dedup failed: {type(e).__name__}: {e}'
+ print(err)
+ return items, err # Fallback: return all items unchanged
+
+
+# ─── Stage 2a: LLM summary rewrite (parallel) ──────────────────────────────
+
+def stage2a_rewrite_summaries(items: list, env: dict):
+ """Rewrite summaries in concise Chinese. Returns (updated_items, error)."""
+ if not items:
+ return items, None
+
+ indexed = []
+ for i, item in enumerate(items):
+ indexed.append({
+ 'index': i,
+ 'title': item.get('title_raw', '')[:80],
+ 'summary': item.get('summary_raw', '')[:200],
+ })
+
+ prompt = (
+ '请将以下新闻条目的标题和摘要改写为简洁中文。'
+ '标题:英文品牌名/模型名保留原样(如GPT-5、Codex),其余翻译为中文。'
+ '摘要:每条最多120字,保留核心事实。\n\n'
+ f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
+ '请严格按以下JSON格式输出:\n'
+ '{"summaries": [{"index": 0, "title": "中文标题", "summary": "改写后的摘要"}, ...]}'
+ )
+
+ try:
+ raw = llm_call(prompt, env)
+ obj = _parse_json_from_llm(raw)
+ summaries = obj.get('summaries', [])
+ if not isinstance(summaries, list):
+ raise ValueError('summaries is not a list')
+
+ result = [dict(item) for item in items] # shallow copy
+ for entry in summaries:
+ idx = entry.get('index')
+ s = entry.get('summary', '')
+ t = entry.get('title', '')
+ if isinstance(idx, int) and 0 <= idx < len(result):
+ if t:
+ result[idx] = dict(result[idx], title_raw=t)
+ if s:
+ result[idx] = dict(result[idx], summary_raw=s)
+
+ return result, None
+ except Exception as e:
+ err = f'stage2a_rewrite_summaries failed: {type(e).__name__}: {e}'
+ print(err)
+ return items, err # Fallback: return items unchanged
+
+
+# ─── Stage 2b: LLM classify (parallel) ──────────────────────────────────────
+
+def stage2b_classify(items: list, env: dict):
+ """Classify each item into a section. Returns (updated_items, error)."""
+ if not items:
+ return items, None
+
+ indexed = []
+ for i, item in enumerate(items):
+ indexed.append({
+ 'index': i,
+ 'title': item.get('title_raw', '')[:80],
+ 'summary': item.get('summary_raw', '')[:120],
+ })
+
+ sections_str = '、'.join(SECTION_ORDER)
+ prompt = (
+ f'请将以下AI新闻条目分类到对应板块。\n'
+ f'可选板块:{sections_str}\n\n'
+ f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
+ '请严格按以下JSON格式输出:\n'
+ '{"sections": [{"index": 0, "section": "模型发布/更新"}, ...]}'
+ )
+
+ try:
+ raw = llm_call(prompt, env)
+ obj = _parse_json_from_llm(raw)
+ sections = obj.get('sections', [])
+ if not isinstance(sections, list):
+ raise ValueError('sections is not a list')
+
+ result = [dict(item) for item in items] # shallow copy
+ for entry in sections:
+ idx = entry.get('index')
+ sec = entry.get('section', '')
+ if isinstance(idx, int) and 0 <= idx < len(result) and sec:
+ if sec in SECTION_ORDER:
+ result[idx] = dict(result[idx], section_hint=sec)
+
+ return result, None
+ except Exception as e:
+ err = f'stage2b_classify failed: {type(e).__name__}: {e}'
+ print(err)
+ return items, err # Fallback: return items unchanged
+
+
+# ─── Stage 2 parallel execution ─────────────────────────────────────────────
+
+def stage2_parallel(items: list, env: dict):
+ """Run stage2a (summary rewrite) and stage2b (classify) in parallel.
+ Returns (merged_items, errors_list)."""
+ errors = []
+ summaries_result = items
+ classify_result = items
+
+ with ThreadPoolExecutor(max_workers=2) as executor:
+ future_summaries = executor.submit(stage2a_rewrite_summaries, items, env)
+ future_classify = executor.submit(stage2b_classify, items, env)
+
+ # Wait for summary rewrite
+ try:
+ summaries_result, err = future_summaries.result()
+ if err:
+ errors.append(err)
+ except Exception as e:
+ errors.append(f'stage2a exception: {type(e).__name__}: {e}')
+
+ # Wait for classify
+ try:
+ classify_result, err = future_classify.result()
+ if err:
+ errors.append(err)
+ except Exception as e:
+ errors.append(f'stage2b exception: {type(e).__name__}: {e}')
+
+ # Merge: take summaries from stage2a, sections from stage2b
+ merged = []
+ for i in range(len(items)):
+ new_item = dict(summaries_result[i]) if i < len(summaries_result) else dict(items[i])
+ # Apply section from classify result if available
+ if i < len(classify_result) and classify_result[i].get('section_hint'):
+ new_item['section_hint'] = classify_result[i]['section_hint']
+ merged.append(new_item)
+
+ return merged, errors
+
+
+# ─── Stage 3: LLM guide/observation ────────────────────────────────────────
+
+def llm_generate_guide(items, today: str, env: dict) -> str:
+ """Generate editorial judgment section: main theme + signals + risk."""
+ indexed = []
+ for i, item in enumerate(items, 1):
+ indexed.append({
+ 'n': i,
+ 'title': item['title'],
+ 'summary': item['summary'][:100],
+ 'section': item['section'],
+ 'source': item.get('source', ''),
+ })
+ prompt = {
+ 'date': today,
+ 'task': (
+ '你是AI行业编辑。根据以下已经分类和摘要改写好的条目,写「今日观察」。\n\n'
+ '格式要求:\n'
+ '【主线】blockquote格式,一句话概括今天最值得关注的趋势(不要套话,要具体)\n'
+ '【强信号】2-3条,每条格式:编号. 标题(一句话)+ 一两句说明为什么重要\n'
+ '【中信号】1-2条,格式同上\n'
+ '【待验证】1-2条,格式同上,说明为什么存疑\n\n'
+ '写作要求:\n'
+ '- 不要空泛总结(如"行业焦点转向XX"),要指向具体事件\n'
+ '- 不要引用编号如[1][3],读者看不到对应关系\n'
+ '- 不要建议("开发者应该..."之类删掉)\n'
+ '- 每条控制在2-3句话以内\n'
+ '- 用大白话,不要学术腔\n'
+ ),
+ 'items': indexed,
+ 'rule': '只输出观察文本,不要代码块、不要JSON。严格使用【主线】【强信号】【中信号】【待验证】四个标记。'
+ }
+ query = json.dumps(prompt, ensure_ascii=False)
+ try:
+ text = llm_call(query, env)
+ text = re.sub(r'^```(?:\w+)?\s*\n?', '', text)
+ text = re.sub(r'\n?```\s*$', '', text)
+ text = text.strip().strip('"').strip("'")
+ return text
+ except Exception:
+ return ''
+
+
+# ─── Rendering helpers (unchanged) ──────────────────────────────────────────
+
+def _parse_guide_sections(guide: str):
+ """Parse guide text into structured sections by 【markers】."""
+ sections = {}
+ parts = re.split(r'【(主线|强信号|中信号|待验证|建议)】', guide)
+ i = 1
+ while i < len(parts) - 1:
+ key = parts[i].strip()
+ content = parts[i + 1].strip()
+ sections[key] = content
+ i += 2
+ return sections
+
+
+def _make_ref_factory(items):
+ """Create a [N] → link converter bound to the items list."""
+ def make_ref(m):
+ idx = int(m.group(1))
+ if 1 <= idx <= len(items):
+ item = items[idx - 1]
+ url = item.get('url', '')
+ if url:
+ return f'[{idx}]'
+ return f'[{idx}]'
+ return m.group(0)
+ return make_ref
+
+
+def _render_guide_section(lines, title, text, items, is_quote=False):
+ """Render a guide section with title on its own line, content below."""
+ make_ref = _make_ref_factory(items)
+ lines.append(f'**{title}**')
+ lines.append('')
+ for gline in text.split('\n'):
+ gline = gline.strip()
+ if not gline:
+ continue
+ gline = re.sub(r'\[(\d+)\]', make_ref, gline)
+ gline = re.sub(r'\[N\]', '', gline)
+ gline = gline.strip()
+ if not gline:
+ continue
+ if is_quote:
+ lines.append(f'> {gline}')
+ else:
+ lines.append(gline)
+ lines.append('')
+
+
+def format_source_link(item):
+ source = item.get('source') or '来源'
+ url = item.get('url') or ''
+ if url:
+ return f'[{source} ↗]({url})'
+ return source
+
+
+def blog_markdown(items, guide=None):
+ grouped = {k: [] for k in SECTION_ORDER}
+ for item in items:
+ grouped.setdefault(item['section'], []).append(item)
+ n = 1
+ lines = []
+
+ guide_items = guide if isinstance(guide, list) else []
+ make_ref = _make_ref_factory(items)
+
+ def clean_guide_text(text):
+ text = re.sub(r'\[\d+\]', '', text)
+ text = re.sub(r'\[N\]', '', text).strip()
+ text = re.sub(r'^主线判断[::]\s*', '', text)
+ text = re.sub(r'\s+', ' ', text).strip()
+ return text
+
+ # === Top: 导览 (theme only) ===
+ theme_items = [g for g in guide_items if g.get('type') == 'theme']
+ if theme_items:
+ lines.append('## 导览')
+ lines.append('')
+ for g in theme_items:
+ text = clean_guide_text(g.get('text', ''))
+ if text:
+ for para in text.split('\n'):
+ para = para.strip()
+ if para:
+ lines.append(f'> {para}')
+ lines.append('')
+
+ # === News sections ===
+ for sec in SECTION_ORDER:
+ sec_items = grouped.get(sec, [])
+ if not sec_items:
+ continue
+ lines.append(f'## {sec}')
+ lines.append('')
+ for item in sec_items:
+ summary = item['summary'].strip()
+ if len(summary) > 120:
+ summary = summary[:120].rstrip() + '…'
+ source_link = format_source_link(item)
+ if summary and summary[-1] not in '。!?…':
+ summary += '。'
+ lines.append(f'**{n}. {item["title"]}**')
+ lines.append('')
+ lines.append(f'> {summary}{source_link}')
+ lines.append('')
+ n += 1
+
+ # === Bottom: 总结 (strong/medium/risk) ===
+ type_labels = {'strong': '强信号', 'medium': '中信号', 'risk': '待验证'}
+ summary_types = ['strong', 'medium', 'risk']
+ summary_items = [g for g in guide_items if g.get('type') in summary_types]
+ if summary_items:
+ lines.append('## 总结')
+ lines.append('')
+ for t in summary_types:
+ type_items = [g for g in summary_items if g.get('type') == t]
+ if not type_items:
+ continue
+ label = type_labels.get(t, t)
+ lines.append(f'**{label}**')
+ lines.append('')
+ for g in type_items:
+ text = clean_guide_text(g.get('text', ''))
+ if not text:
+ continue
+ title_match = re.search(r'^(.+?)[::]\s*', text)
+ if title_match and len(title_match.group(1)) < 60:
+ title = title_match.group(1).strip()
+ content = text[title_match.end():].strip()
+ else:
+ sentences = re.split(r'[。!?]', text)
+ title = sentences[0].strip() if sentences else text[:40]
+ content = text[len(sentences[0]):].strip()
+ if content and content[0] in '。!?':
+ content = content[1:].strip()
+ lines.append(f'- **{title}**')
+ if content:
+ lines.append(f' {content}')
+ lines.append('')
+
+ return '\n'.join(lines).strip()
+
+
+def short_summary(blog_url):
+ return f'AI日报已发布 👉 {blog_url}'
+
+
+def blog_api_request(method, path, payload=None, token=None, base_url=None):
+ url = base_url.rstrip('/') + path
+ data = None
+ headers = {'Authorization': f'Bearer {token}', 'User-Agent': UA}
+ if payload is not None:
+ data = json.dumps(payload, ensure_ascii=False).encode('utf-8')
+ headers['Content-Type'] = 'application/json'
+ req = urllib.request.Request(url, data=data, headers=headers, method=method)
+ with urllib.request.urlopen(req, timeout=25) as r:
+ return json.loads(r.read().decode('utf-8'))
+
+
+# ─── Main pipeline ──────────────────────────────────────────────────────────
+
+def main():
+ env = load_env()
+ token = env.get('BLOG_SERVICE_TOKEN') or env.get('EPHRON_SERVICE_TOKEN')
+ base_url = env.get('BLOG_API_BASE_URL', 'https://blog.ephron.ren')
+ if not token:
+ print('缺少 blog service token,已停止。')
+ sys.exit(1)
+
+ errors = []
+ source_counts = {}
+ raw_items = []
+
+ # ── Collect raw items (unchanged) ────────────────────────────────────────
+ try:
+ aihot_items, raw_daily = parse_aihot(TODAY)
+ raw_items.extend(aihot_items)
+ source_counts['AI HOT'] = len(aihot_items)
+ except urllib.error.HTTPError as e:
+ if e.code == 404:
+ print(f'今天({TODAY})的 AI HOT 完整日报还没有生成,暂不发布。')
+ return
+ raise
+
+ for name, url in RSS_FEEDS.items():
+ try:
+ parsed = parse_rss(name, url)
+ raw_items.extend(parsed)
+ source_counts[name] = len(parsed)
+ except Exception as e:
+ errors.append(f'{name}: {type(e).__name__}')
+ source_counts[name] = 0
+
+ juya_items = []
+ try:
+ juya_items = parse_juya(TODAY)
+ except Exception as e:
+ errors.append(f'橘鸦AI早报: {type(e).__name__}')
+
+ # If juya returned nothing, wait 2 minutes and retry once
+ if not juya_items:
+ print('橘鸦AI早报尚未就绪,等待 2 分钟后重试...')
+ time.sleep(120)
+ try:
+ juya_items = parse_juya(TODAY)
+ except Exception as e:
+ errors.append(f'橘鸦AI早报(重试): {type(e).__name__}')
+
+ raw_items.extend(juya_items)
+ source_counts['橘鸦AI早报'] = len(juya_items)
+
+ raw_path = OUT_DIR / 'raw_items.json'
+ raw_path.write_text(json.dumps(raw_items, ensure_ascii=False, indent=2), encoding='utf-8')
+
+ # ── Stage 0: Script dedup ────────────────────────────────────────────────
+ print(f'Stage 0: Script dedup — {len(raw_items)} raw items')
+ items = stage0_script_dedup(raw_items)
+ stage0_count = len(items)
+ print(f'Stage 0 done — {stage0_count} unique items')
+
+ # ── Stage 1: LLM semantic dedup ─────────────────────────────────────────
+ print(f'Stage 1: LLM semantic dedup')
+ items, stage1_err = stage1_llm_dedup(items, env)
+ if stage1_err:
+ errors.append(stage1_err)
+ print(f'Stage 1 done — {len(items)} items')
+
+ # ── Stage 2: Parallel summary rewrite + classify ────────────────────────
+ print(f'Stage 2: Parallel summary rewrite + classify')
+ items, stage2_errs = stage2_parallel(items, env)
+ errors.extend(stage2_errs)
+ print(f'Stage 2 done — {len(items)} items')
+
+ # ── Build final items with title/source fields ──────────────────────────
+ # At this point items still have raw fields; convert to final format
+ final_items = []
+ seen_titles = set()
+ for item in items:
+ title = clean_text(item.get('title_raw', ''))
+ summary = clean_text(item.get('summary_raw', ''))[:120]
+ if not title:
+ continue
+ norm = _normalize_title(title)
+ if norm in seen_titles:
+ continue
+ seen_titles.add(norm)
+ section = item.get('section_hint', '') or '行业与公司'
+ if section not in SECTION_ORDER:
+ section = '行业与公司'
+ final_items.append({
+ 'title': title,
+ 'summary': summary or '该条目暂无摘要。',
+ 'section': section,
+ 'url': item.get('url') or '',
+ 'source': item.get('source_label') or item.get('source_group') or '来源',
+ 'source_group': item.get('source_group') or '未知来源',
+ 'dedupe_keys': [norm],
+ })
+
+ # ── Stage 3: LLM guide/observation ──────────────────────────────────────
+ print(f'Stage 3: LLM guide generation')
+ guide_text = llm_generate_guide(final_items, TODAY, env)
+
+ # Parse guide into structured format for blog_markdown
+ guide_structured = []
+ if guide_text:
+ parsed = _parse_guide_sections(guide_text)
+ type_map = {'主线': 'theme', '强信号': 'strong', '中信号': 'medium', '待验证': 'risk'}
+ for key, text in parsed.items():
+ guide_type = type_map.get(key, 'theme')
+ if guide_type == 'theme':
+ guide_structured.append({'type': 'theme', 'text': text})
+ else:
+ # Split into individual items by numbered lines
+ lines = [l.strip() for l in text.split('\n') if l.strip()]
+ for line in lines:
+ # Remove leading number like "1. "
+ line = re.sub(r'^\d+[\.\、]\s*', '', line)
+ if line:
+ guide_structured.append({'type': guide_type, 'text': line})
+
+ # ── Stage 4: Assemble and publish ───────────────────────────────────────
+ print(f'Stage 4: Assemble and publish')
+ md = blog_markdown(final_items, guide_structured)
+ title = f'AI日报 · {TODAY}'
+ tags = ['AI日报', 'AI资讯', '人工智能']
+ payload = {'title': title, 'content': md, 'tags': tags}
+
+ dry_run = (env.get('AI_DAILY_DRY_RUN') or '').strip().lower() in ('1', 'true', 'yes')
+ if dry_run:
+ slug = f'dry-run-{TODAY}'
+ blog_url = f'{base_url}/posts/{slug}'
+ public_ok = True
+ print('AI_DAILY_DRY_RUN=1:已完成组装验证,跳过博客创建/发布。')
+ else:
+ create_resp = blog_api_request('POST', '/api/service/posts', payload=payload, token=token, base_url=base_url)
+ slug = create_resp.get('slug')
+ if not slug:
+ print('Blog 草稿创建失败:未返回 slug')
+ sys.exit(1)
+ blog_api_request('POST', f'/api/service/posts/{slug}/publish', token=token, base_url=base_url)
+ blog_url = f'{base_url}/posts/{slug}'
+
+ public_ok = False
+ try:
+ req = urllib.request.Request(blog_url, headers={'User-Agent': UA})
+ with urllib.request.urlopen(req, timeout=20) as r:
+ public_ok = getattr(r, 'status', None) == 200
+ except Exception:
+ public_ok = False
+
+ msg = short_summary(blog_url)
+ if errors:
+ msg += '\n\n注:部分补充源本次采集失败或LLM阶段出错,已自动降级:' + ';'.join(errors)
+ if not public_ok:
+ msg += '\n\n警告:blog 草稿/发布接口已返回成功,但公开链接暂未验证为 200,请人工复核。'
+
+ # Build digest for JSON output
+ digest = {
+ 'items': final_items,
+ 'featured_titles': [i['title'] for i in final_items[:6]],
+ 'guide': guide_structured,
+ }
+
+ (OUT_DIR / 'llm_digest.json').write_text(json.dumps(digest, ensure_ascii=False, indent=2), encoding='utf-8')
+ (OUT_DIR / 'blog_markdown.md').write_text(md, encoding='utf-8')
+ (OUT_DIR / 'chat_summary.txt').write_text(msg, encoding='utf-8')
+ (OUT_DIR / 'run_meta.json').write_text(json.dumps({
+ 'date': TODAY,
+ 'slug': slug,
+ 'blog_url': blog_url,
+ 'public_ok': public_ok,
+ 'errors': errors,
+ 'aihot_sections': [s.get('label') for s in raw_daily.get('sections', [])],
+ 'raw_item_count': len(raw_items),
+ 'stage0_count': stage0_count,
+ 'final_item_count': len(final_items),
+ 'has_juya': any(i.get('source_group') == '橘鸦AI早报' for i in raw_items),
+ 'source_counts': source_counts,
+ 'featured_titles': digest.get('featured_titles', []),
+ }, ensure_ascii=False, indent=2), encoding='utf-8')
+
+ print(msg)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/script/blog_markdown.md b/script/blog_markdown.md
new file mode 100644
index 0000000..2d77cda
--- /dev/null
+++ b/script/blog_markdown.md
@@ -0,0 +1,198 @@
+## 导览
+
+> > 微软与OpenAI正式分家、Anthropic提交招股书、DeepSeek计划融500亿——AI行业正在从“联盟军”转向“诸侯争霸”。
+
+## 模型发布/更新
+
+**1. Grok Imagine 1.5 预览版发布**
+
+> Grok Imagine 1.5 预览版即日起在 API 中上线,SpaceXAI 持续发力。[X:@cb_doge ↗](https://x.com/cb_doge/status/2062242490745594085)
+
+**2. MiniMax M3 1M token 解码加速 15.6 倍**
+
+> MiniMax M3 在 1M token 下解码加速 15.6 倍,FireworksAI_HQ 提供推理支持。[X:@MiniMax_AI ↗](https://x.com/MiniMax_AI/status/2062316914618388758)
+
+**3. Miso One 开源语音模型:8B 参数、110ms 延迟、一次语音克隆**
+
+> Miso One 发布 8B 参数开源语音模型,支持一次语音克隆(短样本),推理延迟 110ms,权重已开源,可自托管,API 即将推出,演示已上线。[X:@kimmonismus ↗](https://x.com/kimmonismus/status/2062210845308780639)
+
+**4. Ideogram v4.0 发布:2K 分辨率和 JSON 提示支持**
+
+> Ideogram v4.0 发布,原生 2K 分辨率,文字渲染出色,支持 JSON 提示词,可在 Krea 中体验。[X:@krea_ai ↗](https://x.com/krea_ai/status/2062227837130887567)
+
+## 产品与工具
+
+**5. Meta 面向 WhatsApp Business 的 AI 智能体现已全球上线**
+
+> Meta 为 WhatsApp Business 推出的 AI 智能体面向全球商家开放,按模型 token 使用量收费。[TechCrunch ↗](https://techcrunch.com/2026/06/03/metas-ai-agent-for-whatsapp-business-is-now-available-globally)
+
+**6. NousResearch 发布 Hermes Agent 桌面应用公测版**
+
+> NousResearch 推出 Hermes Agent 桌面应用公测版。[X:@SiliconFlowAI ↗](https://x.com/SiliconFlowAI/status/2062042813852995899)
+
+**7. xAI Grok 语音模型上线 Vapi 平台**
+
+> xAI 的 Grok STT 和 TTS 语音模型登陆企业语音 AI 平台 Vapi,可用于构建自定义语音智能体。[X:@xai ↗](https://x.com/xai/status/2062209374039499178)
+
+**8. Grok 模型登陆 Cloudflare AI Gateway**
+
+> Grok 模型现已可在 Cloudflare AI Gateway 上试用。[X:@xai ↗](https://x.com/xai/status/2062294202625696081)
+
+**9. OpenShell v0.0.55 发布:新增 Vertex AI 推理支持**
+
+> OpenShell v0.0.55 发布,新增 Google Vertex AI 推理支持,改进策略可见性、Podman 检测和 GPU 沙箱行为。[X:@NVIDIAAI ↗](https://x.com/NVIDIAAI/status/2062210034109677665)
+
+**10. Replit 上线 SEO Agent 助应用被发现**
+
+> Replit 推出 SEO Agent,扫描应用并提供修复建议,帮助应用在网页和 AI 搜索中被发现。[X:@Replit ↗](https://x.com/Replit/status/2062211976995188871)
+
+**11. OpenClaw 2026.6.1 发布:新增 Windows 节点与技能工坊**
+
+> OpenClaw 2026.6.1 发布,新增原生 Windows 节点主机、技能工坊和工作板编排,支持 MiniMax M3。[X:@openclaw ↗](https://x.com/openclaw/status/2062288421406785710)
+
+**12. Reachy Mini 添加 MCP 工具**
+
+> Reachy Mini 推出公开 MCP canary Space,支持远程工具调用。[Hugging Face:Blog ↗](https://huggingface.co/blog/adding-mcp-tools-to-reachy-mini)
+
+**13. 刚刚,Meta Skill 来了**
+
+> GitHub 热门仓库 OpenSquilla 发布,代表 Meta Skill 新动向。[量子位 ↗](https://www.qbitai.com/2026/06/428335.html)
+
+## 开发与工程
+
+**14. Qwen Cloud 全球 AI 黑客马拉松启动**
+
+> 首届 Qwen Cloud 全球 AI 黑客马拉松启动,5 大赛道,总奖金超 7 万美元(赛道冠军 1 万美元),Devpost 报名。[X:@alibaba_cloud ↗](https://x.com/alibaba_cloud/status/2062113338994172169)
+
+**15. 洪水韧性新篇章:Google 开源水文建模框架**
+
+> Google Research 开源基于 PyTorch 的水文建模框架,采用 Flood Hub 相同架构,允许各国气象部门在本地训练 AI 洪水预报模型。[Google Research:Blog ↗](https://research.google/blog/the-next-chapter-in-flood-resilience-open-sourcing-googles-hydrology-framework)
+
+**16. 文章:导致 Spark 在 Kubernetes 上 OOM 失败的两个错误配置**
+
+> 迁移 Spark 到 AKS 后,两个配置交互导致 OOM:spark.kubernetes.local.dirs.tmpfs 使 shuffle spill 改用 RAM 而非磁盘。[InfoQ AI ↗](https://www.infoq.com/articles/spark-oom-kubernetes-misconfigurations/?utm_campaign=infoq_content&utm_source=infoq&utm_medium=feed&utm_term=AI%2C+ML+%26+Data+Engineering)
+
+## 行业与公司
+
+**17. 微软与 OpenAI 分道扬镳——如今双方准备正面交锋**
+
+> 微软与 OpenAI 合作关系破裂,进入直接竞争。微软 AI 主管 Mustafa Suleyman 称微软需独立证明能力。[The Verge ↗](https://www.theverge.com/ai-artificial-intelligence/942242/microsoft-build-ai-agents-openai-competition)
+
+**18. 欧盟公布全面技术主权计划,推动芯片与 AI 自主发展**
+
+> 欧盟推出技术主权计划,扩大本土半导体、AI 和云计算供应链,减少对美亚依赖。[Bloomberg ↗](https://www.bloomberg.com/news/articles/2026-06-03/europe-unveils-sweeping-tech-sovereignty-plan-to-boost-chips-ai)
+
+**19. Sensor Tower:OpenAI 旗下 ChatGPT 月活已破 10 亿,史上最快**
+
+> Sensor Tower 估计 ChatGPT 月活于 2025 年 5 月突破 10 亿,增速史上最快;Claude 月活 5600 万,同比增 640%。[IT之家 ↗](https://www.ithome.com/0/959/083.htm)
+
+**20. 消息称 DeepSeek 首轮融资拟筹集 500 亿元,腾讯、宁德时代等参投**
+
+> DeepSeek 首轮拟融资 500 亿元,投后估值 3500-4000 亿元。创始人梁文峰出资 200 亿,腾讯拟投 100 亿,宁德时代 50 亿。[IT之家 ↗](https://www.ithome.com/0/959/249.htm)
+
+**21. Suno 完成 4 亿美元 D 轮融资**
+
+> Suno 完成 4 亿美元 D 轮融资,估值 54 亿美元,致力于让更多人体验音乐制作。[X:@suno ↗](https://x.com/suno/status/2062183524887675243)
+
+**22. 宏利香港与阿里云达成 AI 战略合作**
+
+> 宏利香港与阿里云建立战略合作,共建负责任 AI 创新框架,加速 AI 部署。[X:@alibaba_cloud ↗](https://x.com/alibaba_cloud/status/2062006591377829922)
+
+**23. 优步每月 1,500 美元的 AI 使用上限为 AI 工具定价提供参考**
+
+> 优步将 AI 工具月使用上限设为 1500 美元,为行业 AI 定价提供参考信号。[Simon Willison ↗](https://simonwillison.net/2026/Jun/3/uber-caps-usage)
+
+**24. 世界模型榜首易主!跨维智能登顶 WorldArena**
+
+> 跨维智能在 WorldArena 上登顶,成为世界模型新榜首。[量子位 ↗](https://www.qbitai.com/2026/06/428435.html)
+
+**25. 刚刚,Anthropic 提交了招股书!**
+
+> Anthropic 已提交招股书,预计最快 Q4 上市。[量子位 ↗](https://www.qbitai.com/2026/06/428407.html)
+
+## 论文与研究
+
+**26. 斯坦福大学法学院研究:人工智能的表现优于法学教授**
+
+> 斯坦福大学法学院研究显示,AI 表现优于法学教授,该结果在 Hacker News 获 104 个 Points。[law.stanford.edu ↗](https://law.stanford.edu/press/ai-outperforms-law-professors-in-stanford-law-study)
+
+**27. NVIDIA Research 在 CVPR 2026 发表三篇论文:规模化训练实现抓取、自动驾驶与智能体泛化**
+
+> NVIDIA Research 在 CVPR 2026 发表三篇论文:零样本抓取模型 GraspGen-X、自动驾驶 LCDrive、具身智能体 NitroGen,均基于大规模训练。[blogs.nvidia.com:Blog ↗](https://blogs.nvidia.com/blog/cvpr-research-grasping-driving-agent-training)
+
+**28. Anthropic 分析 832 个 AI 恶意账户:中高风险攻击者半年从 33% 跃至 56%**
+
+> Anthropic 分析 832 个被封恶意账户,67.3% 使用 AI 编写恶意软件,中高风险占比半年内从 33% 升至 56%,传统威胁评估失效。[Anthropic ↗](https://www.anthropic.com/news/AI-enabled-cyber-threats-mitre-attack)
+
+**29. 微软研究:装瓶厂 AI 从聊天到决策**
+
+> 微软在中西部装瓶厂试点三个月显示,AI 超越聊天进入决策领域,需应对真实风险和可靠性要求。[X:@MSFTResearch ↗](https://x.com/MSFTResearch/status/2062204914223169635)
+
+**30. 世界模型的功能分类**
+
+> World Labs 与李飞飞发文梳理“世界模型”概念,基于 POMDP 框架分类,指出当前所谓世界模型本质是同一循环的不同投影(如渲染器)。[X:@drfeifei ↗](https://x.com/drfeifei/status/2062247238143996275)
+
+**31. 从看懂世界到做对动作,卧安机器人 OneModel 1.7 用一条「隐式通路」打通了具身智能的关键断层**
+
+> 卧安机器人 OneModel 1.7 通过隐式通路在潜在空间完成信息传导,打通具身智能关键断层。[量子位 ↗](https://www.qbitai.com/2026/06/428703.html)
+
+## 人物与花絮
+
+**32. 黄仁勋与纳德拉共议智能体 AI 时代**
+
+> 黄仁勋与纳德拉在台北 MSBuild 同台,展示 NVIDIA 与微软从 Windows 到 AI 工厂的协作。[X:@nvidia ↗](https://x.com/nvidia/status/2062228974273716457)
+
+**33. Satya Nadella 谈微软 Build 大会主旨演讲**
+
+> Satya Nadella 在 Microsoft Build 主旨演讲,强调共同构建前沿智能生态系统。[X:@satyanadella ↗](https://x.com/satyanadella/status/2062022060176801826)
+
+**34. Karpathy 的 llm-wiki 项目获超五千星**
+
+> @karpathy 的 llm-wiki 项目几周内获 5000+ 星,理念是让 LLM 构建并维护可持续进化的维基知识库。[X:@SiliconFlowAI ↗](https://x.com/SiliconFlowAI/status/2062054848762450324)
+
+## 观点与教程
+
+**35. 智能体工程实战窍门全录**
+
+> @mvanhorn 分享智能体工程方法论:人主导方向、智能体执行,核心为 plan.md 约束行为,总结 22 条实战技巧及完整工具栈。[X:@shao__meng ↗](https://x.com/shao__meng/status/2061974983094755575)
+
+**36. Anthropic 用 Claude 赋能自助数据分析**
+
+> Anthropic 用 Claude 自动化 95% 业务分析查询,准确率约 95%,通过智能体分析栈解决概念-实体歧义等三大错误来源。[Claude:Blog ↗](https://claude.com/blog/how-anthropic-enables-self-service-data-analytics-with-claude)
+
+**37. 超越聊天机器人的直接偏好优化**
+
+> Dharma-AI 在 Hugging Face 博客发文,探讨直接偏好优化(DPO)在聊天机器人之外的广泛应用。[Hugging Face:Blog ↗](https://huggingface.co/blog/Dharma-AI/direct-preference-optimization-beyond-chatbots)
+
+**38. 演讲:选择你的 AI 副驾驶:最大化开发效率**
+
+> Sepehr Khosravi 探讨开发效率工具演变,评估 Cursor 和 Claude Code 等优势,为高级工程师提供可行技巧。[InfoQ AI ↗](https://www.infoq.com/presentations/choosing-ai-copilot/?utm_campaign=infoq_content&utm_source=infoq&utm_medium=feed&utm_term=AI%2C+ML+%26+Data+Engineering)
+
+## 总结
+
+**强信号**
+
+- **微软与OpenAl分道扬镳,双方开始正面竞争**
+ 合作终结后,微软AI主管Mustafa Suleyman称公司必须独立证明能力,这意味着微软将不再依赖OpenAI的模型,而是全力押注自研,OpenAI也失去最大云盟友。
+
+- **Anthropic提交招股书,预计最快Q4上市**
+ 这标志着安全派AI公司正式进入资本市场,与OpenAI争夺投资者注意,Claude的月活同比增长640%也为其估值提供了底气。
+
+- **ChatGPT月活突破10亿,成为史上增长最快的应用**
+ Sensor Tower数据显示ChatGPT在2025年5月达到这一里程碑,Claude月活5600万,两家头部消费级AI应用的用户粘性正在拉开差距。
+
+**中信号**
+
+- **Miso One发布8B开源语音模型,支持一次语音克隆且延迟仅110ms**
+ 权重已开放、可自托管,意味着实时语音克隆的门槛从专有API降到了个人部署,可能加速语音交互在开发者中的普及。
+
+- **欧盟公布全面技术主权计划,推动芯片与AI自主发展**
+ 计划扩大本土半导体、AI和云计算供应链,目标减少对美亚依赖——这将对全球AI公司的合规、市场准入和数据主权产生实质影响。
+
+**待验证**
+
+- **DeepSeek首轮融资拟筹500亿元,腾讯、宁德时代参投**
+ 投后估值高达3500-4000亿元,但融资消息来源为IT之家,未见官方确认。如此大体量的AI融资在国内市场是否顺利落地,存在不确定性。
+
+- **跨维智能登顶WorldArena世界模型榜首**
+ WorldArena的评测权威性尚未被广泛验证,且“世界模型”概念本身缺乏统一标准,需要看后续是否有独立第三方复现其能力。
\ No newline at end of file
diff --git a/script/run_meta.json b/script/run_meta.json
new file mode 100644
index 0000000..eba646f
--- /dev/null
+++ b/script/run_meta.json
@@ -0,0 +1,35 @@
+{
+ "date": "2026-06-04",
+ "slug": "ai-2026-06-04",
+ "blog_url": "https://blog.ephron.ren/posts/ai-2026-06-04",
+ "public_ok": true,
+ "errors": [
+ "橘鸦AI早报(重试): TimeoutError"
+ ],
+ "aihot_sections": [
+ "模型发布/更新",
+ "产品发布/更新",
+ "行业动态",
+ "论文研究",
+ "技巧与观点"
+ ],
+ "raw_item_count": 39,
+ "stage0_count": 39,
+ "final_item_count": 38,
+ "has_juya": false,
+ "source_counts": {
+ "AI HOT": 32,
+ "InfoQ AI": 2,
+ "MIT科技评论AI": 0,
+ "量子位": 5,
+ "橘鸦AI早报": 0
+ },
+ "featured_titles": [
+ "Grok Imagine 1.5 预览版发布",
+ "MiniMax M3 1M token 解码加速 15.6 倍",
+ "Miso One 开源语音模型:8B 参数、110ms 延迟、一次语音克隆",
+ "Ideogram v4.0 发布:2K 分辨率和 JSON 提示支持",
+ "Meta 面向 WhatsApp Business 的 AI 智能体现已全球上线",
+ "NousResearch 发布 Hermes Agent 桌面应用公测版"
+ ]
+}
\ No newline at end of file
diff --git a/skill/SKILL.md b/skill/SKILL.md
new file mode 100644
index 0000000..2a38bc5
--- /dev/null
+++ b/skill/SKILL.md
@@ -0,0 +1,127 @@
+---
+name: ai-daily-report-pipeline
+description: |
+ Maintain and operate the AI daily report cron pipeline (ai_daily_blog_pipeline.py).
+ Covers: script configuration, LLM prompt tuning, data sources, timeout settings,
+ output format, and publishing workflow.
+trigger:
+ - AI日报 / AI daily report / daily briefing
+ - ai_daily_blog_pipeline.py
+ - cron job 76297415d88d
+ - ai_morning_out / 橘鸦 / AI HOT
+---
+
+# AI Daily Report Pipeline
+
+Automated daily AI news digest that publishes to `blog.ephron.ren`.
+
+## Architecture
+
+- **Script**: `~/.hermes/scripts/ai_daily_blog_pipeline.py` (~1100 lines)
+- **Output dir**: `~/.hermes/scripts/ai_morning_out/`
+- **Cron job**: `76297415d88d`, schedule `0 10 * * *` (10:00 CST daily)
+- **Delivery**: `origin` (back to the chat that created it)
+- **Mode**: `no_agent: true` (script-only, no LLM wrapper)
+
+## Data Sources
+
+| Source | Type | Notes |
+|--------|------|-------|
+| AI HOT | API (aihot skill) | Primary, category-specific |
+| 橘鸦 AI 早报 | RSS (content:encoded) | Publishes ~09:34. Parsed from RSS `content:encoded` field (no article page fetch). 45s timeout. |
+| InfoQ AI | RSS | `feed.infoq.com/ai-ml-data-eng/` |
+| MIT 科技评论 AI | RSS | `technologyreview.com/topic/.../feed` |
+| 量子位 | RSS | `qbitai.com/feed` |
+
+## Processing Pipeline (4-Stage Architecture)
+
+Stages are designed to maximize script work, minimize LLM calls.
+
+### Stage 0: Script Dedup (pure Python)
+- Normalize titles: strip punctuation, lowercase, remove stopwords
+- Remove exact duplicates (title match or Jaccard > 0.85)
+- No LLM involved — deterministic
+
+### Stage 1: LLM Semantic Dedup
+- Single LLM call to find semantically equivalent items (e.g. same news from different sources)
+- Input: `{index, title, summary}` for each item
+- Output: `{duplicates: [{keep: 0, remove: [1,2]}, ...]}`
+- Removes less-detailed version of each duplicate pair
+
+### Stage 2: Parallel Summary Rewrite + Classify (2 concurrent LLM calls)
+- **Stage 2a**: Rewrite summaries + translate titles to Chinese
+ - Brand/model names preserved in English (GPT-5, Codex, etc.)
+ - Other title text translated to Chinese
+ - Summary: max 120 chars, concise Chinese
+ - Output: `{summaries: [{index, title, summary}, ...]}`
+- **Stage 2b**: Classify items into sections
+ - Sections: 模型与技术, 产品与工具, 开发与工程, 行业与公司, 研究与发现, 观点与评论
+ - Output: `{classifications: [{index, section}, ...]}`
+- Both run in parallel via `concurrent.futures.ThreadPoolExecutor`
+
+### Stage 3: LLM Guide Generation
+- Single LLM call for "今日观察" (observation/analysis)
+- Input: all item titles + summaries
+- Output: `{guide: [{type, text}, ...]}` JSON array
+- Types: `theme` (1), `strong` (2-3), `medium` (1-2), `risk` (1-2)
+- NO `advice` type
+
+### Stage 4: Script Assemble + Publish (pure Python)
+- Merge Stage 2a output (titles+summaries) with Stage 2b output (sections)
+- Assemble markdown: 导览 → 分类新闻 → 总结
+- Publish via Service API (create → publish → PATCH)
+
+## User Preferences (CRITICAL)
+
+- **NO CURATION / NO SELECTION**: Only filter exact duplicates. ALL non-duplicate items must be preserved. Do NOT use words like "精选" (curated/selected) in output. The user explicitly rejected any editorial filtering beyond deduplication.
+- **No emoji** in the output
+- **No reference numbers** like [1][3] — readers can't see what they point to. Strip all `[N]` from guide text via `clean_guide_text()`.
+- **No "主线判断:" prefix** in 导览 section — strip via regex `r'^主线判断[::]\s*'` in `clean_guide_text()`.
+- **No advice/suggestions** section — no "开发者应该..." type content. Guide types are: theme, strong, medium, risk ONLY.
+- **Concrete not generic** — avoid vague statements like "行业焦点转向XX". Point to specific events.
+- **Plain language** — no academic/formal tone, use 大白话
+- **Concise** — each guide item 2-3 sentences max
+- **Readable formatting** — summary section uses type labels as headers, then bullet-list format:
+ ```
+ **强信号**
+ - **标题**
+ 内容...
+ ```
+- Guide format: `[{type, text}]` JSON array. Types: `theme` (1), `strong` (2-3), `medium` (1-2), `risk` (1-2). NO `advice` type.
+- Structure: 导览 (blockquote, no prefix) → 新闻 → 总结 (type labels + bullet list, grouped by type)
+- Links must be verified accessible before inclusion
+
+## Key Configuration
+
+- **Cron timeout**: `cron.script_timeout_seconds: 600` (in `~/.hermes/config.yaml`)
+- **LLM urllib timeout**: 600s (in script, `urllib.request.urlopen(req, timeout=600)`)
+- **RSS fetch timeout**: 25s per regular feed. 橘鸦: 45s (GitHub Pages, 262KB RSS).
+- **LLM API**: follows the active Hermes model config by default. Current production path is Sub2API (`~/.hermes/config.yaml` → `model.provider: sub2api`, `model.default: findmini/gpt-5.5`, `model.base_url: http://sub2api.ephron.ren/v1`) with `SUB2API_API_KEY` from `~/.hermes/.env`. Keep API key, base_url, and model from the same provider family; do not mix `SUB2API_API_KEY` with Xiaomi/MiMo `base_url`, or the LLM stages will fail with 401 Unauthorized.
+- **max_items**: 30 in `_prefilter_items` — controls LLM prompt size; 38 items worked fine, 30 is conservative
+
+## Pitfalls
+
+1. **Config file is protected**: `~/.hermes/config.yaml` cannot be edited with `patch` tool. Use `sed -i 's/old/new/' ~/.hermes/config.yaml` via terminal.
+2. **橘鸦 timing**: Publishes ~09:34 CST. Script sleeps 120s if empty. Don't run before 10:00.
+3. **橘鸦 regex bug (fixed 2026-06-04)**: The `block_pattern` regex had `\\s*` (two backslashes in source = literal backslash in regex) before `` instead of `\s*` (one backslash = whitespace class). This caused the regex to never match any 橘鸦 items, silently returning empty results. The `first_real_block` qwen-ID regex was also dead (site migrated away from Qwen IDs). **Fix**: (a) split into `fetch_juya_rss` + `parse_juya`; (b) parse from RSS `content:encoded` eliminating the second HTTP fetch; (c) changed escaped backslash to whitespace class; (d) changed `.*?` to `[^<]*?` to prevent overview section from leaking into matches (the overview `概览
` has no `#N`, but the lazy `.*?` would cross the h2 boundary to find it).
+4. **橘鸦 timeout**: Now uses 45s timeout (up from 25s) because GitHub Pages can be slow and the RSS feed is ~262KB. Content is parsed from RSS `content:encoded` to avoid a second HTTP request for the article page. Falls back to fetching the article page if `content:encoded` is unavailable.
+5. **MiMo token limit**: With the 4-stage architecture, each LLM call handles a smaller prompt (dedup ~3K, summary ~6K, classify ~3K, guide ~5K). max_items=30 is safe. Old single-call approach needed max_items=18.
+6. **Gateway restart needed**: After config changes, `systemctl --user restart hermes-gateway` is required.
+7. **Timeout tuning (USER IS VERY SENSITIVE)**: User explicitly demands timeouts set to 1.5-2x of theoretical time. Being conservative causes repeated failures and user frustration. If theoretical time is ~80s, set timeout to 600s. Never start low and increment — go generous from the start. User said: "一直超时太影响体验了".
+8. **LLM prompt anti-patterns**: Never instruct LLM to "精选" (curate/select). Never ask for [N] reference numbers. Never include "建议" (advice) section. Never include "主线判断:" prefix in theme text. These all produce unwanted output.
+9. **Title translation**: Stage 2a MUST translate English titles to Chinese. Brand/model names (GPT-5, Codex, Gemini, etc.) are preserved in English. All other title text translated. If titles come back in English, check that the Stage 2a prompt includes explicit title translation instruction and the output format includes `"title"` field.
+10. **patch tool and regex**: The `patch` tool's Escape-drift detection can interfere with multi-backslash regex patterns. For complex regex changes in the pipeline script, use `terminal` with `sed -i` or a Python script that reads/writes the file directly.
+
+## Files
+
+- `run_meta.json` — last run metadata (date, slug, url, errors, source counts)
+- `raw_items.json` — raw fetched items
+- `llm_digest.json` — LLM output
+- `blog_markdown.md` — rendered blog post
+
+## References
+
+- `references/timeout-config.md` — timeout values and tuning rules for all script stages
+- `references/llm-config-auto-follow.md` — how the script auto-follows Hermes model config
+- `references/mimo-api-performance.md` — MiMo API performance characteristics
+- `references/rendering-guide.md` — blog post rendering rules
\ No newline at end of file
diff --git a/skill/references/llm-config-auto-follow.md b/skill/references/llm-config-auto-follow.md
new file mode 100644
index 0000000..08e781b
--- /dev/null
+++ b/skill/references/llm-config-auto-follow.md
@@ -0,0 +1,29 @@
+# AI Daily Pipeline — LLM Config Auto-Follow (2026-05-30)
+
+## Problem
+The daily report script had hardcoded `XIAOMI_API_KEY` / `XIAOMI_BASE_URL` env vars. When the user switches Hermes' main model provider, the script would still use the old provider unless manually updated.
+
+## Solution: `resolve_llm_config(env)`
+Added to `ai_daily_blog_pipeline.py` (replaces hardcoded reads in `llm_call()`):
+
+```python
+def resolve_llm_config(env: dict):
+ """Read Hermes config to get the active provider's API key, base_url, and model."""
+ # 1. Read ~/.hermes/config.yaml → model.provider, model.base_url, model.default
+ # 2. Read ~/.hermes/auth.json → credential_pool[provider].source (e.g. "env:XIAOMI_API_KEY")
+ # 3. Resolve env var name → actual key from .env
+ # 4. Fallback to LLM_API_KEY / XIAOMI_API_KEY if auth.json lookup fails
+ return api_key, base_url, model_name
+```
+
+## Config Sources (priority order)
+1. `~/.hermes/config.yaml` → `model.provider`, `model.base_url`, `model.default`
+2. `~/.hermes/auth.json` → `credential_pool[provider][0].source` (format: `env:VAR_NAME`)
+3. `~/.hermes/.env` → actual key value
+4. Legacy fallback: `LLM_API_KEY` / `XIAOMI_API_KEY` / `LLM_BASE_URL` / `LLM_MODEL`
+
+## Usage
+When user runs `hermes config set model.provider=minimax`, the daily report script automatically uses MiniMax's API key and endpoint on the next run. No script changes needed.
+
+## Pitfall
+The script needs `import yaml` — ensure `PyYAML` is installed. It's available in the Hermes venv but may not be in system Python.
diff --git a/skill/references/mimo-api-performance.md b/skill/references/mimo-api-performance.md
new file mode 100644
index 0000000..2584a2e
--- /dev/null
+++ b/skill/references/mimo-api-performance.md
@@ -0,0 +1,55 @@
+# MiMo-v2.5-pro API Performance Profile
+
+Empirically tested on `https://token-plan-sgp.xiaomimimo.com/v1` (2026-05-29).
+
+## Latency by Prompt Size
+
+| Prompt Size | Items | Response Time | Status |
+|-------------|-------|---------------|--------|
+| ~500 chars | 1-2 | 2-4s | ✅ Reliable |
+| ~4,500 chars | 15 | ~73s | ✅ OK |
+| ~7,400 chars | 25 | >120s | ❌ Timeout |
+| ~10,900 chars | 35 | >120s | ❌ Timeout |
+| ~19,000 chars | 65-70 | >150s | ❌ Timeout |
+
+## Key Constraints
+
+- **Max reliable prompt size: ~5K chars / ~18 items** for structured output tasks
+- Output token generation is slow (~50-80 tokens/s for large JSON outputs)
+- Simple prompts (<1K) are fast and reliable (2-4s)
+- Latency is **highly variable** — same prompt can take 73s or timeout at 150s
+- Temperature 0.2 used for structured output consistency
+
+## Implications for Cron Jobs
+
+- **Pre-filter aggressively** before sending to LLM: dedupe + source priority + cap at 18 items
+- **Cron timeout 300s** budget: ~35s data fetch + ~80s LLM = ~115s typical, but retries can push to 250s+
+- Set LLM urllib timeout to **150s** (not 300s — it won't help, just wastes cron budget)
+- **Retry 2x max** (not 3x) to stay within 300s cron budget
+- If LLM consistently times out, check if API is rate-limited (test with simple prompt first)
+
+## Workaround: Pre-filter Pattern
+
+```python
+def _prefilter_items(raw_items, max_items=18):
+ """Dedupe + prioritize before LLM call."""
+ seen = set()
+ filtered = []
+ priority_sources = {'AI HOT': 1, '橘鸦AI早报': 1, 'InfoQ AI': 2, '量子位': 2}
+ sorted_items = sorted(raw_items, key=lambda r: priority_sources.get(r.get('source_group', ''), 3))
+ for item in sorted_items:
+ norm = re.sub(r'[^\w\u4e00-\u9fff]+', '', item['title_raw'].lower())
+ if not norm or len(norm) < 3 or norm in seen:
+ continue
+ seen.add(norm)
+ filtered.append(item)
+ if len(filtered) >= max_items:
+ break
+ return filtered
+```
+
+## Alternative Providers (tested same day)
+
+- **Findmini (gpt-5.4)**: `https://api.findmini.top/gpt/v1` — returned 503
+- **OpenRouter (free models)**: returned 429 rate limit
+- **MiMo small prompts**: consistently 2-4s, reliable for simple tasks
diff --git a/skill/references/rendering-guide.md b/skill/references/rendering-guide.md
new file mode 100644
index 0000000..43e8bb2
--- /dev/null
+++ b/skill/references/rendering-guide.md
@@ -0,0 +1,65 @@
+# Rendering & Guide Formatting Reference
+
+## `clean_guide_text(text)` function (in `blog_markdown()`)
+
+Strips unwanted artifacts from LLM-generated guide text:
+
+```python
+def clean_guide_text(text):
+ # Strip all [N] reference numbers
+ text = re.sub(r'\[\d+\]', '', text)
+ text = re.sub(r'\[N\]', '', text).strip()
+ # Strip "主线判断:" prefix
+ text = re.sub(r'^主线判断[::]\s*', '', text)
+ # Clean extra whitespace
+ text = re.sub(r'\s+', ' ', text).strip()
+ return text
+```
+
+## Summary section rendering
+
+Type labels map: `{'strong': '强信号', 'medium': '中信号', 'risk': '待验证'}`
+
+Output format per type group:
+```
+## 总结
+
+**强信号**
+
+- **标题(从text第一句提取)**
+ 解释内容...
+
+- **标题**
+ 解释内容...
+
+**中信号**
+
+- **标题**
+ 解释内容...
+
+**待验证**
+
+- **标题**
+ 解释内容...
+```
+
+Title extraction logic:
+1. Try splitting on `:` or `:` — if prefix < 60 chars, use as title
+2. Otherwise, split on `。!?` and use first sentence as title
+
+## Title translation (Stage 2a)
+
+Titles are translated from English to Chinese in Stage 2a. Rules:
+- Brand names preserved: GPT-5, Codex, Gemini, OpenAI, Meta, etc.
+- Technical terms with no good Chinese equivalent: keep English
+- Everything else: translate to natural Chinese
+- LLM prompt explicitly states: "英文品牌名/模型名保留原样,其余翻译为中文"
+
+## LLM prompt for guide (as of 2026-05-30)
+
+Key instructions to LLM:
+- 不要空泛总结(如"行业焦点转向XX"),要指向具体事件
+- 不要引用编号如[1][3],读者看不到对应关系
+- 不要建议("开发者应该..."之类删掉)
+- 每条控制在2-3句话以内
+- 用大白话,不要学术腔
diff --git a/skill/references/timeout-config.md b/skill/references/timeout-config.md
new file mode 100644
index 0000000..2ff8ac5
--- /dev/null
+++ b/skill/references/timeout-config.md
@@ -0,0 +1,34 @@
+# Timeout Configuration Reference
+
+## Timeout Locations
+
+| Setting | Location | Current Value | Notes |
+|---------|----------|---------------|-------|
+| Script total timeout | `~/.hermes/config.yaml` → `cron.script_timeout_seconds` | 600s | Max time for entire script execution |
+| LLM urllib timeout | `ai_daily_blog_pipeline.py` → `llm_call()` → `urlopen(timeout=...)` | 600s | Single LLM API call timeout |
+| RSS fetch timeout | `ai_daily_blog_pipeline.py` → `fetch_text()` → `urlopen(timeout=...)` | 25s | Per-RSS-feed fetch |
+| 橘鸦 RSS timeout | `ai_daily_blog_pipeline.py` → `fetch_juya_rss()` → `urlopen(timeout=...)` | 45s | GitHub Pages can be slow; 262KB RSS |
+| 橘鸦 fallback page timeout | `ai_daily_blog_pipeline.py` → `parse_juya()` → `urlopen(timeout=...)` | 45s | Only used if content:encoded unavailable |
+| Service API timeout | `ai_daily_blog_pipeline.py` → `blog_api_request()` → `urlopen(timeout=...)` | 25s | Blog publish API call |
+| 橘鸦 wait timeout | `ai_daily_blog_pipeline.py` → sleep(120) | 120s | Wait if 橘鸦 RSS is empty |
+
+## Timeout Tuning Rules
+
+1. **Always set generously** — user explicitly wants 1.5-2x theoretical time minimum
+2. **MiMo API is slow** for long prompts — 18 items with 600s timeout works; 30+ items times out even at 600s
+3. **Config file is protected** — use `sed -i` via terminal, not `patch` tool
+4. **Gateway restart required** after config changes: `systemctl --user restart hermes-gateway`
+
+## Theoretical Timing
+
+- Script without LLM: ~10-15s (fetch + parse + publish)
+- LLM call (18 items): ~60-120s typically, can spike to 300s+
+- Total theoretical: ~80-150s
+- Recommended timeout: 600s (generous, accounts for API variability)
+
+## If Timeout Still Occurs
+
+1. Check `run_meta.json` → `llm_error` field
+2. If `TimeoutError: The read operation timed out` → LLM API is slow
+3. Check if `max_items` was increased — more items = longer LLM time
+4. Consider reducing `max_items` in `_prefilter_items()` back to 18