Files
ai-daily-report/script/ai_daily_blog_pipeline.py

1105 lines
44 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
import difflib
import json
import os
import re
import sys
import time
import urllib.request
import urllib.error
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path
from urllib.parse import urlparse
# Browser-like User-Agent sent when fetching feeds, article pages and the blog API.
UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
# Fixed UTC+8 offset; parsed publication dates are converted to this zone.
CST = timezone(timedelta(hours=8))
# NOTE: NOW / TODAY / SINCE are evaluated once, at import time.
NOW = datetime.now(CST)
TODAY = NOW.date().isoformat()
# RSS entries with a publication date older than this cutoff (30 hours) are skipped.
SINCE = NOW - timedelta(hours=30)
SCRIPT_DIR = Path.home() / '.hermes' / 'scripts'
OUT_DIR = SCRIPT_DIR / 'ai_morning_out'
# Import-time side effect: make sure the output directory exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)
# Display name -> feed URL for the supplementary RSS sources.
RSS_FEEDS = {
'InfoQ AI': 'https://feed.infoq.com/ai-ml-data-eng/',
'MIT科技评论AI': 'https://www.technologyreview.com/topic/artificial-intelligence/feed',
'量子位': 'https://www.qbitai.com/feed',
}
# Feed of the Juya AI daily; the issue is located by an item whose title equals the date.
JUYA_RSS = 'https://imjuya.github.io/juya-ai-daily/rss.xml'
# Section names, in the order they are rendered in the blog post.
SECTION_ORDER = ['模型发布/更新', '产品与工具', '开发与工程', '行业与公司', '论文与研究', '人物与花絮', '观点与教程']
# ─── Data collection (unchanged) ────────────────────────────────────────────
def fetch_text(url: str) -> str:
    """Download ``url`` and return the response body as text.

    The request carries the module-level ``UA`` header and times out after
    25 seconds. Bytes are decoded as UTF-8; undecodable bytes are dropped.
    Network and HTTP errors from ``urllib`` propagate to the caller.
    """
    request = urllib.request.Request(url, headers={'User-Agent': UA})
    with urllib.request.urlopen(request, timeout=25) as response:
        raw = response.read()
    return raw.decode('utf-8', 'ignore')
def parse_pubdate(text: str):
    """Parse an RFC 2822 style date string into a ``CST`` datetime.

    Returns ``None`` for empty input or when parsing/conversion fails.
    A parsed value without timezone information is treated as UTC.
    """
    if not text:
        return None
    try:
        parsed = parsedate_to_datetime(text)
        # Naive timestamps are assumed to be UTC before conversion.
        aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
        return aware.astimezone(CST)
    except Exception:
        return None
def clean_text(s: str) -> str:
    """Reduce an HTML fragment to single-spaced plain text.

    Tags are replaced by spaces, HTML character references are decoded and
    every run of whitespace collapses to one space. ``None`` is treated as
    an empty string.

    Previously only ``&nbsp;`` and ``&amp;`` were decoded, so references such
    as ``&#8217;`` or ``&quot;`` leaked into titles and summaries.
    """
    import html  # local import: keeps this block self-contained

    text = re.sub(r'<[^>]+>', ' ', s or '')
    # Decode after tag stripping so an escaped "&lt;" is never mistaken for markup.
    # "&nbsp;" becomes U+00A0, which the \s+ collapse below turns into a space.
    text = html.unescape(text)
    return re.sub(r'\s+', ' ', text).strip()
def source_name_from_url(url: str, fallback: str = '来源') -> str:
    """Map a URL to a human-readable source name.

    The host is lower-cased and a leading ``www.`` is dropped. A known domain
    (or any subdomain of it) maps to its display name; an unknown host is
    returned as-is. ``fallback`` is returned when the URL is empty or has no
    host.
    """
    if not url:
        return fallback
    host = (urlparse(url).netloc or '').lower()
    if host.startswith('www.'):
        host = host[len('www.'):]
    known_sources = {
        'x.com': 'X', 'twitter.com': 'X', 'github.com': 'GitHub', 'github.blog': 'GitHub Blog',
        'openrouter.ai': 'OpenRouter', 'anthropic.com': 'Anthropic', 'cursor.com': 'Cursor',
        'technologyreview.com': 'MIT科技评论AI', 'the-decoder.com': 'The Decoder', 'xiaohongshu.com': '小红书',
        'mp.weixin.qq.com': '微信文章', 'qbitai.com': '量子位', 'ithome.com': 'IT之家', 'browse.sh': 'Browse.sh',
        'huggingface.co': 'Hugging Face', 'openai.com': 'OpenAI', 'claude.com': 'Claude',
        'theverge.com': 'The Verge', 'infoq.com': 'InfoQ', 'research.google': 'Google Research',
        'simonwillison.net': 'Simon Willison', 'runwayml.com': 'Runway', 'perplexity.ai': 'Perplexity',
        'venturebeat.com': 'VentureBeat', 'arxiv.org': 'arXiv', 'reuters.com': '路透社',
        'bloomberg.com': 'Bloomberg', 'techcrunch.com': 'TechCrunch', 'wired.com': 'Wired',
        'deepseek.com': 'DeepSeek', 'baidu.com': '百度', 'alibaba.com': '阿里',
    }
    # First entry (in insertion order) whose domain equals or is a parent of the host.
    matched = next(
        (label for domain, label in known_sources.items()
         if host == domain or host.endswith('.' + domain)),
        None,
    )
    if matched is not None:
        return matched
    return host if host else fallback
def x_username_from_url(url: str) -> str:
    """Return the account name from an X/Twitter URL, or '' when there is none.

    Example: ``https://x.com/OpenAIDevs/status/...`` -> ``OpenAIDevs``.
    URLs on other hosts, and paths whose first segment is a reserved site
    route (``i``, ``search``, ...), yield an empty string.
    """
    if not url:
        return ''
    parsed = urlparse(url)
    host = (parsed.netloc or '').lower()
    if host.startswith('www.'):
        host = host[4:]
    if host != 'x.com' and host != 'twitter.com':
        return ''
    reserved = ('i', 'search', 'explore', 'settings', 'notifications', 'home', 'compose')
    segments = [segment for segment in parsed.path.split('/') if segment]
    if not segments or segments[0] in reserved:
        return ''
    return segments[0]
def smart_source_label(url: str, api_source_name: str = '') -> str:
    """Build a descriptive source label for an item.

    Resolution order:
      1. X/Twitter URLs -> ``X@<username>``.
      2. Any URL with a resolvable host -> the name from
         ``source_name_from_url``, with a ``Blog`` suffix for blog/research
         pages or a ``:官网动态`` suffix for landing pages.
      3. Otherwise ``api_source_name``, or ``'AI HOT'`` when that is empty.

    Fix: the ``Blog`` suffix is no longer appended to a name that already
    ends with "Blog" (``github.blog`` used to produce "GitHub BlogBlog").
    The former generic-label check in step 3 was removed because both of its
    branches returned the same value.
    """
    x_user = x_username_from_url(url)
    if x_user:
        return f'X@{x_user}'
    url_name = source_name_from_url(url, '')
    if url_name and url_name != '来源':
        parsed = urlparse(url)
        host = (parsed.netloc or '').lower()
        path = (parsed.path or '').lower()
        if 'blog' in host or '/blog' in path or '/research' in path:
            # Avoid doubling the suffix for names such as "GitHub Blog".
            if url_name.lower().endswith('blog'):
                return url_name
            return f'{url_name}Blog'
        if '/index' in path or path.rstrip('/') in ('', '/about', '/products'):
            return f'{url_name}:官网动态'
        return url_name
    return api_source_name or 'AI HOT'
def parse_aihot(today: str):
    """Fetch the AI HOT daily JSON for ``today`` and flatten it into raw items.

    Returns ``(items, data)`` where ``items`` is a list of raw-item dicts
    (section entries first, then flashes) and ``data`` is the decoded API
    payload. Errors from ``fetch_text`` (including ``HTTPError``) and from
    JSON decoding propagate to the caller.

    Fix: ``sections`` and per-section ``items`` now tolerate JSON ``null``
    the same way ``flashes`` already did, and the two identical item builders
    were merged into one helper.
    """
    url = f'https://aihot.virxact.com/api/public/daily/{today}'
    data = json.loads(fetch_text(url))
    generated = data.get('generatedAt')

    def build_item(entry, origin_type, section_hint):
        """Convert one API entry into the pipeline's raw-item dict."""
        entry_url = (entry.get('sourceUrl') or '').strip()
        api_src = clean_text(entry.get('sourceName', ''))
        return {
            'source_group': 'AI HOT',
            'source_label': smart_source_label(entry_url, api_src),
            'title_raw': clean_text(entry.get('title', '')),
            'summary_raw': clean_text(entry.get('summary', '')),
            'url': entry_url,
            # The API's generation time is used as the publication time of every entry.
            'published_at': generated,
            'origin_type': origin_type,
            'section_hint': section_hint,
            'language_hint': 'zh',
        }

    items = []
    for sec in data.get('sections') or []:
        label = sec.get('label') or ''
        for entry in sec.get('items') or []:
            items.append(build_item(entry, 'aihot_json', label))
    for flash in data.get('flashes') or []:
        items.append(build_item(flash, 'aihot_flash', '快讯'))
    return items, data
def parse_rss(name: str, url: str):
    """Fetch an RSS feed and return its recent entries as raw-item dicts.

    Only the first 20 ``<item>`` elements under ``<channel>`` are examined.
    Entries with a parseable ``pubDate`` older than ``SINCE`` and entries
    without a title are skipped. ``language_hint`` is ``'en'`` when the title
    plus summary contain more ASCII letters than CJK characters, else ``'zh'``.
    """
    root = ET.fromstring(fetch_text(url))
    channel = root.find('channel')
    if channel is None:
        return []
    entries = []
    for node in channel.findall('item')[:20]:
        published = parse_pubdate(node.findtext('pubDate') or '')
        if published and published < SINCE:
            continue
        title = clean_text(node.findtext('title') or '')
        if not title:
            continue
        link = (node.findtext('link') or '').strip()
        summary = clean_text(node.findtext('description') or '')
        combined = title + ' ' + summary
        latin_count = len(re.findall(r'[A-Za-z]', combined))
        cjk_count = len(re.findall(r'[\u4e00-\u9fff]', combined))
        if latin_count > cjk_count:
            language = 'en'
        else:
            language = 'zh'
        entries.append({
            'source_group': name,
            'source_label': name,
            'title_raw': title,
            'summary_raw': summary,
            'url': link,
            'published_at': published.isoformat() if published else None,
            'origin_type': 'rss',
            'section_hint': '',
            'language_hint': language,
        })
    return entries
def fetch_juya_rss(today: str):
    """Look up today's issue in the Juya RSS feed.

    Returns ``(target_url, pub_date, html_content)`` for the first item whose
    title equals ``today``; ``html_content`` comes from ``content:encoded``
    when that element is present and non-empty, otherwise it is ``None``.
    When no item matches, all three values are ``None``.

    Uses a 45 second timeout, longer than ``fetch_text``.
    """
    request = urllib.request.Request(JUYA_RSS, headers={'User-Agent': UA})
    with urllib.request.urlopen(request, timeout=45) as response:
        feed_xml = response.read().decode('utf-8', 'ignore')
    channel = ET.fromstring(feed_xml).find('channel')
    if channel is None:
        return None, None, None
    content_ns = {'content': 'http://purl.org/rss/1.0/modules/content/'}
    for node in channel.findall('item'):
        if (node.findtext('title') or '').strip() != today:
            continue
        link = (node.findtext('link') or '').strip()
        published = parse_pubdate(node.findtext('pubDate') or '')
        # Prefer the embedded article body so the caller can skip a second HTTP request.
        encoded = node.find('content:encoded', content_ns)
        body = None
        if encoded is not None and encoded.text:
            body = encoded.text
        return link, published, body
    return None, None, None
def parse_juya(today: str):
    """Parse today's Juya AI daily issue into raw-item dicts.

    The article HTML is taken from the RSS ``content:encoded`` field when
    available; otherwise the linked page is downloaded and its ``<article>``
    body is used. Each ``<h2>... <code>#N</code></h2>`` heading starts one
    news block. Returns ``[]`` when the issue is missing, the page cannot be
    fetched, or no ``<article>`` element is found.

    Fix: the heading regex used ``</a>?``, which makes only the ``>``
    optional, so a literal ``</a`` was always required and headings without a
    link could never match. It is now ``(?:</a>)?``.
    """
    juya_marker = 'imjuya.github.io/juya-ai-daily'
    target, pub, html_content = fetch_juya_rss(today)
    if not target:
        return []
    # Try RSS content:encoded first; fall back to fetching the article page.
    if html_content is None:
        try:
            req = urllib.request.Request(target, headers={'User-Agent': UA})
            with urllib.request.urlopen(req, timeout=45) as r:
                page_html = r.read().decode('utf-8', 'ignore')
        except Exception:
            # Best effort: a missing supplementary source must not abort the run.
            return []
        m = re.search(r'<article[^>]*>(.*?)</article>', page_html, re.S | re.I)
        if not m:
            return []
        article_html = m.group(1)
    else:
        article_html = html_content
    block_pattern = re.compile(
        r'<h2[^>]*>\s*(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?(?P<title_html>[^<]*?)(?:</a>)?\s*<code>#(?P<num>\d+)</code>\s*</h2>(?P<body>.*?)(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)',
        re.S | re.I,
    )
    results = []
    for m in block_pattern.finditer(article_html):
        title_html = m.group('title_html') or ''
        title = clean_text(re.sub(r'<[^>]+>', ' ', title_html))
        if not title:
            continue
        title_url = (m.group('title_url') or '').strip()
        body_html = m.group('body') or ''
        # Collect external links from the body, in order, without duplicates.
        clean_links = []
        for link in re.findall(r'<a[^>]*href="([^"]+)"[^>]*>', body_html, re.I):
            link = link.replace('&amp;', '&').strip()
            if not link or juya_marker in link:
                continue
            if link not in clean_links:
                clean_links.append(link)
        # URL preference: first external body link, then an external title link,
        # finally the issue page itself.
        if clean_links:
            url = clean_links[0]
        elif title_url and juya_marker not in title_url:
            url = title_url
        else:
            url = target
        # Turn block-level markup into newlines, drop links/images, strip other tags.
        body_text = body_html
        body_text = re.sub(r'<blockquote[^>]*>|</blockquote>', '\n', body_text, flags=re.I)
        body_text = re.sub(r'</p>|</li>|</ul>|<br\s*/?>', '\n', body_text, flags=re.I)
        body_text = re.sub(r'<li[^>]*>', '', body_text, flags=re.I)
        body_text = re.sub(r'<a [^>]+>.*?</a>', ' ', body_text, flags=re.S | re.I)
        body_text = re.sub(r'<img[^>]*>', ' ', body_text, flags=re.I)
        body_text = re.sub(r'<[^>]+>', ' ', body_text)
        lines = [cleaned for cleaned in (clean_text(x) for x in body_text.split('\n')) if cleaned]
        summary_lines = []
        for line in lines:
            # Everything from the "related links" footer onwards is not summary text.
            if line.startswith('相关链接'):
                break
            if line == title:
                continue
            summary_lines.append(line)
        summary = ' '.join(summary_lines[:4]).strip()
        if url and juya_marker not in url:
            source_label = source_name_from_url(url, '橘鸦AI早报')
        else:
            source_label = '橘鸦AI早报'
        results.append({
            'source_group': '橘鸦AI早报',
            'source_label': source_label,
            'title_raw': title,
            'summary_raw': summary,
            'url': url,
            'published_at': pub.isoformat() if pub else None,
            'origin_type': 'juya_issue',
            'section_hint': '',
            'language_hint': 'zh',
        })
    return results
# ─── LLM infrastructure (unchanged) ─────────────────────────────────────────
def load_env():
    """Return settings from ``~/.hermes/.env`` overlaid with the process environment.

    Lines of the file that contain ``=`` and do not start with ``#`` are split
    on the first ``=``; key and value are stripped of surrounding whitespace.
    Non-empty variables from ``os.environ`` take precedence over file values.
    """
    merged = {}
    dotenv_path = Path.home() / '.hermes' / '.env'
    if dotenv_path.exists():
        for raw_line in dotenv_path.read_text(errors='ignore').splitlines():
            if '=' not in raw_line or raw_line.strip().startswith('#'):
                continue
            key, _, value = raw_line.partition('=')
            merged[key.strip()] = value.strip()
    # Process environment wins, but empty variables never mask a file value.
    for key, value in os.environ.items():
        if value:
            merged[key] = value
    return merged
def resolve_llm_config(env: dict):
    """Resolve the LLM endpoint settings and return ``(api_key, base_url, model_name)``.

    Sources are consulted in this order; a later source only fills values
    that are still empty:
    1) Explicit environment overrides for this pipeline (SUB2API / LLM_* / XIAOMI_* / XIAOMI_MIMO_*)
    2) Hermes model config (``~/.hermes/config.yaml``), including the active
       provider's entry under ``providers``
    3) ``~/.hermes/auth.json`` credential pool
    4) Legacy env fallbacks, then the credential pool of the fallback provider

    Args:
        env: Mapping of setting names to values, e.g. the result of ``load_env``.

    Raises:
        RuntimeError: if no API key or no base URL could be determined.

    NOTE(review): requires the third-party ``yaml`` package (imported locally).
    """
    import yaml
    hermes_dir = Path.home() / '.hermes'

    def first_env(*names: str) -> str:
        """Return the first non-blank value among ``names`` in ``env``, else ''."""
        for name in names:
            val = (env.get(name) or '').strip()
            if val:
                return val
        return ''

    # Allow this script to be pinned to the current Hermes model config.
    cfg_path = hermes_dir / 'config.yaml'
    cfg = {}
    if cfg_path.exists():
        with open(cfg_path) as f:
            cfg = yaml.safe_load(f) or {}
    model_cfg = cfg.get('model', {}) or {}
    provider = (model_cfg.get('provider') or '').strip()
    base_url = (model_cfg.get('base_url') or '').rstrip('/')
    model_name = (model_cfg.get('default') or '').strip()
    # 1) Explicit overrides for this pipeline take precedence, but keep endpoint/key/model
    # from the same provider family. Mixing SUB2API_API_KEY with XIAOMI_BASE_URL causes
    # 401 after switching Hermes to a Sub2API model.
    explicit_api_key = first_env('LLM_API_KEY')
    explicit_base_url = first_env('LLM_BASE_URL')
    explicit_model = first_env('LLM_MODEL')
    # Without LLM_API_KEY, the LLM_BASE_URL / LLM_MODEL values read above are
    # replaced by the chosen provider family's values (when a family matches).
    if not explicit_api_key:
        if provider == 'sub2api' or first_env('SUB2API_API_KEY', 'SUB2API_BASE_URL', 'SUB2API_MODEL'):
            explicit_api_key = first_env('SUB2API_API_KEY')
            explicit_base_url = first_env('SUB2API_BASE_URL') or base_url
            explicit_model = first_env('SUB2API_MODEL') or model_name
        elif first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL', 'XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL'):
            explicit_api_key = first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY')
            explicit_base_url = first_env('XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL')
            explicit_model = first_env('XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL')
    if explicit_base_url:
        base_url = explicit_base_url.rstrip('/')
    if explicit_model:
        model_name = explicit_model
    # 2) Provider definition from config.yaml fills a missing base URL / key.
    provider_def = (cfg.get('providers', {}) or {}).get(provider, {}) or {}
    if not base_url and provider_def.get('base_url'):
        base_url = str(provider_def.get('base_url')).rstrip('/')
    if not explicit_api_key and provider_def.get('key_env'):
        explicit_api_key = first_env(str(provider_def.get('key_env')))
    # Fast fallback chain: if the active provider has no credentials, use a known-good
    # provider/model from auth.json so the daily cron keeps publishing.
    fallback_provider = first_env('LLM_FALLBACK_PROVIDER', 'XIAOMI_FALLBACK_PROVIDER') or 'openrouter'
    api_key = explicit_api_key
    auth_path = hermes_dir / 'auth.json'
    # 3) auth.json credential pool: the first pool entry with credentials is used.
    if not api_key and auth_path.exists():
        with open(auth_path) as f:
            auth = json.load(f)
        pool = auth.get('credential_pool', {}) or {}
        provider_keys = []
        if provider:
            provider_keys.extend([provider, provider.replace('-', '_')])
        # Known aliases for this environment.
        provider_keys.extend(['sub2api', 'xiaomi', 'xiaomi_mimo', 'sensenova'])
        for pkey in provider_keys:
            creds = pool.get(pkey, [])
            if creds:
                cred = creds[0]
                source = cred.get('source', '')
                # A source of the form "env:NAME" points at an environment variable.
                if source.startswith('env:'):
                    env_var = source[4:]
                    api_key = env.get(env_var, '') or api_key
                if not api_key:
                    api_key = cred.get('access_token', '') or api_key
                if not base_url:
                    base_url = (cred.get('base_url') or '').rstrip('/')
                if not model_name:
                    model_name = cred.get('model', '') or model_name
                # The search stops at the first non-empty pool entry, even if it
                # yielded no key.
                break
    # 4) Legacy env fallbacks.
    if not api_key:
        api_key = first_env('LLM_API_KEY', 'XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'OPENROUTER_API_KEY')
    if not base_url:
        base_url = first_env('LLM_BASE_URL', 'XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL', 'OPENROUTER_BASE_URL').rstrip('/')
    if not model_name:
        model_name = first_env('LLM_MODEL') or 'mimo-v2.5-pro'
    # Last resort: credentials of the fallback provider from auth.json.
    if not api_key and fallback_provider and auth_path.exists():
        with open(auth_path) as f:
            auth = json.load(f)
        pool = auth.get('credential_pool', {}) or {}
        for pkey in [fallback_provider, fallback_provider.replace('-', '_')]:
            creds = pool.get(pkey, [])
            if creds:
                cred = creds[0]
                source = cred.get('source', '')
                if source.startswith('env:'):
                    env_var = source[4:]
                    api_key = env.get(env_var, '') or api_key
                if not api_key:
                    api_key = cred.get('access_token', '') or api_key
                if not base_url:
                    base_url = (cred.get('base_url') or '').rstrip('/')
                # model_name is always set by this point (default assigned above).
                if not model_name:
                    model_name = cred.get('model', '') or model_name
                provider = fallback_provider
                break
    if not api_key:
        raise RuntimeError(
            f'No API key found for provider "{provider}" or fallback "{fallback_provider}". '
            'Set SUB2API_API_KEY / XIAOMI_API_KEY / LLM_API_KEY or fix ~/.hermes/auth.json'
        )
    if not base_url:
        raise RuntimeError(
            f'No base_url found for provider "{provider}" or fallback "{fallback_provider}". '
            'Set SUB2API_BASE_URL / XIAOMI_BASE_URL / LLM_BASE_URL or fix ~/.hermes/auth.json'
        )
    return api_key, base_url, model_name
def _try_llm_request(base_url: str, api_key: str, model: str, prompt_text: str, auth_mode: str, api_key_header: str = 'Authorization'):
    """Send one chat-completion request and return the reply text, stripped.

    The key is sent in ``api_key_header``. For the ``Authorization`` header it
    is prefixed with ``Bearer `` when ``auth_mode`` is ``'bearer'``; in every
    other case the raw key is sent. HTTP and decoding errors propagate.
    """
    body = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt_text}],
        'temperature': 0.2,
        'max_tokens': 8000,
    }
    encoded_body = json.dumps(body, ensure_ascii=False).encode('utf-8')
    use_bearer = api_key_header == 'Authorization' and auth_mode == 'bearer'
    credential = f'Bearer {api_key}' if use_bearer else api_key
    headers = {'Content-Type': 'application/json'}
    headers[api_key_header] = credential
    request = urllib.request.Request(f'{base_url}/chat/completions', data=encoded_body, headers=headers)
    with urllib.request.urlopen(request, timeout=600) as response:
        reply = json.loads(response.read().decode('utf-8'))
    return reply['choices'][0]['message']['content'].strip()
def llm_call(prompt_text: str, env: dict) -> str:
    """Send ``prompt_text`` as a single user message and return the model's reply.

    Endpoint, key and model come from ``resolve_llm_config(env)``. The request
    goes to ``{base_url}/chat/completions`` with Bearer authentication. The
    target is logged to stderr before the call; on an HTTP error the status
    and up to 500 characters of the response body are logged, then the error
    is re-raised.
    """
    api_key, base_url, model = resolve_llm_config(env)
    # Use a single, explicit path so cron behavior is easy to debug.
    # The earlier auth-matrix/fallback logic was making failures harder to reason about.
    request_body = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt_text}],
        'temperature': 0.2,
        'max_tokens': 8000,
    }
    request = urllib.request.Request(
        f'{base_url}/chat/completions',
        data=json.dumps(request_body, ensure_ascii=False).encode('utf-8'),
        headers={'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'},
    )
    print(f'llm_call request: base_url={base_url}; model={model}', file=sys.stderr)
    try:
        with urllib.request.urlopen(request, timeout=600) as response:
            reply = json.loads(response.read().decode('utf-8'))
        return reply['choices'][0]['message']['content'].strip()
    except urllib.error.HTTPError as http_err:
        # Reading the error body is best effort; it may already be closed.
        try:
            body = http_err.read().decode('utf-8', 'ignore')
        except Exception:
            body = ''
        print(f'llm_call failed: HTTP {http_err.code} {http_err.reason}; base_url={base_url}; model={model}; body={body[:500]}', file=sys.stderr)
        raise
def _parse_json_from_llm(text: str):
"""Strip markdown code blocks and extract a JSON object from LLM output."""
text = re.sub(r'^```(?:json)?\s*\n?', '', text)
text = re.sub(r'\n?```\s*$', '', text)
text = text.strip()
m = re.search(r'\{.*\}\s*$', text, re.S)
if not m:
raise ValueError('LLM 输出中未找到 JSON 对象')
raw_json = m.group(0)
raw_json = re.sub(r',\s*([}\]])', r'\1', raw_json)
return json.loads(raw_json)
def _normalize_title(title: str) -> str:
"""Normalize a title for dedup comparison: strip non-alphanumeric, lowercase."""
return re.sub(r'[^\w\u4e00-\u9fff]+', '', (title or '').lower())
# ─── Stage 0: Script dedup (no LLM) ────────────────────────────────────────
def stage0_script_dedup(raw_items: list) -> list:
    """Drop near-duplicate items by comparing normalized titles.

    Two items are the same event when ``difflib.SequenceMatcher`` rates their
    normalized titles above 0.7; the one with the longer ``summary_raw`` is
    kept, in the position of the first occurrence. Items whose normalized
    title is shorter than 3 characters are discarded.
    """
    if not raw_items:
        return []
    kept = []  # (normalized_title, item) pairs, in first-seen order
    for candidate in raw_items:
        norm = _normalize_title(candidate.get('title_raw', ''))
        if len(norm) < 3:
            continue
        for pos, (kept_norm, kept_item) in enumerate(kept):
            if difflib.SequenceMatcher(None, norm, kept_norm).ratio() > 0.7:
                # Same event: keep whichever carries the longer summary.
                if len(candidate.get('summary_raw', '')) > len(kept_item.get('summary_raw', '')):
                    kept[pos] = (norm, candidate)
                break
        else:
            kept.append((norm, candidate))
    return [item for _, item in kept]
# ─── Stage 1: LLM semantic dedup ───────────────────────────────────────────
def stage1_llm_dedup(items: list, env: dict):
    """Ask the LLM which items describe the same event and drop the duplicates.

    Returns ``(filtered_items, error)``. On any failure (request error,
    unparsable reply, no usable indices) the input list is returned unchanged
    together with an error string, which is also printed.
    """
    if not items:
        return items, None
    indexed = [
        {
            'index': pos,
            'title': entry.get('title_raw', '')[:80],
            'summary': entry.get('summary_raw', '')[:120],
        }
        for pos, entry in enumerate(items)
    ]
    indexed_json = json.dumps(indexed, ensure_ascii=False)
    prompt = (
        '以下是AI领域的新闻条目。有些条目虽然措辞不同但描述的是同一个事件。'
        '请识别重复项,输出要保留的条目索引列表。只有描述完全相同的具体事件才视为重复。\n\n'
        f'{indexed_json}\n\n'
        '请严格按以下JSON格式输出不要包含任何其他内容\n'
        '{"keep_indices": [0, 1, 3, 5]}'
    )
    try:
        reply = _parse_json_from_llm(llm_call(prompt, env))
        indices = reply.get('keep_indices', [])
        if not isinstance(indices, list):
            raise ValueError('keep_indices is not a list')
        # Keep only in-range integers, de-duplicated and in original order.
        total = len(items)
        valid = sorted({idx for idx in indices if isinstance(idx, int) and 0 <= idx < total})
        if not valid:
            raise ValueError('No valid indices in keep_indices')
        return [items[idx] for idx in valid], None
    except Exception as exc:
        err = f'stage1_llm_dedup failed: {type(exc).__name__}: {exc}'
        print(err)
        # Fallback: return all items unchanged.
        return items, err
# ─── Stage 2a: LLM summary rewrite (parallel) ──────────────────────────────
def stage2a_rewrite_summaries(items: list, env: dict):
    """Have the LLM rewrite titles and summaries in concise Chinese.

    Returns ``(updated_items, error)``. Updated items are copies; the input
    dicts are not modified. Only non-empty ``title``/``summary`` values for
    in-range integer indices are applied. On any failure the original list is
    returned together with an error string, which is also printed.
    """
    if not items:
        return items, None
    indexed = [
        {
            'index': pos,
            'title': entry.get('title_raw', '')[:80],
            'summary': entry.get('summary_raw', '')[:200],
        }
        for pos, entry in enumerate(items)
    ]
    indexed_json = json.dumps(indexed, ensure_ascii=False)
    prompt = (
        '请将以下新闻条目的标题和摘要改写为简洁中文。'
        '标题:英文品牌名/模型名保留原样如GPT-5、Codex其余翻译为中文。'
        '摘要每条最多120字保留核心事实。\n\n'
        f'{indexed_json}\n\n'
        '请严格按以下JSON格式输出\n'
        '{"summaries": [{"index": 0, "title": "中文标题", "summary": "改写后的摘要"}, ...]}'
    )
    try:
        reply = _parse_json_from_llm(llm_call(prompt, env))
        summaries = reply.get('summaries', [])
        if not isinstance(summaries, list):
            raise ValueError('summaries is not a list')
        rewritten = [dict(entry) for entry in items]  # shallow copies, safe to mutate
        for update in summaries:
            idx = update.get('index')
            new_summary = update.get('summary', '')
            new_title = update.get('title', '')
            if not isinstance(idx, int) or not 0 <= idx < len(rewritten):
                continue
            if new_title:
                rewritten[idx]['title_raw'] = new_title
            if new_summary:
                rewritten[idx]['summary_raw'] = new_summary
        return rewritten, None
    except Exception as exc:
        err = f'stage2a_rewrite_summaries failed: {type(exc).__name__}: {exc}'
        print(err)
        # Fallback: return items unchanged.
        return items, err
# ─── Stage 2b: LLM classify (parallel) ──────────────────────────────────────
def stage2b_classify(items: list, env: dict):
    """Have the LLM assign each item to one of the ``SECTION_ORDER`` sections.

    Returns ``(updated_items, error)``. Updated items are copies whose
    ``section_hint`` is replaced only when the reply names a section that
    exists in ``SECTION_ORDER``. On any failure the original list is returned
    together with an error string, which is also printed.

    Fix: the section names were joined with an empty string, so the prompt
    showed them as one unbroken run of text; they are now separated by '、'.
    """
    if not items:
        return items, None
    indexed = []
    for i, item in enumerate(items):
        indexed.append({
            'index': i,
            'title': item.get('title_raw', '')[:80],
            'summary': item.get('summary_raw', '')[:120],
        })
    # A visible delimiter lets the model tell the section names apart.
    sections_str = '、'.join(SECTION_ORDER)
    prompt = (
        f'请将以下AI新闻条目分类到对应板块。\n'
        f'可选板块:{sections_str}\n\n'
        f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
        '请严格按以下JSON格式输出\n'
        '{"sections": [{"index": 0, "section": "模型发布/更新"}, ...]}'
    )
    try:
        raw = llm_call(prompt, env)
        obj = _parse_json_from_llm(raw)
        sections = obj.get('sections', [])
        if not isinstance(sections, list):
            raise ValueError('sections is not a list')
        result = [dict(item) for item in items]  # shallow copy
        for entry in sections:
            idx = entry.get('index')
            sec = entry.get('section', '')
            if isinstance(idx, int) and 0 <= idx < len(result) and sec:
                # Section names the pipeline does not know are ignored.
                if sec in SECTION_ORDER:
                    result[idx] = dict(result[idx], section_hint=sec)
        return result, None
    except Exception as e:
        err = f'stage2b_classify failed: {type(e).__name__}: {e}'
        print(err)
        return items, err  # Fallback: return items unchanged
# ─── Stage 2 parallel execution ─────────────────────────────────────────────
def stage2_parallel(items: list, env: dict):
    """Run the summary rewrite and the classification concurrently and merge them.

    Returns ``(merged_items, errors_list)``. Each merged item is a copy of the
    rewritten item with ``section_hint`` overridden by the classifier's value
    when that value is non-empty. A stage that fails contributes an error
    string and the unmodified input for its part.
    """
    errors = []

    def collect(future, label):
        """Return the stage's item list, recording any error under ``label``."""
        try:
            stage_items, stage_err = future.result()
        except Exception as exc:
            errors.append(f'{label} exception: {type(exc).__name__}: {exc}')
            return items
        if stage_err:
            errors.append(stage_err)
        return stage_items

    with ThreadPoolExecutor(max_workers=2) as executor:
        future_summaries = executor.submit(stage2a_rewrite_summaries, items, env)
        future_classify = executor.submit(stage2b_classify, items, env)
        summaries_result = collect(future_summaries, 'stage2a')
        classify_result = collect(future_classify, 'stage2b')

    # Merge: text fields from stage 2a, section from stage 2b.
    merged = []
    for pos, original in enumerate(items):
        base = summaries_result[pos] if pos < len(summaries_result) else original
        combined = dict(base)
        if pos < len(classify_result):
            section = classify_result[pos].get('section_hint')
            if section:
                combined['section_hint'] = section
        merged.append(combined)
    return merged, errors
# ─── Stage 3: LLM guide/observation ────────────────────────────────────────
def llm_generate_guide(items, today: str, env: dict) -> str:
    """Generate the editorial "observation" text: main theme, signals and risks.

    Args:
        items: Final items; each must provide ``title``, ``summary`` and
            ``section`` (``source`` is optional).
        today: Date string placed in the prompt.
        env: Settings passed through to ``llm_call``.

    Returns:
        The model's text with a surrounding code fence and quotes removed, or
        '' when the LLM call fails. The guide is optional, so a failure never
        aborts the run.

    Fix: a failed call used to be swallowed silently; the reason is now
    written to stderr so a missing guide can be diagnosed.
    """
    indexed = []
    for i, item in enumerate(items, 1):
        indexed.append({
            'n': i,
            'title': item['title'],
            'summary': item['summary'][:100],
            'section': item['section'],
            'source': item.get('source', ''),
        })
    prompt = {
        'date': today,
        'task': (
            '你是AI行业编辑。根据以下已经分类和摘要改写好的条目写「今日观察」。\n\n'
            '格式要求:\n'
            '【主线】blockquote格式一句话概括今天最值得关注的趋势不要套话要具体\n'
            '【强信号】2-3条每条格式编号. 标题(一句话)+ 一两句说明为什么重要\n'
            '【中信号】1-2条格式同上\n'
            '【待验证】1-2条格式同上说明为什么存疑\n\n'
            '写作要求:\n'
            '- 不要空泛总结(如"行业焦点转向XX"),要指向具体事件\n'
            '- 不要引用编号如[1][3],读者看不到对应关系\n'
            '- 不要建议("开发者应该..."之类删掉)\n'
            '- 每条控制在2-3句话以内\n'
            '- 用大白话,不要学术腔\n'
        ),
        'items': indexed,
        'rule': '只输出观察文本不要代码块、不要JSON。严格使用【主线】【强信号】【中信号】【待验证】四个标记。'
    }
    query = json.dumps(prompt, ensure_ascii=False)
    try:
        text = llm_call(query, env)
        text = re.sub(r'^```(?:\w+)?\s*\n?', '', text)
        text = re.sub(r'\n?```\s*$', '', text)
        text = text.strip().strip('"').strip("'")
        return text
    except Exception as e:
        # Best effort: report the failure, then continue without a guide.
        print(f'llm_generate_guide failed: {type(e).__name__}: {e}', file=sys.stderr)
        return ''
# ─── Rendering helpers (unchanged) ──────────────────────────────────────────
def _parse_guide_sections(guide: str):
"""Parse guide text into structured sections by 【markers】."""
sections = {}
parts = re.split(r'【(主线|强信号|中信号|待验证|建议)】', guide)
i = 1
while i < len(parts) - 1:
key = parts[i].strip()
content = parts[i + 1].strip()
sections[key] = content
i += 2
return sections
def _make_ref_factory(items):
"""Create a [N] → link converter bound to the items list."""
def make_ref(m):
idx = int(m.group(1))
if 1 <= idx <= len(items):
item = items[idx - 1]
url = item.get('url', '')
if url:
return f'<sup><a href="{url}">[{idx}]</a></sup>'
return f'<sup>[{idx}]</sup>'
return m.group(0)
return make_ref
def _render_guide_section(lines, title, text, items, is_quote=False):
    """Append one guide section to ``lines`` (mutated in place).

    Emits the bold title, a blank line, each non-empty line of ``text`` and a
    closing blank line. ``[N]`` references are rendered through
    ``_make_ref_factory`` and literal ``[N]`` placeholders are removed. With
    ``is_quote`` every content line is prefixed with ``> ``.
    """
    make_ref = _make_ref_factory(items)
    lines.extend([f'**{title}**', ''])
    prefix = '> ' if is_quote else ''
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue
        with_refs = re.sub(r'\[(\d+)\]', make_ref, stripped)
        rendered = re.sub(r'\[N\]', '', with_refs).strip()
        if rendered:
            lines.append(f'{prefix}{rendered}')
    lines.append('')
def format_source_link(item):
    """Return the item's source as a Markdown link, or as plain text without a URL.

    A missing or empty ``source`` falls back to '来源'.
    """
    label = item.get('source') or '来源'
    link = item.get('url') or ''
    return f'[{label} ↗]({link})' if link else label
def blog_markdown(items, guide=None):
    """Render the final items and the optional guide as the blog post Markdown.

    Layout: a "导览" block with the theme entries, one "## <section>" block
    per non-empty section in ``SECTION_ORDER`` order with items numbered
    consecutively across sections, and a closing "总结" block grouping guide
    entries by strong / medium / risk. Items whose ``section`` is not in
    ``SECTION_ORDER`` are not rendered.

    Args:
        items: Dicts with ``title``, ``summary`` and ``section`` (plus the
            ``source``/``url`` keys read by ``format_source_link``).
        guide: List of ``{'type': ..., 'text': ...}`` dicts; any other value
            is treated as "no guide".

    Fixes:
      * Truncated summaries now end with an ellipsis and summaries without
        closing punctuation get a full stop; both previously appended an
        empty string, which did nothing.
      * Title/colon and sentence detection also accept full-width ':', '!'
        and '?', which are the usual forms in Chinese text.
      * Removed an unused reference-factory local.
    """
    grouped = {k: [] for k in SECTION_ORDER}
    for item in items:
        grouped.setdefault(item['section'], []).append(item)
    n = 1
    lines = []
    guide_items = guide if isinstance(guide, list) else []

    def clean_guide_text(text):
        """Strip reference markers, a leading "主线判断" label and extra whitespace."""
        text = re.sub(r'\[\d+\]', '', text)
        text = re.sub(r'\[N\]', '', text).strip()
        text = re.sub(r'^主线判断[::]\s*', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    # === Top: guide overview (theme only) ===
    theme_items = [g for g in guide_items if g.get('type') == 'theme']
    if theme_items:
        lines.append('## 导览')
        lines.append('')
        for g in theme_items:
            text = clean_guide_text(g.get('text', ''))
            if text:
                for para in text.split('\n'):
                    para = para.strip()
                    if para:
                        lines.append(f'> {para}')
                lines.append('')
    # === News sections ===
    for sec in SECTION_ORDER:
        sec_items = grouped.get(sec, [])
        if not sec_items:
            continue
        lines.append(f'## {sec}')
        lines.append('')
        for item in sec_items:
            summary = item['summary'].strip()
            if len(summary) > 120:
                # Mark the cut so a truncated summary does not read as complete.
                summary = summary[:120].rstrip() + '…'
            source_link = format_source_link(item)
            if summary and summary[-1] not in '。!?!?….':
                summary += '。'
            lines.append(f'**{n}. {item["title"]}**')
            lines.append('')
            lines.append(f'> {summary}{source_link}')
            lines.append('')
            n += 1
    # === Bottom: summary (strong/medium/risk) ===
    type_labels = {'strong': '强信号', 'medium': '中信号', 'risk': '待验证'}
    summary_types = ['strong', 'medium', 'risk']
    summary_items = [g for g in guide_items if g.get('type') in summary_types]
    if summary_items:
        lines.append('## 总结')
        lines.append('')
        for t in summary_types:
            type_items = [g for g in summary_items if g.get('type') == t]
            if not type_items:
                continue
            label = type_labels.get(t, t)
            lines.append(f'**{label}**')
            lines.append('')
            for g in type_items:
                text = clean_guide_text(g.get('text', ''))
                if not text:
                    continue
                # Prefer "title: content"; otherwise the first sentence is the title.
                title_match = re.search(r'^(.+?)[::]\s*', text)
                if title_match and len(title_match.group(1)) < 60:
                    title = title_match.group(1).strip()
                    content = text[title_match.end():].strip()
                else:
                    sentences = re.split(r'[。!?!?]', text)
                    title = sentences[0].strip() if sentences else text[:40]
                    content = text[len(sentences[0]):].strip()
                    # Drop the punctuation mark that ended the title sentence.
                    if content and content[0] in '。!?!?':
                        content = content[1:].strip()
                lines.append(f'- **{title}**')
                if content:
                    lines.append(f' {content}')
                lines.append('')
    return '\n'.join(lines).strip()
def short_summary(blog_url):
    """Return the one-line chat announcement that links to the published post."""
    return 'AI日报已发布 👉 {}'.format(blog_url)
def blog_api_request(method, path, payload=None, token=None, base_url=None):
    """Call the blog service API and return the decoded JSON response.

    Args:
        method: HTTP method, e.g. ``'POST'``.
        path: Path appended to ``base_url`` (a trailing slash on ``base_url``
            is removed first).
        payload: Optional object sent as a UTF-8 JSON body.
        token: Service token sent as a Bearer ``Authorization`` header.
        base_url: Root URL of the blog service; required.

    Raises:
        ValueError: if ``base_url`` is missing or empty. Previously the
            default ``None`` failed with an ``AttributeError``.
        urllib.error.URLError / HTTPError: on transport or HTTP failure.

    The ``Authorization`` header is omitted when no token is given, instead
    of sending the literal text "Bearer None".
    """
    if not base_url:
        raise ValueError('blog_api_request requires a non-empty base_url')
    url = base_url.rstrip('/') + path
    data = None
    headers = {'User-Agent': UA}
    if token:
        headers['Authorization'] = f'Bearer {token}'
    if payload is not None:
        data = json.dumps(payload, ensure_ascii=False).encode('utf-8')
        headers['Content-Type'] = 'application/json'
    req = urllib.request.Request(url, data=data, headers=headers, method=method)
    with urllib.request.urlopen(req, timeout=25) as r:
        return json.loads(r.read().decode('utf-8'))
# ─── Main pipeline ──────────────────────────────────────────────────────────
def main():
    """Run the daily pipeline: collect, dedupe, rewrite/classify, summarize, publish.

    Steps:
      * Collect raw items from AI HOT (required), the RSS feeds and the Juya
        daily (both optional; failures are recorded in ``errors``).
      * Stage 0-2: script dedup, LLM dedup, parallel rewrite + classification.
      * Stage 3: generate the editorial guide.
      * Stage 4: render Markdown, create and publish the post (skipped when
        ``AI_DAILY_DRY_RUN`` is 1/true/yes) and write the artifacts to
        ``OUT_DIR``.

    Exits with status 1 when the blog token is missing or the draft cannot be
    created; returns early when today's AI HOT report is not available (404).

    Fix: collected error entries were joined with an empty string and ran
    together in the chat summary; they are now separated by ';'.
    """
    env = load_env()
    token = env.get('BLOG_SERVICE_TOKEN') or env.get('EPHRON_SERVICE_TOKEN')
    base_url = env.get('BLOG_API_BASE_URL', 'https://blog.ephron.ren')
    if not token:
        print('缺少 blog service token已停止。')
        sys.exit(1)
    errors = []
    source_counts = {}
    raw_items = []
    # ── Collect raw items ────────────────────────────────────────────────────
    try:
        aihot_items, raw_daily = parse_aihot(TODAY)
        raw_items.extend(aihot_items)
        source_counts['AI HOT'] = len(aihot_items)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            print(f'今天({TODAY})的 AI HOT 完整日报还没有生成,暂不发布。')
            return
        raise
    for name, url in RSS_FEEDS.items():
        try:
            parsed = parse_rss(name, url)
            raw_items.extend(parsed)
            source_counts[name] = len(parsed)
        except Exception as e:
            errors.append(f'{name}: {type(e).__name__}')
            source_counts[name] = 0
    juya_items = []
    try:
        juya_items = parse_juya(TODAY)
    except Exception as e:
        errors.append(f'橘鸦AI早报: {type(e).__name__}')
    # If juya returned nothing, wait 2 minutes and retry once
    if not juya_items:
        print('橘鸦AI早报尚未就绪等待 2 分钟后重试...')
        time.sleep(120)
        try:
            juya_items = parse_juya(TODAY)
        except Exception as e:
            errors.append(f'橘鸦AI早报(重试): {type(e).__name__}')
    raw_items.extend(juya_items)
    source_counts['橘鸦AI早报'] = len(juya_items)
    raw_path = OUT_DIR / 'raw_items.json'
    raw_path.write_text(json.dumps(raw_items, ensure_ascii=False, indent=2), encoding='utf-8')
    # ── Stage 0: Script dedup ────────────────────────────────────────────────
    print(f'Stage 0: Script dedup — {len(raw_items)} raw items')
    items = stage0_script_dedup(raw_items)
    stage0_count = len(items)
    print(f'Stage 0 done — {stage0_count} unique items')
    # ── Stage 1: LLM semantic dedup ─────────────────────────────────────────
    print('Stage 1: LLM semantic dedup')
    items, stage1_err = stage1_llm_dedup(items, env)
    if stage1_err:
        errors.append(stage1_err)
    print(f'Stage 1 done — {len(items)} items')
    # ── Stage 2: Parallel summary rewrite + classify ────────────────────────
    print('Stage 2: Parallel summary rewrite + classify')
    items, stage2_errs = stage2_parallel(items, env)
    errors.extend(stage2_errs)
    print(f'Stage 2 done — {len(items)} items')
    # ── Build final items with title/source fields ──────────────────────────
    # At this point items still have raw fields; convert to final format
    final_items = []
    seen_titles = set()
    for item in items:
        title = clean_text(item.get('title_raw', ''))
        summary = clean_text(item.get('summary_raw', ''))[:120]
        if not title:
            continue
        # Rewritten titles can collide, so exact duplicates are dropped again here.
        norm = _normalize_title(title)
        if norm in seen_titles:
            continue
        seen_titles.add(norm)
        section = item.get('section_hint', '') or '行业与公司'
        if section not in SECTION_ORDER:
            section = '行业与公司'
        final_items.append({
            'title': title,
            'summary': summary or '该条目暂无摘要。',
            'section': section,
            'url': item.get('url') or '',
            'source': item.get('source_label') or item.get('source_group') or '来源',
            'source_group': item.get('source_group') or '未知来源',
            'dedupe_keys': [norm],
        })
    # ── Stage 3: LLM guide/observation ──────────────────────────────────────
    print('Stage 3: LLM guide generation')
    guide_text = llm_generate_guide(final_items, TODAY, env)
    # Parse guide into structured format for blog_markdown
    guide_structured = []
    if guide_text:
        parsed = _parse_guide_sections(guide_text)
        type_map = {'主线': 'theme', '强信号': 'strong', '中信号': 'medium', '待验证': 'risk'}
        for key, text in parsed.items():
            # Markers missing from type_map are treated as theme text.
            guide_type = type_map.get(key, 'theme')
            if guide_type == 'theme':
                guide_structured.append({'type': 'theme', 'text': text})
            else:
                # Split into individual items by numbered lines
                lines = [l.strip() for l in text.split('\n') if l.strip()]
                for line in lines:
                    # Remove leading number like "1. "
                    line = re.sub(r'^\d+[\.\、]\s*', '', line)
                    if line:
                        guide_structured.append({'type': guide_type, 'text': line})
    # ── Stage 4: Assemble and publish ───────────────────────────────────────
    print('Stage 4: Assemble and publish')
    md = blog_markdown(final_items, guide_structured)
    title = f'AI日报 · {TODAY}'
    tags = ['AI日报', 'AI资讯', '人工智能']
    payload = {'title': title, 'content': md, 'tags': tags}
    dry_run = (env.get('AI_DAILY_DRY_RUN') or '').strip().lower() in ('1', 'true', 'yes')
    if dry_run:
        slug = f'dry-run-{TODAY}'
        blog_url = f'{base_url}/posts/{slug}'
        public_ok = True
        print('AI_DAILY_DRY_RUN=1已完成组装验证跳过博客创建/发布。')
    else:
        create_resp = blog_api_request('POST', '/api/service/posts', payload=payload, token=token, base_url=base_url)
        slug = create_resp.get('slug')
        if not slug:
            print('Blog 草稿创建失败:未返回 slug')
            sys.exit(1)
        blog_api_request('POST', f'/api/service/posts/{slug}/publish', token=token, base_url=base_url)
        blog_url = f'{base_url}/posts/{slug}'
        # Confirm the public page answers with HTTP 200.
        public_ok = False
        try:
            req = urllib.request.Request(blog_url, headers={'User-Agent': UA})
            with urllib.request.urlopen(req, timeout=20) as r:
                public_ok = getattr(r, 'status', None) == 200
        except Exception:
            public_ok = False
    msg = short_summary(blog_url)
    if errors:
        # Separate the entries so each failure stays readable.
        msg += '\n\n部分补充源本次采集失败或LLM阶段出错已自动降级' + ';'.join(errors)
    if not public_ok:
        msg += '\n\n警告blog 草稿/发布接口已返回成功,但公开链接暂未验证为 200请人工复核。'
    # Build digest for JSON output
    digest = {
        'items': final_items,
        'featured_titles': [i['title'] for i in final_items[:6]],
        'guide': guide_structured,
    }
    (OUT_DIR / 'llm_digest.json').write_text(json.dumps(digest, ensure_ascii=False, indent=2), encoding='utf-8')
    (OUT_DIR / 'blog_markdown.md').write_text(md, encoding='utf-8')
    (OUT_DIR / 'chat_summary.txt').write_text(msg, encoding='utf-8')
    (OUT_DIR / 'run_meta.json').write_text(json.dumps({
        'date': TODAY,
        'slug': slug,
        'blog_url': blog_url,
        'public_ok': public_ok,
        'errors': errors,
        'aihot_sections': [s.get('label') for s in raw_daily.get('sections', [])],
        'raw_item_count': len(raw_items),
        'stage0_count': stage0_count,
        'final_item_count': len(final_items),
        'has_juya': any(i.get('source_group') == '橘鸦AI早报' for i in raw_items),
        'source_counts': source_counts,
        'featured_titles': digest.get('featured_titles', []),
    }, ensure_ascii=False, indent=2), encoding='utf-8')
    print(msg)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()