1105 lines
44 KiB
Python
1105 lines
44 KiB
Python
#!/usr/bin/env python3
|
||
import difflib
|
||
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
import time
|
||
import urllib.request
|
||
import urllib.error
|
||
import xml.etree.ElementTree as ET
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from datetime import datetime, timedelta, timezone
|
||
from email.utils import parsedate_to_datetime
|
||
from pathlib import Path
|
||
from urllib.parse import urlparse
|
||
|
||
|
||
# Browser-like User-Agent sent with outbound HTTP requests.
UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'

# Fixed UTC+8 offset used for every timestamp in this script.
CST = timezone(timedelta(hours=8))
NOW = datetime.now(tz=CST)
TODAY = NOW.date().isoformat()
# Cut-off instant: 30 hours before NOW.
SINCE = NOW - timedelta(hours=30)

SCRIPT_DIR = Path.home() / '.hermes' / 'scripts'
OUT_DIR = SCRIPT_DIR / 'ai_morning_out'
# The output directory is created eagerly, at import time.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Feed display name -> RSS URL.
RSS_FEEDS = {
    'InfoQ AI': 'https://feed.infoq.com/ai-ml-data-eng/',
    'MIT科技评论AI': 'https://www.technologyreview.com/topic/artificial-intelligence/feed',
    '量子位': 'https://www.qbitai.com/feed',
}
JUYA_RSS = 'https://imjuya.github.io/juya-ai-daily/rss.xml'

# Section names, in the order they are rendered.
SECTION_ORDER = [
    '模型发布/更新',
    '产品与工具',
    '开发与工程',
    '行业与公司',
    '论文与研究',
    '人物与花絮',
    '观点与教程',
]
|
||
|
||
|
||
# ─── Data collection (unchanged) ────────────────────────────────────────────
|
||
|
||
def fetch_text(url: str) -> str:
    """Download *url* and return the response body as text.

    The request carries the module-wide ``UA`` User-Agent and times out after
    25 seconds. The body is decoded as UTF-8; undecodable bytes are dropped.
    Network and HTTP errors raised by ``urlopen`` propagate to the caller.
    """
    request = urllib.request.Request(url, headers={'User-Agent': UA})
    with urllib.request.urlopen(request, timeout=25) as response:
        body = response.read()
    return body.decode('utf-8', 'ignore')
|
||
|
||
|
||
def parse_pubdate(text: str):
    """Convert an RFC 2822 style date string to a datetime in ``CST``.

    Returns ``None`` when *text* is empty or cannot be parsed. A parsed value
    that carries no timezone is treated as UTC before conversion.
    """
    if not text:
        return None
    try:
        parsed = parsedate_to_datetime(text)
        if parsed.tzinfo is None:
            # Naive timestamps are assumed to be UTC.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(CST)
    except Exception:
        # Any parsing/conversion problem is reported as "no date".
        return None
|
||
|
||
|
||
def clean_text(s: str) -> str:
    """Reduce an HTML fragment to single-line plain text.

    Tags are replaced by spaces, HTML character references (``&amp;``,
    ``&nbsp;``, ``&#39;`` ...) are decoded, and every run of whitespace is
    collapsed to one space. ``None`` or empty input yields ``''``.
    """
    from html import unescape  # function-scope import keeps the block self-contained

    without_tags = re.sub(r'<[^>]+>', ' ', s or '')
    # Decode after stripping tags so an escaped "&lt;b&gt;" stays literal text
    # instead of being removed as markup.
    decoded = unescape(without_tags)
    # \s also matches the no-break space that &nbsp; decodes to.
    return re.sub(r'\s+', ' ', decoded).strip()
|
||
|
||
|
||
def source_name_from_url(url: str, fallback: str = '来源') -> str:
    """Map a URL to a display name for its publisher.

    The host (lower-cased, leading ``www.`` removed) is compared against a
    table of known domains; a host matches when it equals the domain or is a
    subdomain of it. Unknown hosts are returned as-is. *fallback* is returned
    for an empty URL or a URL without a host.
    """
    if not url:
        return fallback

    hostname = (urlparse(url).netloc or '').lower()
    if hostname.startswith('www.'):
        hostname = hostname[len('www.'):]

    known_sites = {
        'x.com': 'X',
        'twitter.com': 'X',
        'github.com': 'GitHub',
        'github.blog': 'GitHub Blog',
        'openrouter.ai': 'OpenRouter',
        'anthropic.com': 'Anthropic',
        'cursor.com': 'Cursor',
        'technologyreview.com': 'MIT科技评论AI',
        'the-decoder.com': 'The Decoder',
        'xiaohongshu.com': '小红书',
        'mp.weixin.qq.com': '微信文章',
        'qbitai.com': '量子位',
        'ithome.com': 'IT之家',
        'browse.sh': 'Browse.sh',
        'huggingface.co': 'Hugging Face',
        'openai.com': 'OpenAI',
        'claude.com': 'Claude',
        'theverge.com': 'The Verge',
        'infoq.com': 'InfoQ',
        'research.google': 'Google Research',
        'simonwillison.net': 'Simon Willison',
        'runwayml.com': 'Runway',
        'perplexity.ai': 'Perplexity',
        'venturebeat.com': 'VentureBeat',
        'arxiv.org': 'arXiv',
        'reuters.com': '路透社',
        'bloomberg.com': 'Bloomberg',
        'techcrunch.com': 'TechCrunch',
        'wired.com': 'Wired',
        'deepseek.com': 'DeepSeek',
        'baidu.com': '百度',
        'alibaba.com': '阿里',
    }
    # First table entry (in declaration order) whose domain covers the host.
    matches = (
        label
        for domain, label in known_sites.items()
        if hostname == domain or hostname.endswith('.' + domain)
    )
    return next(matches, None) or hostname or fallback
|
||
|
||
|
||
def x_username_from_url(url: str) -> str:
    """Return the account name from an X/Twitter URL, or ``''``.

    Example: ``https://x.com/OpenAIDevs/status/...`` gives ``OpenAIDevs``.
    URLs on other hosts, URLs with an empty path, and URLs whose first path
    segment is a reserved site route (``i``, ``search``, ``home`` ...) give
    an empty string.
    """
    if not url:
        return ''

    parsed = urlparse(url)
    hostname = (parsed.netloc or '').lower()
    if hostname.startswith('www.'):
        hostname = hostname[4:]
    if hostname != 'x.com' and hostname != 'twitter.com':
        return ''

    reserved = ('i', 'search', 'explore', 'settings', 'notifications', 'home', 'compose')
    segments = [segment for segment in parsed.path.split('/') if segment]
    if not segments or segments[0] in reserved:
        return ''
    return segments[0]
|
||
|
||
|
||
def smart_source_label(url: str, api_source_name: str = '') -> str:
    """Build a source label for an item from its URL and the API's source name.

    Resolution order:
    1. X/Twitter URL with an account name -> ``X:@<account>``.
    2. URL with a resolvable site name -> that name, suffixed with ``:Blog``
       for blog/research URLs or ``:官网动态`` for index/about/products/root
       pages.
    3. Otherwise *api_source_name*, or ``'AI HOT'`` when that is empty.
    """
    handle = x_username_from_url(url)
    if handle:
        return f'X:@{handle}'

    site = source_name_from_url(url, '')
    if site and site not in ('来源', ''):
        parsed = urlparse(url)
        host = (parsed.netloc or '').lower()
        path = (parsed.path or '').lower()
        if 'blog' in host or '/blog' in path or '/research' in path:
            return f'{site}:Blog'
        if '/index' in path or path.rstrip('/') in ('', '/about', '/products'):
            return f'{site}:官网动态'
        return site

    generic_labels = (
        'AI HOT', '社交媒体/博客', '科技媒体', '公司官网',
        '公司博客', '社区/博客', '个人博客', '技术媒体',
    )
    if api_source_name and api_source_name not in generic_labels:
        return api_source_name
    # NOTE(review): a generic label reaches this line and is returned unchanged
    # too, so the check above does not alter the result — confirm the intent.
    return api_source_name or 'AI HOT'
|
||
|
||
|
||
def parse_aihot(today: str):
    """Fetch the AI HOT daily digest for *today* and flatten it into items.

    Args:
        today: Date string interpolated into the API path.

    Returns:
        ``(items, data)`` where *data* is the decoded JSON payload and *items*
        holds one dict per entry under ``sections[].items`` (origin type
        ``aihot_json``, section hint taken from the section ``label``) followed
        by one per entry under ``flashes`` (origin type ``aihot_flash``,
        section hint ``快讯``). Every item uses the payload's ``generatedAt``
        value as ``published_at``.

    Raises:
        Whatever ``fetch_text`` or ``json.loads`` raise; nothing is caught here.
    """
    data = json.loads(fetch_text(f'https://aihot.virxact.com/api/public/daily/{today}'))
    generated = data.get('generatedAt')

    def build(entry, origin_type, section_hint):
        # Shared shape for section items and flashes.
        link = (entry.get('sourceUrl') or '').strip()
        api_src = clean_text(entry.get('sourceName', '')) or ''
        return {
            'source_group': 'AI HOT',
            'source_label': smart_source_label(link, api_src),
            'title_raw': clean_text(entry.get('title', '')),
            'summary_raw': clean_text(entry.get('summary', '')),
            'url': link,
            'published_at': generated,
            'origin_type': origin_type,
            'section_hint': section_hint,
            'language_hint': 'zh',
        }

    items = []
    # "or []" tolerates explicit nulls, matching how flashes are handled.
    for sec in data.get('sections') or []:
        hint = sec.get('label') or ''
        for entry in sec.get('items') or []:
            items.append(build(entry, 'aihot_json', hint))
    for flash in data.get('flashes') or []:
        items.append(build(flash, 'aihot_flash', '快讯'))
    return items, data
|
||
|
||
|
||
def parse_rss(name: str, url: str):
    """Fetch an RSS feed and convert its recent entries into item dicts.

    Only ``<item>`` elements directly under ``<channel>`` are read, and only
    the first 20 of them. Entries whose ``pubDate`` parses to a time before
    ``SINCE`` are skipped, as are entries with an empty title; entries with no
    parseable date are kept. ``language_hint`` is ``'en'`` when the title plus
    summary contain more ASCII letters than CJK characters, else ``'zh'``.
    """
    root = ET.fromstring(fetch_text(url))
    channel = root.find('channel')
    entries = [] if channel is None else channel.findall('item')

    collected = []
    for entry in entries[:20]:
        published = parse_pubdate(entry.findtext('pubDate') or '')
        if published and published < SINCE:
            continue

        link = (entry.findtext('link') or '').strip()
        title = clean_text(entry.findtext('title') or '')
        summary = clean_text(entry.findtext('description') or '')
        if not title:
            continue

        sample = title + ' ' + summary
        latin_count = len(re.findall(r'[A-Za-z]', sample))
        cjk_count = len(re.findall(r'[\u4e00-\u9fff]', sample))
        collected.append({
            'source_group': name,
            'source_label': name,
            'title_raw': title,
            'summary_raw': summary,
            'url': link,
            'published_at': published.isoformat() if published else None,
            'origin_type': 'rss',
            'section_hint': '',
            'language_hint': 'en' if latin_count > cjk_count else 'zh',
        })
    return collected
|
||
|
||
|
||
def fetch_juya_rss(today: str):
    """Look up the Juya AI daily issue whose RSS title equals *today*.

    Returns ``(target_url, pub_date, html_content)``. ``html_content`` comes
    from the entry's ``content:encoded`` element and is ``None`` when that
    element is missing or empty. All three values are ``None`` when no entry
    matches. The feed is fetched with a 45 second timeout.
    """
    request = urllib.request.Request(JUYA_RSS, headers={'User-Agent': UA})
    with urllib.request.urlopen(request, timeout=45) as response:
        feed_xml = response.read().decode('utf-8', 'ignore')

    channel = ET.fromstring(feed_xml).find('channel')
    entries = channel.findall('item') if channel is not None else []
    content_ns = {'content': 'http://purl.org/rss/1.0/modules/content/'}

    for entry in entries:
        if (entry.findtext('title') or '').strip() != today:
            continue
        link = (entry.findtext('link') or '').strip()
        published = parse_pubdate(entry.findtext('pubDate') or '')
        # Embedded article HTML lets the caller skip a second HTTP request.
        encoded = entry.find('content:encoded', content_ns)
        body = encoded.text if encoded is not None and encoded.text else None
        return link, published, body
    return None, None, None
|
||
|
||
|
||
def _juya_external_links(body_html: str) -> list:
    """Return unique, entity-decoded hrefs in *body_html* that leave the Juya site."""
    from html import unescape

    found = []
    for href in re.findall(r'<a[^>]*href="([^"]+)"[^>]*>', body_html, re.I):
        href = unescape(href).strip()
        if not href or 'imjuya.github.io/juya-ai-daily' in href:
            continue
        if href not in found:
            found.append(href)
    return found


def _juya_summary(body_html: str, title: str) -> str:
    """Turn one story's HTML body into a summary of at most four text lines."""
    text = re.sub(r'<blockquote[^>]*>|</blockquote>', '\n', body_html, flags=re.I)
    text = re.sub(r'</p>|</li>|</ul>|<br\s*/?>', '\n', text, flags=re.I)
    text = re.sub(r'<li[^>]*>', '', text, flags=re.I)
    # Anchors are dropped together with their link text.
    text = re.sub(r'<a [^>]+>.*?</a>', ' ', text, flags=re.S | re.I)
    text = re.sub(r'<img[^>]*>', ' ', text, flags=re.I)
    text = re.sub(r'<[^>]+>', ' ', text)

    kept = []
    for raw_line in text.split('\n'):
        line = clean_text(raw_line)
        if not line:
            continue
        if line.startswith('相关链接'):
            # Everything from the "related links" line onward is ignored.
            break
        if line != title:
            kept.append(line)
    return ' '.join(kept[:4]).strip()


def parse_juya(today: str):
    """Parse the Juya AI daily issue for *today* into item dicts.

    The issue HTML is taken from the RSS entry when it is embedded there;
    otherwise the article page is downloaded and the contents of its
    ``<article>`` element are used. Each ``<h2>`` heading that ends in a
    ``<code>#N</code>`` marker starts one story. A story's URL is its first
    external link, else the heading link (if external), else the issue URL.

    Returns:
        A list of item dicts; empty when no issue exists for *today*, the
        fallback page cannot be fetched, or no ``<article>`` is found.

    Raises:
        Errors from ``fetch_juya_rss`` propagate.
    """
    from html import unescape

    juya_site = 'imjuya.github.io/juya-ai-daily'
    target, pub, html_content = fetch_juya_rss(today)
    if not target:
        return []

    if html_content is None:
        try:
            req = urllib.request.Request(target, headers={'User-Agent': UA})
            with urllib.request.urlopen(req, timeout=45) as r:
                page_html = r.read().decode('utf-8', 'ignore')
        except Exception as exc:
            # Best effort: report the failure but keep the pipeline running.
            print(f'parse_juya: failed to fetch {target}: {type(exc).__name__}: {exc}', file=sys.stderr)
            return []
        article = re.search(r'<article[^>]*>(.*?)</article>', page_html, re.S | re.I)
        if not article:
            return []
        article_html = article.group(1)
    else:
        article_html = html_content

    # "(?:</a>)?" makes the whole closing tag optional, mirroring the optional
    # opening <a ...>; "</a>?" would only make the final ">" optional.
    block_pattern = re.compile(
        r'<h2[^>]*>\s*(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?(?P<title_html>[^<]*?)(?:</a>)?\s*<code>#(?P<num>\d+)</code>\s*</h2>(?P<body>.*?)(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)',
        re.S | re.I,
    )

    results = []
    for m in block_pattern.finditer(article_html):
        title = clean_text(re.sub(r'<[^>]+>', ' ', m.group('title_html') or ''))
        if not title:
            continue
        title_url = unescape(m.group('title_url') or '').strip()
        body_html = m.group('body') or ''

        links = _juya_external_links(body_html)
        if links:
            url = links[0]
        elif title_url and juya_site not in title_url:
            url = title_url
        else:
            url = target

        if url and juya_site not in url:
            label = source_name_from_url(url, '橘鸦AI早报')
        else:
            label = '橘鸦AI早报'

        results.append({
            'source_group': '橘鸦AI早报',
            'source_label': label,
            'title_raw': title,
            'summary_raw': _juya_summary(body_html, title),
            'url': url,
            'published_at': pub.isoformat() if pub else None,
            'origin_type': 'juya_issue',
            'section_hint': '',
            'language_hint': 'zh',
        })
    return results
|
||
|
||
|
||
# ─── LLM infrastructure (unchanged) ─────────────────────────────────────────
|
||
|
||
def load_env():
    """Build the settings dict from ``~/.hermes/.env`` plus the process environment.

    The ``.env`` file (if present) is read as ``KEY=VALUE`` lines. Blank lines,
    ``#`` comments and lines without ``=`` are ignored; an optional leading
    ``export `` and one pair of matching surrounding quotes are removed.
    Non-empty process environment variables are applied last, so they override
    values from the file.

    Returns:
        dict mapping variable names to string values.
    """
    env = {}
    env_path = Path.home() / '.hermes' / '.env'
    if env_path.exists():
        # utf-8-sig also accepts a file that starts with a BOM.
        text = env_path.read_text(encoding='utf-8-sig', errors='ignore')
        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, value = line.split('=', 1)
            key = key.strip()
            if key.startswith('export '):
                key = key[len('export '):].strip()
            value = value.strip()
            # KEY="value" / KEY='value': drop the quotes so the raw secret is used.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            if key:
                env[key] = value
    env.update({k: v for k, v in os.environ.items() if v})
    return env
|
||
|
||
|
||
def resolve_llm_config(env: dict):
    """Work out which API key, base URL and model the LLM calls should use.

    Sources are consulted in this order, each one only filling values that are
    still missing:
      1. Explicit variables in *env*: ``LLM_*``; when ``LLM_API_KEY`` is unset,
         the ``SUB2API_*`` family (also chosen when the configured provider is
         ``sub2api``) or else the ``XIAOMI_*`` / ``XIAOMI_MIMO_*`` family. Key,
         URL and model are taken from one family so they are not mixed.
      2. ``~/.hermes/config.yaml``: the ``model`` section and the matching
         entry under ``providers``.
      3. ``~/.hermes/auth.json``: first non-empty ``credential_pool`` entry for
         the provider or one of the known aliases.
      4. Legacy environment fallbacks; the model defaults to ``mimo-v2.5-pro``.
      5. ``auth.json`` pool entry of the fallback provider
         (``LLM_FALLBACK_PROVIDER`` / ``XIAOMI_FALLBACK_PROVIDER``, default
         ``openrouter``).

    Returns:
        ``(api_key, base_url, model_name)``.

    Raises:
        RuntimeError: no API key, or no base URL, could be determined.
    """
    import yaml

    hermes_dir = Path.home() / '.hermes'

    def first_env(*names: str) -> str:
        # First non-blank value among the named variables, else ''.
        stripped = ((env.get(name) or '').strip() for name in names)
        return next((value for value in stripped if value), '')

    # Hermes model configuration.
    cfg = {}
    cfg_path = hermes_dir / 'config.yaml'
    if cfg_path.exists():
        with open(cfg_path) as f:
            cfg = yaml.safe_load(f) or {}

    model_cfg = cfg.get('model', {}) or {}
    provider = (model_cfg.get('provider') or '').strip()
    base_url = (model_cfg.get('base_url') or '').rstrip('/')
    model_name = (model_cfg.get('default') or '').strip()

    # Explicit overrides. Endpoint, key and model must come from the same
    # provider family: pairing SUB2API_API_KEY with XIAOMI_BASE_URL produces
    # 401 responses after Hermes is switched to a Sub2API model.
    explicit_api_key = first_env('LLM_API_KEY')
    explicit_base_url = first_env('LLM_BASE_URL')
    explicit_model = first_env('LLM_MODEL')

    if not explicit_api_key:
        sub2api_active = provider == 'sub2api' or first_env(
            'SUB2API_API_KEY', 'SUB2API_BASE_URL', 'SUB2API_MODEL')
        if sub2api_active:
            explicit_api_key = first_env('SUB2API_API_KEY')
            explicit_base_url = first_env('SUB2API_BASE_URL') or base_url
            explicit_model = first_env('SUB2API_MODEL') or model_name
        elif first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'XIAOMI_BASE_URL',
                       'XIAOMI_MIMO_BASE_URL', 'XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL'):
            explicit_api_key = first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY')
            explicit_base_url = first_env('XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL')
            explicit_model = first_env('XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL')

    if explicit_base_url:
        base_url = explicit_base_url.rstrip('/')
    if explicit_model:
        model_name = explicit_model

    provider_def = (cfg.get('providers', {}) or {}).get(provider, {}) or {}
    if not base_url and provider_def.get('base_url'):
        base_url = str(provider_def.get('base_url')).rstrip('/')
    if not explicit_api_key and provider_def.get('key_env'):
        explicit_api_key = first_env(str(provider_def.get('key_env')))

    # Provider whose stored credentials are used as the last resort.
    fallback_provider = first_env('LLM_FALLBACK_PROVIDER', 'XIAOMI_FALLBACK_PROVIDER') or 'openrouter'

    api_key = explicit_api_key
    auth_path = hermes_dir / 'auth.json'

    def read_pool():
        with open(auth_path) as f:
            auth = json.load(f)
        return auth.get('credential_pool', {}) or {}

    def first_credential(pool, keys):
        # First credential of the first pool key that has any; None otherwise.
        for pool_key in keys:
            creds = pool.get(pool_key, [])
            if creds:
                return creds[0]
        return None

    def adopt(cred):
        # Fill whichever of key / URL / model is still missing from *cred*.
        nonlocal api_key, base_url, model_name
        source = cred.get('source', '')
        if source.startswith('env:'):
            api_key = env.get(source[4:], '') or api_key
        if not api_key:
            api_key = cred.get('access_token', '') or api_key
        if not base_url:
            base_url = (cred.get('base_url') or '').rstrip('/')
        if not model_name:
            model_name = cred.get('model', '') or model_name

    # Credential pool of the active provider, then known aliases.
    if not api_key and auth_path.exists():
        provider_keys = []
        if provider:
            provider_keys.extend([provider, provider.replace('-', '_')])
        provider_keys.extend(['sub2api', 'xiaomi', 'xiaomi_mimo', 'sensenova'])
        cred = first_credential(read_pool(), provider_keys)
        if cred is not None:
            adopt(cred)

    # Legacy environment fallbacks.
    if not api_key:
        api_key = first_env('LLM_API_KEY', 'XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'OPENROUTER_API_KEY')
    if not base_url:
        base_url = first_env('LLM_BASE_URL', 'XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL', 'OPENROUTER_BASE_URL').rstrip('/')
    if not model_name:
        model_name = first_env('LLM_MODEL') or 'mimo-v2.5-pro'

    # Fallback provider's stored credentials, so the daily run keeps publishing.
    if not api_key and fallback_provider and auth_path.exists():
        cred = first_credential(
            read_pool(), [fallback_provider, fallback_provider.replace('-', '_')])
        if cred is not None:
            adopt(cred)
            provider = fallback_provider

    if not api_key:
        raise RuntimeError(
            f'No API key found for provider "{provider}" or fallback "{fallback_provider}". '
            'Set SUB2API_API_KEY / XIAOMI_API_KEY / LLM_API_KEY or fix ~/.hermes/auth.json'
        )
    if not base_url:
        raise RuntimeError(
            f'No base_url found for provider "{provider}" or fallback "{fallback_provider}". '
            'Set SUB2API_BASE_URL / XIAOMI_BASE_URL / LLM_BASE_URL or fix ~/.hermes/auth.json'
        )

    return api_key, base_url, model_name
|
||
|
||
|
||
def _try_llm_request(base_url: str, api_key: str, model: str, prompt_text: str, auth_mode: str, api_key_header: str = 'Authorization'):
    """POST one chat-completion request and return the reply text.

    The key is sent in *api_key_header*. It is prefixed with ``Bearer `` only
    when the header is ``Authorization`` and *auth_mode* is ``'bearer'``;
    otherwise the raw key is sent. Returns the stripped ``content`` of the
    first choice. HTTP and parsing errors propagate.
    """
    body = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt_text}],
        'temperature': 0.2,
        'max_tokens': 8000,
    }
    use_bearer = api_key_header == 'Authorization' and auth_mode == 'bearer'
    headers = {'Content-Type': 'application/json'}
    headers[api_key_header] = f'Bearer {api_key}' if use_bearer else api_key

    request = urllib.request.Request(
        f'{base_url}/chat/completions',
        data=json.dumps(body, ensure_ascii=False).encode('utf-8'),
        headers=headers,
    )
    with urllib.request.urlopen(request, timeout=600) as response:
        reply = json.loads(response.read().decode('utf-8'))
    return reply['choices'][0]['message']['content'].strip()
|
||
|
||
|
||
def llm_call(prompt_text: str, env: dict) -> str:
    """Send *prompt_text* as a single user message and return the model's reply.

    Key, endpoint and model come from ``resolve_llm_config(env)``. The request
    goes to ``<base_url>/chat/completions`` with Bearer authentication and a
    600 second timeout. On an HTTP error the status, reason and the first 500
    characters of the response body are written to stderr before the error is
    re-raised.
    """
    api_key, base_url, model = resolve_llm_config(env)

    # One explicit request path keeps cron failures easy to diagnose.
    request_body = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt_text}],
        'temperature': 0.2,
        'max_tokens': 8000,
    }
    req = urllib.request.Request(
        f'{base_url}/chat/completions',
        data=json.dumps(request_body, ensure_ascii=False).encode('utf-8'),
        headers={'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'},
    )
    print(f'llm_call request: base_url={base_url}; model={model}', file=sys.stderr)

    try:
        with urllib.request.urlopen(req, timeout=600) as r:
            raw = r.read()
    except urllib.error.HTTPError as e:
        try:
            body = e.read().decode('utf-8', 'ignore')
        except Exception:
            body = ''
        print(f'llm_call failed: HTTP {e.code} {e.reason}; base_url={base_url}; model={model}; body={body[:500]}', file=sys.stderr)
        raise

    resp = json.loads(raw.decode('utf-8'))
    return resp['choices'][0]['message']['content'].strip()
|
||
|
||
|
||
def _parse_json_from_llm(text: str):
    """Extract and decode the JSON object contained in an LLM reply.

    A surrounding Markdown code fence is removed, then the span from the first
    ``{`` to the final ``}`` is parsed. Strict parsing is tried first; only if
    that fails are trailing commas before ``}`` / ``]`` removed and the text
    parsed again, so valid JSON whose strings contain ``", }"`` is not altered.

    Raises:
        ValueError: no JSON object is present, or it cannot be decoded
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # Strip first so a fence preceded by blank lines is still recognised.
    text = (text or '').strip()
    text = re.sub(r'^```(?:json)?\s*\n?', '', text)
    text = re.sub(r'\n?```\s*$', '', text)
    text = text.strip()

    m = re.search(r'\{.*\}\s*$', text, re.S)
    if not m:
        raise ValueError('LLM 输出中未找到 JSON 对象')
    raw_json = m.group(0)
    try:
        return json.loads(raw_json)
    except json.JSONDecodeError:
        # Common LLM slip: a trailing comma before a closing bracket.
        repaired = re.sub(r',\s*([}\]])', r'\1', raw_json)
        return json.loads(repaired)
|
||
|
||
|
||
def _normalize_title(title: str) -> str:
    """Return *title* lower-cased with all non-word characters removed.

    Letters, digits, underscores and CJK characters are kept; whitespace and
    punctuation are dropped. ``None`` is treated as an empty title.
    """
    lowered = (title or '').lower()
    return re.sub(r'[^\w\u4e00-\u9fff]+', '', lowered)
|
||
|
||
|
||
# ─── Stage 0: Script dedup (no LLM) ────────────────────────────────────────
|
||
|
||
def stage0_script_dedup(raw_items: list) -> list:
    """Drop near-duplicate items by comparing normalized titles.

    Two items count as the same event when the ``difflib.SequenceMatcher``
    ratio of their normalized titles exceeds 0.7; of such a pair, the one with
    the longer ``summary_raw`` is kept, in the position of the earlier one.
    Items whose normalized title has fewer than three characters are
    discarded.
    """
    if not raw_items:
        return []

    survivors = []  # (normalized_title, item) pairs, in first-seen order
    for candidate in raw_items:
        key = _normalize_title(candidate.get('title_raw', ''))
        if len(key) < 3:
            continue

        # Position of the first survivor that describes the same event, if any.
        match_pos = next(
            (
                pos
                for pos, (kept_key, _) in enumerate(survivors)
                if difflib.SequenceMatcher(None, key, kept_key).ratio() > 0.7
            ),
            None,
        )
        if match_pos is None:
            survivors.append((key, candidate))
            continue

        kept_item = survivors[match_pos][1]
        if len(candidate.get('summary_raw', '')) > len(kept_item.get('summary_raw', '')):
            survivors[match_pos] = (key, candidate)

    return [item for _, item in survivors]
|
||
|
||
|
||
# ─── Stage 1: LLM semantic dedup ───────────────────────────────────────────
|
||
|
||
def stage1_llm_dedup(items: list, env: dict):
    """Ask the LLM which items to keep after semantic de-duplication.

    Titles (first 80 characters) and summaries (first 120) are sent with their
    list index; the reply must be ``{"keep_indices": [...]}``.

    Returns:
        ``(kept_items, None)`` on success, with kept items in ascending index
        order. On any failure, including a reply with no valid index, returns
        ``(items, error_message)`` with the input unchanged.
    """
    if not items:
        return items, None

    indexed = [
        {
            'index': pos,
            'title': entry.get('title_raw', '')[:80],
            'summary': entry.get('summary_raw', '')[:120],
        }
        for pos, entry in enumerate(items)
    ]

    prompt = (
        '以下是AI领域的新闻条目。有些条目虽然措辞不同,但描述的是同一个事件。'
        '请识别重复项,输出要保留的条目索引列表。只有描述完全相同的具体事件才视为重复。\n\n'
        f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
        '请严格按以下JSON格式输出,不要包含任何其他内容:\n'
        '{"keep_indices": [0, 1, 3, 5]}'
    )

    try:
        reply = _parse_json_from_llm(llm_call(prompt, env))
        indices = reply.get('keep_indices', [])
        if not isinstance(indices, list):
            raise ValueError('keep_indices is not a list')
        # Ignore anything that is not an in-range integer.
        valid = sorted({i for i in indices if isinstance(i, int) and 0 <= i < len(items)})
        if not valid:
            raise ValueError('No valid indices in keep_indices')
        return [items[i] for i in valid], None
    except Exception as e:
        err = f'stage1_llm_dedup failed: {type(e).__name__}: {e}'
        print(err)
        return items, err
|
||
|
||
|
||
# ─── Stage 2a: LLM summary rewrite (parallel) ──────────────────────────────
|
||
|
||
def stage2a_rewrite_summaries(items: list, env: dict):
    """Have the LLM rewrite every title and summary as concise Chinese.

    Titles (first 80 characters) and summaries (first 200) are sent with their
    list index; the reply must be ``{"summaries": [{index, title, summary}]}``.
    Non-empty rewritten values replace ``title_raw`` / ``summary_raw`` on
    copies of the matching items; entries with an invalid index are ignored.

    Returns:
        ``(new_items, None)`` on success, or ``(items, error_message)`` with
        the input unchanged when anything fails.
    """
    if not items:
        return items, None

    indexed = [
        {
            'index': pos,
            'title': entry.get('title_raw', '')[:80],
            'summary': entry.get('summary_raw', '')[:200],
        }
        for pos, entry in enumerate(items)
    ]

    prompt = (
        '请将以下新闻条目的标题和摘要改写为简洁中文。'
        '标题:英文品牌名/模型名保留原样(如GPT-5、Codex),其余翻译为中文。'
        '摘要:每条最多120字,保留核心事实。\n\n'
        f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
        '请严格按以下JSON格式输出:\n'
        '{"summaries": [{"index": 0, "title": "中文标题", "summary": "改写后的摘要"}, ...]}'
    )

    try:
        reply = _parse_json_from_llm(llm_call(prompt, env))
        summaries = reply.get('summaries', [])
        if not isinstance(summaries, list):
            raise ValueError('summaries is not a list')

        rewritten = [dict(entry) for entry in items]  # shallow copies; input is left untouched
        for entry in summaries:
            idx = entry.get('index')
            new_summary = entry.get('summary', '')
            new_title = entry.get('title', '')
            if not (isinstance(idx, int) and 0 <= idx < len(rewritten)):
                continue
            if new_title:
                rewritten[idx] = dict(rewritten[idx], title_raw=new_title)
            if new_summary:
                rewritten[idx] = dict(rewritten[idx], summary_raw=new_summary)
        return rewritten, None
    except Exception as e:
        err = f'stage2a_rewrite_summaries failed: {type(e).__name__}: {e}'
        print(err)
        return items, err
|
||
|
||
|
||
# ─── Stage 2b: LLM classify (parallel) ──────────────────────────────────────
|
||
|
||
def stage2b_classify(items: list, env: dict):
    """Have the LLM assign each item to one of the ``SECTION_ORDER`` sections.

    Titles (first 80 characters) and summaries (first 120) are sent with their
    list index; the reply must be ``{"sections": [{index, section}]}``. A
    section is stored in ``section_hint`` on a copy of the item only when the
    index is valid and the name is one of ``SECTION_ORDER``.

    Returns:
        ``(new_items, None)`` on success, or ``(items, error_message)`` with
        the input unchanged when anything fails.
    """
    if not items:
        return items, None

    indexed = [
        {
            'index': pos,
            'title': entry.get('title_raw', '')[:80],
            'summary': entry.get('summary_raw', '')[:120],
        }
        for pos, entry in enumerate(items)
    ]

    sections_str = '、'.join(SECTION_ORDER)
    prompt = (
        f'请将以下AI新闻条目分类到对应板块。\n'
        f'可选板块:{sections_str}\n\n'
        f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
        '请严格按以下JSON格式输出:\n'
        '{"sections": [{"index": 0, "section": "模型发布/更新"}, ...]}'
    )

    try:
        reply = _parse_json_from_llm(llm_call(prompt, env))
        sections = reply.get('sections', [])
        if not isinstance(sections, list):
            raise ValueError('sections is not a list')

        classified = [dict(entry) for entry in items]  # shallow copies; input is left untouched
        for entry in sections:
            idx = entry.get('index')
            sec = entry.get('section', '')
            index_ok = isinstance(idx, int) and 0 <= idx < len(classified)
            # Section names outside SECTION_ORDER are ignored.
            if index_ok and sec and sec in SECTION_ORDER:
                classified[idx] = dict(classified[idx], section_hint=sec)
        return classified, None
    except Exception as e:
        err = f'stage2b_classify failed: {type(e).__name__}: {e}'
        print(err)
        return items, err
|
||
|
||
|
||
# ─── Stage 2 parallel execution ─────────────────────────────────────────────
|
||
|
||
def stage2_parallel(items: list, env: dict):
    """Run the summary rewrite and the classification concurrently and merge them.

    Both stages receive the same *items* and run on a two-worker thread pool.
    The merged item at each position is the rewrite stage's item, with
    ``section_hint`` replaced by the classification stage's value when that is
    non-empty. A stage that fails contributes the original items instead.

    Returns:
        ``(merged_items, errors)`` where *errors* lists the failure messages
        of the stages (empty when both succeeded).
    """
    errors = []
    outcomes = {'stage2a': items, 'stage2b': items}

    with ThreadPoolExecutor(max_workers=2) as executor:
        pending = {
            'stage2a': executor.submit(stage2a_rewrite_summaries, items, env),
            'stage2b': executor.submit(stage2b_classify, items, env),
        }
        # Collect the rewrite first, then the classification.
        for label, future in pending.items():
            try:
                stage_items, err = future.result()
                outcomes[label] = stage_items
                if err:
                    errors.append(err)
            except Exception as e:
                errors.append(f'{label} exception: {type(e).__name__}: {e}')

    rewritten = outcomes['stage2a']
    classified = outcomes['stage2b']

    merged = []
    for pos, original in enumerate(items):
        combined = dict(rewritten[pos]) if pos < len(rewritten) else dict(original)
        if pos < len(classified) and classified[pos].get('section_hint'):
            combined['section_hint'] = classified[pos]['section_hint']
        merged.append(combined)

    return merged, errors
|
||
|
||
|
||
# ─── Stage 3: LLM guide/observation ────────────────────────────────────────
|
||
|
||
def llm_generate_guide(items, today: str, env: dict) -> str:
    """Ask the LLM for the editorial text: main theme, signals and open questions.

    Args:
        items: Final items; each must provide ``title``, ``summary`` and
            ``section`` (``source`` is optional). Summaries are cut to 100
            characters for the prompt.
        today: Date string passed to the model as context.
        env: Settings dict forwarded to ``llm_call``.

    Returns:
        The reply with any surrounding code fence and quotes removed, or ``''``
        when the LLM call fails (the failure is reported on stderr).

    Raises:
        KeyError: an item lacks ``title``, ``summary`` or ``section``.
    """
    indexed = [
        {
            'n': number,
            'title': item['title'],
            'summary': item['summary'][:100],
            'section': item['section'],
            'source': item.get('source', ''),
        }
        for number, item in enumerate(items, 1)
    ]
    prompt = {
        'date': today,
        'task': (
            '你是AI行业编辑。根据以下已经分类和摘要改写好的条目,写「今日观察」。\n\n'
            '格式要求:\n'
            '【主线】blockquote格式,一句话概括今天最值得关注的趋势(不要套话,要具体)\n'
            '【强信号】2-3条,每条格式:编号. 标题(一句话)+ 一两句说明为什么重要\n'
            '【中信号】1-2条,格式同上\n'
            '【待验证】1-2条,格式同上,说明为什么存疑\n\n'
            '写作要求:\n'
            '- 不要空泛总结(如"行业焦点转向XX"),要指向具体事件\n'
            '- 不要引用编号如[1][3],读者看不到对应关系\n'
            '- 不要建议("开发者应该..."之类删掉)\n'
            '- 每条控制在2-3句话以内\n'
            '- 用大白话,不要学术腔\n'
        ),
        'items': indexed,
        'rule': '只输出观察文本,不要代码块、不要JSON。严格使用【主线】【强信号】【中信号】【待验证】四个标记。'
    }
    query = json.dumps(prompt, ensure_ascii=False)
    try:
        text = llm_call(query, env)
    except Exception as e:
        # The guide is optional, so the caller still gets ''; the failure is
        # logged so it does not disappear silently.
        print(f'llm_generate_guide failed: {type(e).__name__}: {e}', file=sys.stderr)
        return ''
    text = re.sub(r'^```(?:\w+)?\s*\n?', '', text)
    text = re.sub(r'\n?```\s*$', '', text)
    return text.strip().strip('"').strip("'")
|
||
|
||
|
||
# ─── Rendering helpers (unchanged) ──────────────────────────────────────────
|
||
|
||
def _parse_guide_sections(guide: str):
    """Split guide text into ``{marker: content}`` by its ``【...】`` markers.

    Recognised markers are 主线, 强信号, 中信号, 待验证 and 建议. Text before
    the first marker is discarded; when a marker repeats, the last occurrence
    wins. Content is stripped of surrounding whitespace.
    """
    pieces = re.split(r'【(主线|强信号|中信号|待验证|建议)】', guide)
    # re.split yields [preamble, marker, content, marker, content, ...].
    return {
        marker.strip(): content.strip()
        for marker, content in zip(pieces[1::2], pieces[2::2])
    }
|
||
|
||
|
||
def _make_ref_factory(items):
    """Return a ``re.sub`` callback that renders ``[N]`` citations for *items*.

    The callback reads N from group 1 of the match. For 1 <= N <= len(items)
    it returns a superscript, linked to the item's ``url`` when that is
    non-empty; any other N leaves the matched text unchanged.
    """
    def make_ref(m):
        number = int(m.group(1))
        if not 1 <= number <= len(items):
            return m.group(0)
        link = items[number - 1].get('url', '')
        if not link:
            return f'<sup>[{number}]</sup>'
        return f'<sup><a href="{link}">[{number}]</a></sup>'

    return make_ref
|
||
|
||
|
||
def _render_guide_section(lines, title, text, items, is_quote=False):
    """Append one guide section to *lines*: bold title, blank line, content.

    Each non-empty line of *text* has its ``[N]`` citations converted through
    ``_make_ref_factory(items)`` and literal ``[N]`` placeholders removed.
    Lines are emitted as blockquote lines when *is_quote* is true. A blank
    line closes the section. *lines* is modified in place.
    """
    to_ref = _make_ref_factory(items)
    lines.append(f'**{title}**')
    lines.append('')

    for raw in text.split('\n'):
        rendered = raw.strip()
        if not rendered:
            continue
        rendered = re.sub(r'\[(\d+)\]', to_ref, rendered)
        rendered = re.sub(r'\[N\]', '', rendered).strip()
        if not rendered:
            continue
        lines.append(f'> {rendered}' if is_quote else rendered)

    lines.append('')
|
||
|
||
|
||
def format_source_link(item):
    """Return the item's source as a Markdown link, or as plain text without a URL.

    A missing or empty ``source`` is shown as ``来源``.
    """
    label = item.get('source') or '来源'
    target = item.get('url') or ''
    return f'[{label} ↗]({target})' if target else label
|
||
|
||
|
||
def blog_markdown(items, guide=None):
    """Render the daily post as Markdown.

    Layout:
      1. ``## 导览``: guide entries of type ``theme``, as blockquotes.
      2. One ``##`` heading per non-empty section in ``SECTION_ORDER``; items
         are numbered continuously across sections, and each shows a summary
         (cut to 120 characters) followed by its source link.
      3. ``## 总结``: guide entries of type ``strong`` / ``medium`` / ``risk``
         as bullets, each split into a bold title and a body.

    Args:
        items: Dicts with ``section``, ``title`` and ``summary`` (plus the
            keys ``format_source_link`` reads).
        guide: Optional list of ``{'type': ..., 'text': ...}`` dicts; any
            other value is treated as "no guide".

    Returns:
        The Markdown text, stripped of leading and trailing whitespace.
    """
    grouped = {k: [] for k in SECTION_ORDER}
    for item in items:
        # NOTE(review): items whose section is not in SECTION_ORDER are grouped
        # here but never rendered below — confirm this is intended.
        grouped.setdefault(item['section'], []).append(item)

    guide_items = guide if isinstance(guide, list) else []
    lines = []

    def clean_guide_text(text):
        # Drop citation markers and a leading "主线判断:" label; squeeze whitespace.
        text = re.sub(r'\[\d+\]', '', text)
        text = re.sub(r'\[N\]', '', text).strip()
        text = re.sub(r'^主线判断[::]\s*', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def split_title(text):
        # Prefer "title: body" when the part before the colon is short.
        head = re.search(r'^(.+?)[::]\s*', text)
        if head and len(head.group(1)) < 60:
            return head.group(1).strip(), text[head.end():].strip()
        # Otherwise the first sentence is the title and the rest is the body.
        first_sentence = re.split(r'[。!?]', text)[0]
        rest = text[len(first_sentence):].strip()
        if rest and rest[0] in '。!?':
            rest = rest[1:].strip()
        return first_sentence.strip(), rest

    # === Top: overview (theme only) ===
    theme_items = [g for g in guide_items if g.get('type') == 'theme']
    if theme_items:
        lines.append('## 导览')
        lines.append('')
        for g in theme_items:
            text = clean_guide_text(g.get('text', ''))
            if text:
                for para in text.split('\n'):
                    para = para.strip()
                    if para:
                        lines.append(f'> {para}')
                lines.append('')

    # === News sections ===
    n = 1
    for sec in SECTION_ORDER:
        sec_items = grouped.get(sec, [])
        if not sec_items:
            continue
        lines.append(f'## {sec}')
        lines.append('')
        for item in sec_items:
            summary = item['summary'].strip()
            if len(summary) > 120:
                summary = summary[:120].rstrip() + '…'
            source_link = format_source_link(item)
            # Close the sentence before the source link is appended.
            if summary and summary[-1] not in '。!?…':
                summary += '。'
            lines.append(f'**{n}. {item["title"]}**')
            lines.append('')
            lines.append(f'> {summary}{source_link}')
            lines.append('')
            n += 1

    # === Bottom: summary (strong / medium / risk) ===
    type_labels = {'strong': '强信号', 'medium': '中信号', 'risk': '待验证'}
    summary_types = ['strong', 'medium', 'risk']
    summary_items = [g for g in guide_items if g.get('type') in summary_types]
    if summary_items:
        lines.append('## 总结')
        lines.append('')
        for t in summary_types:
            type_items = [g for g in summary_items if g.get('type') == t]
            if not type_items:
                continue
            lines.append(f'**{type_labels.get(t, t)}**')
            lines.append('')
            for g in type_items:
                text = clean_guide_text(g.get('text', ''))
                if not text:
                    continue
                title, content = split_title(text)
                lines.append(f'- **{title}**')
                if content:
                    lines.append(f'  {content}')
                lines.append('')

    return '\n'.join(lines).strip()
|
||
|
||
|
||
def short_summary(blog_url):
    """Build the one-line chat notification announcing the published post.

    Args:
        blog_url: URL of the post; interpolated into the message as-is.

    Returns:
        The fixed announcement text followed by ``blog_url``.
    """
    announcement = 'AI日报已发布 👉 {}'
    return announcement.format(blog_url)
|
||
|
||
|
||
def blog_api_request(method, path, payload=None, token=None, base_url=None):
    """Send one request to the blog service API and return the decoded JSON reply.

    Args:
        method: HTTP verb passed to ``urllib.request.Request`` (e.g. ``'POST'``).
        path: API path appended to ``base_url``.
        payload: Optional JSON-serialisable body. When given, it is sent as
            UTF-8 encoded JSON with a ``Content-Type: application/json`` header.
        token: Bearer token for the ``Authorization`` header. The header is
            omitted when the token is falsy instead of sending ``Bearer None``.
        base_url: Root URL of the blog service; trailing slashes are stripped
            before ``path`` is appended.

    Returns:
        The parsed JSON response, or ``{}`` when the response body is empty.

    Raises:
        ValueError: If ``base_url`` is missing or empty.
        urllib.error.URLError: Propagated from ``urlopen`` (includes HTTPError).
        json.JSONDecodeError: If a non-empty response body is not valid JSON.
    """
    # Fail early with a clear message instead of AttributeError on None.rstrip.
    if not base_url:
        raise ValueError('blog_api_request requires a non-empty base_url')

    url = base_url.rstrip('/') + path
    headers = {'User-Agent': UA}
    if token:
        headers['Authorization'] = f'Bearer {token}'

    data = None
    if payload is not None:
        data = json.dumps(payload, ensure_ascii=False).encode('utf-8')
        headers['Content-Type'] = 'application/json'

    req = urllib.request.Request(url, data=data, headers=headers, method=method)
    with urllib.request.urlopen(req, timeout=25) as r:
        body = r.read().decode('utf-8')

    # A successful request may carry no body; treat that as an empty result
    # rather than letting json.loads('') raise.
    if not body.strip():
        return {}
    return json.loads(body)
|
||
|
||
|
||
# ─── Main pipeline ──────────────────────────────────────────────────────────
|
||
|
||
def main():
    """Run the daily pipeline end to end and publish the resulting blog post.

    Steps, in order:
      1. Load settings via ``load_env``; exit with status 1 if no service token.
      2. Collect raw items from AI HOT, the feeds in ``RSS_FEEDS`` and Juya.
         A 404 from AI HOT prints a notice and returns without publishing;
         failures of the other sources are recorded in ``errors`` and skipped.
      3. Stage 0-2: script dedup, LLM dedup, parallel rewrite/classify.
      4. Convert items to the final dict format (deduplicated by normalised title).
      5. Stage 3: generate the guide text and split it into typed entries.
      6. Stage 4: build the markdown, then create + publish the post (or skip
         both when ``AI_DAILY_DRY_RUN`` is ``1``/``true``/``yes``) and probe the
         public URL.
      7. Write ``raw_items.json``, ``llm_digest.json``, ``blog_markdown.md``,
         ``chat_summary.txt`` and ``run_meta.json`` into ``OUT_DIR`` and print
         the chat message.

    Raises:
        urllib.error.HTTPError: Non-404 errors from AI HOT, and any error from
            the blog create/publish calls, are propagated.
        SystemExit: Status 1 when the token is missing or no slug is returned.
    """
    env = load_env()
    token = env.get('BLOG_SERVICE_TOKEN') or env.get('EPHRON_SERVICE_TOKEN')
    # Use `or` so a blank value also falls back to the default, and strip
    # trailing slashes so the public URL built below never contains '//posts'
    # (blog_api_request already strips them for API calls).
    base_url = (env.get('BLOG_API_BASE_URL') or 'https://blog.ephron.ren').rstrip('/')
    if not token:
        print('缺少 blog service token,已停止。')
        sys.exit(1)

    errors = []
    source_counts = {}
    raw_items = []

    # ── Collect raw items ────────────────────────────────────────────────────
    # AI HOT is mandatory: a 404 means today's report is not out yet, so stop
    # quietly; any other HTTP error aborts the run.
    try:
        aihot_items, raw_daily = parse_aihot(TODAY)
        raw_items.extend(aihot_items)
        source_counts['AI HOT'] = len(aihot_items)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            print(f'今天({TODAY})的 AI HOT 完整日报还没有生成,暂不发布。')
            return
        raise

    # Supplementary RSS feeds are best-effort: record the failure and go on.
    for name, url in RSS_FEEDS.items():
        try:
            parsed = parse_rss(name, url)
            raw_items.extend(parsed)
            source_counts[name] = len(parsed)
        except Exception as e:
            errors.append(f'{name}: {type(e).__name__}')
            source_counts[name] = 0

    juya_items = []
    try:
        juya_items = parse_juya(TODAY)
    except Exception as e:
        errors.append(f'橘鸦AI早报: {type(e).__name__}')

    # If juya returned nothing (or failed), wait 2 minutes and retry once.
    if not juya_items:
        print('橘鸦AI早报尚未就绪,等待 2 分钟后重试...')
        time.sleep(120)
        try:
            juya_items = parse_juya(TODAY)
        except Exception as e:
            errors.append(f'橘鸦AI早报(重试): {type(e).__name__}')

    raw_items.extend(juya_items)
    source_counts['橘鸦AI早报'] = len(juya_items)

    raw_path = OUT_DIR / 'raw_items.json'
    raw_path.write_text(json.dumps(raw_items, ensure_ascii=False, indent=2), encoding='utf-8')

    # ── Stage 0: Script dedup ────────────────────────────────────────────────
    print(f'Stage 0: Script dedup — {len(raw_items)} raw items')
    items = stage0_script_dedup(raw_items)
    stage0_count = len(items)
    print(f'Stage 0 done — {stage0_count} unique items')

    # ── Stage 1: LLM semantic dedup ──────────────────────────────────────────
    print('Stage 1: LLM semantic dedup')
    items, stage1_err = stage1_llm_dedup(items, env)
    if stage1_err:
        errors.append(stage1_err)
    print(f'Stage 1 done — {len(items)} items')

    # ── Stage 2: Parallel summary rewrite + classify ─────────────────────────
    print('Stage 2: Parallel summary rewrite + classify')
    items, stage2_errs = stage2_parallel(items, env)
    errors.extend(stage2_errs)
    print(f'Stage 2 done — {len(items)} items')

    # ── Build final items with title/source fields ───────────────────────────
    # Items still carry the raw field names here; convert them to the final
    # format and drop entries whose normalised title was already seen.
    final_items = []
    seen_titles = set()
    for item in items:
        title = clean_text(item.get('title_raw', ''))
        summary = clean_text(item.get('summary_raw', ''))[:120]
        if not title:
            continue
        norm = _normalize_title(title)
        if norm in seen_titles:
            continue
        seen_titles.add(norm)
        # Unknown or missing section hints fall back to the default section.
        section = item.get('section_hint', '') or '行业与公司'
        if section not in SECTION_ORDER:
            section = '行业与公司'
        final_items.append({
            'title': title,
            'summary': summary or '该条目暂无摘要。',
            'section': section,
            'url': item.get('url') or '',
            'source': item.get('source_label') or item.get('source_group') or '来源',
            'source_group': item.get('source_group') or '未知来源',
            'dedupe_keys': [norm],
        })

    # ── Stage 3: LLM guide/observation ───────────────────────────────────────
    print('Stage 3: LLM guide generation')
    guide_text = llm_generate_guide(final_items, TODAY, env)

    # Parse the guide into the structured list that blog_markdown consumes.
    guide_structured = []
    if guide_text:
        sections = _parse_guide_sections(guide_text)
        type_map = {'主线': 'theme', '强信号': 'strong', '中信号': 'medium', '待验证': 'risk'}
        for key, text in sections.items():
            # Unrecognised section keys are treated as theme text.
            guide_type = type_map.get(key, 'theme')
            if guide_type == 'theme':
                guide_structured.append({'type': 'theme', 'text': text})
                continue
            # Signal sections: one entry per non-empty line, with a leading
            # number such as "1. " or "1、" removed.
            for raw_line in text.split('\n'):
                entry = raw_line.strip()
                if not entry:
                    continue
                entry = re.sub(r'^\d+[\.\、]\s*', '', entry)
                if entry:
                    guide_structured.append({'type': guide_type, 'text': entry})

    # ── Stage 4: Assemble and publish ────────────────────────────────────────
    print('Stage 4: Assemble and publish')
    md = blog_markdown(final_items, guide_structured)
    title = f'AI日报 · {TODAY}'
    tags = ['AI日报', 'AI资讯', '人工智能']
    payload = {'title': title, 'content': md, 'tags': tags}

    dry_run = (env.get('AI_DAILY_DRY_RUN') or '').strip().lower() in ('1', 'true', 'yes')
    if dry_run:
        # Dry run: nothing is created, so use a placeholder slug and skip the
        # public check.
        slug = f'dry-run-{TODAY}'
        blog_url = f'{base_url}/posts/{slug}'
        public_ok = True
        print('AI_DAILY_DRY_RUN=1:已完成组装验证,跳过博客创建/发布。')
    else:
        create_resp = blog_api_request('POST', '/api/service/posts', payload=payload, token=token, base_url=base_url)
        slug = create_resp.get('slug')
        if not slug:
            print('Blog 草稿创建失败:未返回 slug')
            sys.exit(1)
        blog_api_request('POST', f'/api/service/posts/{slug}/publish', token=token, base_url=base_url)
        blog_url = f'{base_url}/posts/{slug}'

        # Best-effort probe of the public page; any failure only adds a
        # warning to the chat message, it does not abort the run.
        public_ok = False
        try:
            req = urllib.request.Request(blog_url, headers={'User-Agent': UA})
            with urllib.request.urlopen(req, timeout=20) as r:
                public_ok = getattr(r, 'status', None) == 200
        except Exception:
            public_ok = False

    msg = short_summary(blog_url)
    if errors:
        msg += '\n\n注:部分补充源本次采集失败或LLM阶段出错,已自动降级:' + ';'.join(errors)
    if not public_ok:
        msg += '\n\n警告:blog 草稿/发布接口已返回成功,但公开链接暂未验证为 200,请人工复核。'

    # Build digest for JSON output
    digest = {
        'items': final_items,
        'featured_titles': [i['title'] for i in final_items[:6]],
        'guide': guide_structured,
    }

    (OUT_DIR / 'llm_digest.json').write_text(json.dumps(digest, ensure_ascii=False, indent=2), encoding='utf-8')
    (OUT_DIR / 'blog_markdown.md').write_text(md, encoding='utf-8')
    (OUT_DIR / 'chat_summary.txt').write_text(msg, encoding='utf-8')
    (OUT_DIR / 'run_meta.json').write_text(json.dumps({
        'date': TODAY,
        'slug': slug,
        'blog_url': blog_url,
        'public_ok': public_ok,
        'errors': errors,
        'aihot_sections': [s.get('label') for s in raw_daily.get('sections', [])],
        'raw_item_count': len(raw_items),
        'stage0_count': stage0_count,
        'final_item_count': len(final_items),
        'has_juya': any(i.get('source_group') == '橘鸦AI早报' for i in raw_items),
        'source_counts': source_counts,
        'featured_titles': digest.get('featured_titles', []),
    }, ensure_ascii=False, indent=2), encoding='utf-8')

    print(msg)
|
||
|
||
|
||
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|