#!/usr/bin/env python3
"""Daily AI news digest pipeline.

Collects items from the AI HOT daily API, a few RSS feeds and the 橘鸦 daily
issue, de-duplicates them (script pass, then LLM pass), asks an LLM to rewrite
and classify them, renders a Markdown post and publishes it through the blog
service API. Intermediate artefacts are written to ``OUT_DIR``.
"""
import difflib
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path
from urllib.parse import urlparse

UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
CST = timezone(timedelta(hours=8))
NOW = datetime.now(CST)
TODAY = NOW.date().isoformat()
# RSS entries older than this are ignored.
SINCE = NOW - timedelta(hours=30)
SCRIPT_DIR = Path.home() / '.hermes' / 'scripts'
OUT_DIR = SCRIPT_DIR / 'ai_morning_out'
OUT_DIR.mkdir(parents=True, exist_ok=True)

RSS_FEEDS = {
    'InfoQ AI': 'https://feed.infoq.com/ai-ml-data-eng/',
    'MIT科技评论AI': 'https://www.technologyreview.com/topic/artificial-intelligence/feed',
    '量子位': 'https://www.qbitai.com/feed',
}
JUYA_RSS = 'https://imjuya.github.io/juya-ai-daily/rss.xml'
SECTION_ORDER = ['模型发布/更新', '产品与工具', '开发与工程', '行业与公司', '论文与研究', '人物与花絮', '观点与教程']

# Host (without leading "www.") -> display name used in source labels.
_HOST_LABELS = {
    'x.com': 'X',
    'twitter.com': 'X',
    'github.com': 'GitHub',
    'github.blog': 'GitHub Blog',
    'openrouter.ai': 'OpenRouter',
    'anthropic.com': 'Anthropic',
    'cursor.com': 'Cursor',
    'technologyreview.com': 'MIT科技评论AI',
    'the-decoder.com': 'The Decoder',
    'xiaohongshu.com': '小红书',
    'mp.weixin.qq.com': '微信文章',
    'qbitai.com': '量子位',
    'ithome.com': 'IT之家',
    'browse.sh': 'Browse.sh',
    'huggingface.co': 'Hugging Face',
    'openai.com': 'OpenAI',
    'claude.com': 'Claude',
    'theverge.com': 'The Verge',
    'infoq.com': 'InfoQ',
    'research.google': 'Google Research',
    'simonwillison.net': 'Simon Willison',
    'runwayml.com': 'Runway',
    'perplexity.ai': 'Perplexity',
    'venturebeat.com': 'VentureBeat',
    'arxiv.org': 'arXiv',
    'reuters.com': '路透社',
    'bloomberg.com': 'Bloomberg',
    'techcrunch.com': 'TechCrunch',
    'wired.com': 'Wired',
    'deepseek.com': 'DeepSeek',
    'baidu.com': '百度',
    'alibaba.com': '阿里',
}

# First path segments on x.com/twitter.com that are not user names.
_X_RESERVED_PATHS = ('i', 'search', 'explore', 'settings', 'notifications', 'home', 'compose')

_JUYA_SELF_MARK = 'imjuya.github.io/juya-ai-daily'

# NOTE(review): the HTML tag literals of the three patterns below were lost
# when this file was extracted (only the residue `]*>` and nameless `(?P`
# groups survived). The group names `title_url`, `title_html` and `body` are
# taken from how the matches are consumed; the tag names (<article>, <h2>, the
# wrapper around "#N", the "提示" footer) are a best-effort reconstruction and
# must be confirmed against a live issue page before relying on them.
_JUYA_ARTICLE_RE = re.compile(r'<article[^>]*>(.*?)</article>', re.S | re.I)
_JUYA_BLOCK_RE = re.compile(
    r'<h2[^>]*>\s*'
    r'(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?'
    r'(?P<title_html>[^<]*?)(?:</a>)?\s*'
    r'(?:<[a-z]+[^>]*>\s*)?#(?P<num>\d+)\s*(?:</[a-z0-9]+>\s*)*'
    r'(?P<body>.*?)'
    r'(?=<h2|<hr[^>]*>\s*(?:<[a-z]+[^>]*>\s*)*提示|$)',
    re.S | re.I,
)
_JUYA_HREF_RE = re.compile(r'<a[^>]*href="([^"]+)"[^>]*>', re.I)

# 【marker】 -> guide item type. Markers not listed here are dropped.
_GUIDE_TYPES = {'主线': 'theme', '强信号': 'strong', '中信号': 'medium', '待验证': 'risk'}


# ─── Data collection ────────────────────────────────────────────────────────

def fetch_text(url: str, timeout: int = 25) -> str:
    """GET *url* with the browser User-Agent and return the body as text.

    The body is decoded as UTF-8 with undecodable bytes dropped. Network and
    HTTP errors from ``urllib`` propagate to the caller.
    """
    req = urllib.request.Request(url, headers={'User-Agent': UA})
    with urllib.request.urlopen(req, timeout=timeout) as r:
        return r.read().decode('utf-8', 'ignore')


def parse_pubdate(text: str):
    """Parse an RFC 2822 date string into a CST-aware datetime, or None.

    Naive results are assumed to be UTC before conversion.
    """
    if not text:
        return None
    try:
        dt = parsedate_to_datetime(text)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt.astimezone(CST)
    except (TypeError, ValueError, IndexError, OverflowError):
        # Unparseable or out-of-range date: treat as "unknown".
        return None


def clean_text(s: str) -> str:
    """Strip tags, decode ``&nbsp;``/``&amp;`` and collapse whitespace."""
    s = re.sub(r'<[^>]+>', ' ', s or '')
    s = s.replace('&nbsp;', ' ').replace('&amp;', '&')
    return re.sub(r'\s+', ' ', s).strip()


def _bare_host(url: str) -> str:
    """Return the lower-cased network location of *url* without a leading ``www.``."""
    host = (urlparse(url).netloc or '').lower()
    return host[4:] if host.startswith('www.') else host


def source_name_from_url(url: str, fallback: str = '来源') -> str:
    """Map a URL to a human-readable source name.

    Known hosts (and their subdomains) map to a fixed label; unknown hosts are
    returned as-is; an empty URL or host yields *fallback*.
    """
    if not url:
        return fallback
    host = _bare_host(url)
    for domain, name in _HOST_LABELS.items():
        if host == domain or host.endswith('.' + domain):
            return name
    return host or fallback


def x_username_from_url(url: str) -> str:
    """Extract X/Twitter username from URL like https://x.com/OpenAIDevs/status/..."""
    if not url:
        return ''
    if _bare_host(url) not in ('x.com', 'twitter.com'):
        return ''
    parts = [p for p in urlparse(url).path.split('/') if p]
    if parts and parts[0] not in _X_RESERVED_PATHS:
        return parts[0]
    return ''


def smart_source_label(url: str, api_source_name: str = '') -> str:
    """Generate a descriptive source label from URL, preferring specific names
    over generic API labels.

    Order: ``X:@user`` for X/Twitter links, then the host-derived name (with a
    ``:Blog`` or ``:官网动态`` suffix for blog/research and landing pages), then
    the API-supplied name, then ``'AI HOT'``.
    """
    x_user = x_username_from_url(url)
    if x_user:
        return f'X:@{x_user}'
    url_name = source_name_from_url(url, '')
    if url_name and url_name != '来源':
        parsed = urlparse(url)
        host = (parsed.netloc or '').lower()
        path = (parsed.path or '').lower()
        if 'blog' in host or '/blog' in path or '/research' in path:
            return f'{url_name}:Blog'
        if '/index' in path or path.rstrip('/') in ('', '/about', '/products'):
            return f'{url_name}:官网动态'
        return url_name
    # Both branches of the former generic-label check returned the API name,
    # so the check is collapsed here without changing the result.
    return api_source_name or 'AI HOT'


def _aihot_record(entry: dict, published_at, origin_type: str, section_hint: str) -> dict:
    """Build one raw item dict from an AI HOT section item or flash."""
    item_url = (entry.get('sourceUrl') or '').strip()
    api_src = clean_text(entry.get('sourceName', '')) or ''
    return {
        'source_group': 'AI HOT',
        'source_label': smart_source_label(item_url, api_src),
        'title_raw': clean_text(entry.get('title', '')),
        'summary_raw': clean_text(entry.get('summary', '')),
        'url': item_url,
        'published_at': published_at,
        'origin_type': origin_type,
        'section_hint': section_hint,
        'language_hint': 'zh',
    }


def parse_aihot(today: str):
    """Fetch the AI HOT daily JSON for *today*.

    Returns ``(items, data)`` where *items* are raw item dicts built from the
    ``sections[].items`` and ``flashes`` arrays and *data* is the decoded
    response. HTTP errors (including 404 when the issue is not out yet)
    propagate.
    """
    url = f'https://aihot.virxact.com/api/public/daily/{today}'
    data = json.loads(fetch_text(url))
    generated = data.get('generatedAt')
    items = []
    for sec in data.get('sections', []):
        for it in sec.get('items', []):
            items.append(_aihot_record(it, generated, 'aihot_json', sec.get('label') or ''))
    for flash in data.get('flashes', []) or []:
        items.append(_aihot_record(flash, generated, 'aihot_flash', '快讯'))
    return items, data


def parse_rss(name: str, url: str):
    """Fetch an RSS 2.0 feed and return raw items for entries newer than SINCE.

    Only the first 20 ``channel/item`` entries are considered. Entries with no
    parseable date are kept; entries without a title are skipped.
    """
    root = ET.fromstring(fetch_text(url))
    channel = root.find('channel')
    entries = channel.findall('item') if channel is not None else []
    out = []
    for it in entries[:20]:
        pub = parse_pubdate(it.findtext('pubDate') or '')
        if pub and pub < SINCE:
            continue
        title = clean_text(it.findtext('title') or '')
        if not title:
            continue
        summary = clean_text(it.findtext('description') or '')
        text = title + ' ' + summary
        # Crude language guess: more Latin letters than CJK ideographs => English.
        latin = len(re.findall(r'[A-Za-z]', text))
        cjk = len(re.findall(r'[\u4e00-\u9fff]', text))
        out.append({
            'source_group': name,
            'source_label': name,
            'title_raw': title,
            'summary_raw': summary,
            'url': (it.findtext('link') or '').strip(),
            'published_at': pub.isoformat() if pub else None,
            'origin_type': 'rss',
            'section_hint': '',
            'language_hint': 'en' if latin > cjk else 'zh',
        })
    return out


def fetch_juya_rss(today: str):
    """Fetch 橘鸦 RSS and return (target_url, pub_date, html_content).

    The entry whose title equals *today* is selected. html_content is taken
    from content:encoded if available, else None. Uses a longer timeout (45s)
    than the other fetches.
    """
    root = ET.fromstring(fetch_text(JUYA_RSS, timeout=45))
    channel = root.find('channel')
    entries = channel.findall('item') if channel is not None else []
    ns = {'content': 'http://purl.org/rss/1.0/modules/content/'}
    for it in entries:
        if (it.findtext('title') or '').strip() != today:
            continue
        target = (it.findtext('link') or '').strip()
        pub = parse_pubdate(it.findtext('pubDate') or '')
        # Prefer the embedded HTML so no second HTTP request is needed.
        content_el = it.find('content:encoded', ns)
        html_content = content_el.text if content_el is not None and content_el.text else None
        return target, pub, html_content
    return None, None, None


def _juya_body_summary(body_html: str, title: str) -> str:
    """Turn one issue block's HTML body into a short plain-text summary.

    NOTE(review): the original tag-specific substitutions were lost in
    extraction; this keeps their visible intent (line breaks at block
    boundaries, anchors removed, remaining tags stripped) — confirm against
    real issue markup.
    """
    text = re.sub(r'<br[^>]*>|</p>|</li>|</blockquote>|</h[1-6]>|</div>', '\n', body_html, flags=re.I)
    text = re.sub(r'<a\s[^>]*>.*?</a>', ' ', text, flags=re.S | re.I)
    text = re.sub(r'<[^>]+>', ' ', text)
    picked = []
    for raw_line in text.split('\n'):
        line = clean_text(raw_line)
        if not line:
            continue
        if line.startswith('相关链接'):
            break  # everything after this is the link list
        if line != title:
            picked.append(line)
    return ' '.join(picked[:4]).strip()


def parse_juya(today: str):
    """Return raw items parsed from today's 橘鸦 issue, or [] if unavailable.

    Uses the RSS ``content:encoded`` HTML when present and otherwise fetches
    the article page. Each numbered heading block becomes one item whose URL
    is the first external link in the block body, else the heading link, else
    the issue URL.
    """
    target, pub, html_content = fetch_juya_rss(today)
    if not target:
        return []
    if html_content is None:
        try:
            page = fetch_text(target, timeout=45)
        except Exception as e:  # best-effort source: degrade to "no items"
            print(f'parse_juya: article fetch failed: {type(e).__name__}: {e}', file=sys.stderr)
            return []
        m = _JUYA_ARTICLE_RE.search(page)
        if not m:
            return []
        article_html = m.group(1)
    else:
        article_html = html_content

    results = []
    for m in _JUYA_BLOCK_RE.finditer(article_html):
        title = clean_text(re.sub(r'<[^>]+>', ' ', m.group('title_html') or ''))
        if not title:
            continue
        title_url = (m.group('title_url') or '').strip()
        body_html = m.group('body') or ''

        external_links = []
        for link in _JUYA_HREF_RE.findall(body_html):
            link = link.replace('&amp;', '&').strip()
            if not link or _JUYA_SELF_MARK in link:
                continue
            if link not in external_links:
                external_links.append(link)
        if external_links:
            url = external_links[0]
        elif title_url and _JUYA_SELF_MARK not in title_url:
            url = title_url
        else:
            url = target

        is_external = bool(url) and _JUYA_SELF_MARK not in url
        results.append({
            'source_group': '橘鸦AI早报',
            'source_label': source_name_from_url(url, '橘鸦AI早报') if is_external else '橘鸦AI早报',
            'title_raw': title,
            'summary_raw': _juya_body_summary(body_html, title),
            'url': url,
            'published_at': pub.isoformat() if pub else None,
            'origin_type': 'juya_issue',
            'section_hint': '',
            'language_hint': 'zh',
        })
    return results


# ─── LLM infrastructure ─────────────────────────────────────────────────────

def load_env():
    """Return settings from ``~/.hermes/.env`` overlaid with the process environment.

    Non-empty process environment variables win over file entries. One pair
    of matching surrounding quotes is removed from file values.
    """
    env = {}
    env_path = Path.home() / '.hermes' / '.env'
    if env_path.exists():
        for line in env_path.read_text(encoding='utf-8', errors='ignore').splitlines():
            if '=' not in line or line.strip().startswith('#'):
                continue
            k, v = line.split('=', 1)
            v = v.strip()
            if len(v) >= 2 and v[0] == v[-1] and v[0] in '"\'':
                v = v[1:-1]
            env[k.strip()] = v
    env.update({k: v for k, v in os.environ.items() if v})
    return env


def _pick_pool_credential(pool: dict, pool_keys, env: dict, api_key: str, base_url: str, model_name: str):
    """Fill missing api_key/base_url/model from the first non-empty pool entry.

    Only the first credential of the first key in *pool_keys* that has any
    credentials is used. Returns ``(api_key, base_url, model_name, matched)``.
    """
    for pkey in pool_keys:
        creds = pool.get(pkey) or []
        if not creds:
            continue
        cred = creds[0]
        source = cred.get('source') or ''
        if source.startswith('env:'):
            # The credential points at an environment variable holding the key.
            api_key = env.get(source[4:], '') or api_key
        if not api_key:
            api_key = cred.get('access_token', '') or api_key
        if not base_url:
            base_url = (cred.get('base_url') or '').rstrip('/')
        if not model_name:
            model_name = cred.get('model', '') or model_name
        return api_key, base_url, model_name, True
    return api_key, base_url, model_name, False


def resolve_llm_config(env: dict):
    """Read Hermes config to get the active provider's API key, base_url, and model.

    Priority:
      1) Explicit environment overrides for this pipeline
         (LLM_* / SUB2API_* / XIAOMI_* / XIAOMI_MIMO_*)
      2) Hermes model config (config.yaml)
      3) auth.json credential pool
      4) Legacy env fallbacks, then the fallback provider from auth.json

    Returns ``(api_key, base_url, model_name)``.
    Raises RuntimeError when no API key or no base URL can be resolved.
    """
    hermes_dir = Path.home() / '.hermes'

    def first_env(*names: str) -> str:
        for name in names:
            val = (env.get(name) or '').strip()
            if val:
                return val
        return ''

    cfg = {}
    cfg_path = hermes_dir / 'config.yaml'
    if cfg_path.exists():
        # PyYAML is third-party; import it only when there is a file to parse.
        import yaml
        with open(cfg_path, encoding='utf-8') as f:
            cfg = yaml.safe_load(f) or {}
    model_cfg = cfg.get('model', {}) or {}
    provider = (model_cfg.get('provider') or '').strip()
    base_url = (model_cfg.get('base_url') or '').rstrip('/')
    model_name = (model_cfg.get('default') or '').strip()

    # 1) Explicit overrides take precedence, but endpoint/key/model are kept
    # from the same provider family. Mixing SUB2API_API_KEY with
    # XIAOMI_BASE_URL causes 401 after switching Hermes to a Sub2API model.
    explicit_api_key = first_env('LLM_API_KEY')
    explicit_base_url = first_env('LLM_BASE_URL')
    explicit_model = first_env('LLM_MODEL')
    if not explicit_api_key:
        if provider == 'sub2api' or first_env('SUB2API_API_KEY', 'SUB2API_BASE_URL', 'SUB2API_MODEL'):
            explicit_api_key = first_env('SUB2API_API_KEY')
            explicit_base_url = first_env('SUB2API_BASE_URL') or base_url
            explicit_model = first_env('SUB2API_MODEL') or model_name
        elif first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'XIAOMI_BASE_URL',
                       'XIAOMI_MIMO_BASE_URL', 'XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL'):
            explicit_api_key = first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY')
            explicit_base_url = first_env('XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL')
            explicit_model = first_env('XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL')
    if explicit_base_url:
        base_url = explicit_base_url.rstrip('/')
    if explicit_model:
        model_name = explicit_model

    # 2) Provider definition from config.yaml.
    provider_def = (cfg.get('providers', {}) or {}).get(provider, {}) or {}
    if not base_url and provider_def.get('base_url'):
        base_url = str(provider_def.get('base_url')).rstrip('/')
    if not explicit_api_key and provider_def.get('key_env'):
        explicit_api_key = first_env(str(provider_def.get('key_env')))

    # If the active provider has no credentials, a known-good provider from
    # auth.json is used so the daily cron keeps publishing.
    fallback_provider = first_env('LLM_FALLBACK_PROVIDER', 'XIAOMI_FALLBACK_PROVIDER') or 'openrouter'
    api_key = explicit_api_key
    auth_path = hermes_dir / 'auth.json'
    pool_cache = {}

    def credential_pool() -> dict:
        # auth.json is read at most once, and only when actually needed.
        if 'pool' not in pool_cache:
            auth = json.loads(auth_path.read_text(encoding='utf-8'))
            pool_cache['pool'] = auth.get('credential_pool', {}) or {}
        return pool_cache['pool']

    # 3) auth.json credential pool for the active provider and known aliases.
    if not api_key and auth_path.exists():
        provider_keys = []
        if provider:
            provider_keys.extend([provider, provider.replace('-', '_')])
        provider_keys.extend(['sub2api', 'xiaomi', 'xiaomi_mimo', 'sensenova'])
        api_key, base_url, model_name, _ = _pick_pool_credential(
            credential_pool(), provider_keys, env, api_key, base_url, model_name)

    # 4) Legacy env fallbacks.
    if not api_key:
        api_key = first_env('LLM_API_KEY', 'XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'OPENROUTER_API_KEY')
    if not base_url:
        base_url = first_env('LLM_BASE_URL', 'XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL',
                             'OPENROUTER_BASE_URL').rstrip('/')
    if not model_name:
        model_name = first_env('LLM_MODEL') or 'mimo-v2.5-pro'

    if not api_key and fallback_provider and auth_path.exists():
        fallback_keys = [fallback_provider, fallback_provider.replace('-', '_')]
        api_key, base_url, model_name, matched = _pick_pool_credential(
            credential_pool(), fallback_keys, env, api_key, base_url, model_name)
        if matched:
            provider = fallback_provider

    if not api_key:
        raise RuntimeError(
            f'No API key found for provider "{provider}" or fallback "{fallback_provider}". '
            'Set SUB2API_API_KEY / XIAOMI_API_KEY / LLM_API_KEY or fix ~/.hermes/auth.json'
        )
    if not base_url:
        raise RuntimeError(
            f'No base_url found for provider "{provider}" or fallback "{fallback_provider}". '
            'Set SUB2API_BASE_URL / XIAOMI_BASE_URL / LLM_BASE_URL or fix ~/.hermes/auth.json'
        )
    return api_key, base_url, model_name


def _chat_payload(model: str, prompt_text: str) -> bytes:
    """Encode the single-user-message chat completion request body."""
    return json.dumps({
        'model': model,
        'messages': [{'role': 'user', 'content': prompt_text}],
        'temperature': 0.2,
        'max_tokens': 8000,
    }, ensure_ascii=False).encode('utf-8')


def _try_llm_request(base_url: str, api_key: str, model: str, prompt_text: str,
                     auth_mode: str, api_key_header: str = 'Authorization'):
    """POST one chat completion with a configurable auth header and return the reply text.

    With the ``Authorization`` header the key is sent as ``Bearer <key>`` when
    *auth_mode* is ``'bearer'`` and raw otherwise; with any other header name
    the raw key is sent. Not called by the current pipeline (``llm_call`` uses
    a single fixed path).
    """
    headers = {'Content-Type': 'application/json'}
    if api_key_header == 'Authorization' and auth_mode == 'bearer':
        headers[api_key_header] = f'Bearer {api_key}'
    else:
        headers[api_key_header] = api_key
    req = urllib.request.Request(f'{base_url}/chat/completions',
                                 data=_chat_payload(model, prompt_text), headers=headers)
    with urllib.request.urlopen(req, timeout=600) as r:
        resp = json.loads(r.read().decode('utf-8'))
    return resp['choices'][0]['message']['content'].strip()


def llm_call(prompt_text: str, env: dict) -> str:
    """Send *prompt_text* to the configured chat completions endpoint.

    Returns the stripped content of the first choice. HTTP errors are logged
    to stderr (status, endpoint, model, first 500 body chars) and re-raised.
    """
    api_key, base_url, model = resolve_llm_config(env)
    # A single, explicit path so cron behaviour is easy to debug; the earlier
    # auth-matrix/fallback logic made failures harder to reason about.
    req = urllib.request.Request(
        f'{base_url}/chat/completions',
        data=_chat_payload(model, prompt_text),
        headers={'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'},
    )
    print(f'llm_call request: base_url={base_url}; model={model}', file=sys.stderr)
    try:
        with urllib.request.urlopen(req, timeout=600) as r:
            resp = json.loads(r.read().decode('utf-8'))
        return resp['choices'][0]['message']['content'].strip()
    except urllib.error.HTTPError as e:
        body = ''
        try:
            body = e.read().decode('utf-8', 'ignore')
        except Exception:
            pass  # the error body is diagnostic only
        print(f'llm_call failed: HTTP {e.code} {e.reason}; base_url={base_url}; '
              f'model={model}; body={body[:500]}', file=sys.stderr)
        raise


def _parse_json_from_llm(text: str):
    """Strip markdown code blocks and extract a JSON object from LLM output.

    Trailing commas before ``}``/``]`` are removed before decoding.
    Raises ValueError when no object is found or the JSON is invalid.
    """
    text = re.sub(r'^```(?:json)?\s*\n?', '', text)
    text = re.sub(r'\n?```\s*$', '', text)
    text = text.strip()
    m = re.search(r'\{.*\}\s*$', text, re.S)
    if not m:
        raise ValueError('LLM 输出中未找到 JSON 对象')
    raw_json = re.sub(r',\s*([}\]])', r'\1', m.group(0))
    return json.loads(raw_json)


def _normalize_title(title: str) -> str:
    """Normalize a title for dedup comparison: lowercase, keep word characters only."""
    return re.sub(r'[^\w\u4e00-\u9fff]+', '', (title or '').lower())


def _index_items(items: list, summary_limit: int) -> list:
    """Build the compact index/title/summary list embedded in LLM prompts."""
    return [
        {
            'index': i,
            'title': (item.get('title_raw') or '')[:80],
            'summary': (item.get('summary_raw') or '')[:summary_limit],
        }
        for i, item in enumerate(items)
    ]


# ─── Stage 0: Script dedup (no LLM) ────────────────────────────────────────

def stage0_script_dedup(raw_items: list) -> list:
    """Deduplicate using difflib.SequenceMatcher on normalized titles.

    Similarity > 0.7 means same event; the item with the longer summary is
    kept in the position of the first occurrence. Items whose normalized
    title is shorter than 3 characters are dropped.
    """
    if not raw_items:
        return []
    kept = []  # (normalized_title, item)
    for item in raw_items:
        norm = _normalize_title(item.get('title_raw', ''))
        if len(norm) < 3:
            continue
        for pos, (kept_norm, kept_item) in enumerate(kept):
            if difflib.SequenceMatcher(None, norm, kept_norm).ratio() > 0.7:
                if len(item.get('summary_raw') or '') > len(kept_item.get('summary_raw') or ''):
                    kept[pos] = (norm, item)
                break
        else:
            kept.append((norm, item))
    return [item for _, item in kept]


# ─── Stage 1: LLM semantic dedup ───────────────────────────────────────────

def stage1_llm_dedup(items: list, env: dict):
    """Use LLM to identify semantic duplicates. Returns (filtered_items, error).

    On any failure the input list is returned unchanged together with an
    error string.
    """
    if not items:
        return items, None
    indexed = _index_items(items, 120)
    prompt = (
        '以下是AI领域的新闻条目。有些条目虽然措辞不同,但描述的是同一个事件。'
        '请识别重复项,输出要保留的条目索引列表。只有描述完全相同的具体事件才视为重复。\n\n'
        f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
        '请严格按以下JSON格式输出,不要包含任何其他内容:\n'
        '{"keep_indices": [0, 1, 3, 5]}'
    )
    try:
        obj = _parse_json_from_llm(llm_call(prompt, env))
        indices = obj.get('keep_indices', [])
        if not isinstance(indices, list):
            raise ValueError('keep_indices is not a list')
        # bool is an int subclass; exclude it so true/false are not read as 1/0.
        valid = sorted({
            i for i in indices
            if isinstance(i, int) and not isinstance(i, bool) and 0 <= i < len(items)
        })
        if not valid:
            raise ValueError('No valid indices in keep_indices')
        return [items[i] for i in valid], None
    except Exception as e:
        err = f'stage1_llm_dedup failed: {type(e).__name__}: {e}'
        print(err, file=sys.stderr)
        return items, err  # fallback: all items unchanged


# ─── Stage 2a: LLM summary rewrite ─────────────────────────────────────────

def stage2a_rewrite_summaries(items: list, env: dict):
    """Rewrite titles and summaries in concise Chinese. Returns (updated_items, error).

    Entries the LLM omits, or returns malformed, keep their original text. On
    failure of the call itself the input list is returned unchanged with an
    error string.
    """
    if not items:
        return items, None
    indexed = _index_items(items, 200)
    prompt = (
        '请将以下新闻条目的标题和摘要改写为简洁中文。'
        '标题:英文品牌名/模型名保留原样(如GPT-5、Codex),其余翻译为中文。'
        '摘要:每条最多120字,保留核心事实。\n\n'
        f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
        '请严格按以下JSON格式输出:\n'
        '{"summaries": [{"index": 0, "title": "中文标题", "summary": "改写后的摘要"}, ...]}'
    )
    try:
        obj = _parse_json_from_llm(llm_call(prompt, env))
        summaries = obj.get('summaries', [])
        if not isinstance(summaries, list):
            raise ValueError('summaries is not a list')
        result = [dict(item) for item in items]  # shallow copies; inputs untouched
        for entry in summaries:
            if not isinstance(entry, dict):
                continue  # one malformed entry must not discard the whole batch
            idx = entry.get('index')
            if not isinstance(idx, int) or isinstance(idx, bool) or not 0 <= idx < len(result):
                continue
            if entry.get('title'):
                result[idx]['title_raw'] = entry['title']
            if entry.get('summary'):
                result[idx]['summary_raw'] = entry['summary']
        return result, None
    except Exception as e:
        err = f'stage2a_rewrite_summaries failed: {type(e).__name__}: {e}'
        print(err, file=sys.stderr)
        return items, err  # fallback: items unchanged


# ─── Stage 2b: LLM classify ────────────────────────────────────────────────

def stage2b_classify(items: list, env: dict):
    """Classify each item into a section. Returns (updated_items, error).

    Only section names present in SECTION_ORDER are applied. On failure the
    input list is returned unchanged with an error string.
    """
    if not items:
        return items, None
    indexed = _index_items(items, 120)
    sections_str = '、'.join(SECTION_ORDER)
    prompt = (
        f'请将以下AI新闻条目分类到对应板块。\n'
        f'可选板块:{sections_str}\n\n'
        f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
        '请严格按以下JSON格式输出:\n'
        '{"sections": [{"index": 0, "section": "模型发布/更新"}, ...]}'
    )
    try:
        obj = _parse_json_from_llm(llm_call(prompt, env))
        sections = obj.get('sections', [])
        if not isinstance(sections, list):
            raise ValueError('sections is not a list')
        result = [dict(item) for item in items]  # shallow copies; inputs untouched
        for entry in sections:
            if not isinstance(entry, dict):
                continue
            idx = entry.get('index')
            sec = entry.get('section', '')
            if not isinstance(idx, int) or isinstance(idx, bool) or not 0 <= idx < len(result):
                continue
            if sec in SECTION_ORDER:
                result[idx]['section_hint'] = sec
        return result, None
    except Exception as e:
        err = f'stage2b_classify failed: {type(e).__name__}: {e}'
        print(err, file=sys.stderr)
        return items, err  # fallback: items unchanged


# ─── Stage 2 parallel execution ────────────────────────────────────────────

def stage2_parallel(items: list, env: dict):
    """Run stage2a (summary rewrite) and stage2b (classify) in parallel.

    Returns (merged_items, errors_list): text fields come from stage2a and
    ``section_hint`` from stage2b wherever it produced one.
    """
    errors = []
    summaries_result = items
    classify_result = items
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_summaries = executor.submit(stage2a_rewrite_summaries, items, env)
        future_classify = executor.submit(stage2b_classify, items, env)
        try:
            summaries_result, err = future_summaries.result()
            if err:
                errors.append(err)
        except Exception as e:
            errors.append(f'stage2a exception: {type(e).__name__}: {e}')
        try:
            classify_result, err = future_classify.result()
            if err:
                errors.append(err)
        except Exception as e:
            errors.append(f'stage2b exception: {type(e).__name__}: {e}')

    merged = []
    for i, original in enumerate(items):
        new_item = dict(summaries_result[i]) if i < len(summaries_result) else dict(original)
        if i < len(classify_result) and classify_result[i].get('section_hint'):
            new_item['section_hint'] = classify_result[i]['section_hint']
        merged.append(new_item)
    return merged, errors


# ─── Stage 3: LLM guide/observation ────────────────────────────────────────

def llm_generate_guide(items, today: str, env: dict) -> str:
    """Generate editorial judgment section: main theme + signals + risk.

    *items* are final items (``title``/``summary``/``section``/``source``).
    Returns the guide text with code fences and surrounding quotes removed, or
    ``''`` when the LLM call fails (the failure is logged to stderr).
    """
    indexed = [
        {
            'n': n,
            'title': item['title'],
            'summary': item['summary'][:100],
            'section': item['section'],
            'source': item.get('source', ''),
        }
        for n, item in enumerate(items, 1)
    ]
    prompt = {
        'date': today,
        'task': (
            '你是AI行业编辑。根据以下已经分类和摘要改写好的条目,写「今日观察」。\n\n'
            '格式要求:\n'
            '【主线】blockquote格式,一句话概括今天最值得关注的趋势(不要套话,要具体)\n'
            '【强信号】2-3条,每条格式:编号. 标题(一句话)+ 一两句说明为什么重要\n'
            '【中信号】1-2条,格式同上\n'
            '【待验证】1-2条,格式同上,说明为什么存疑\n\n'
            '写作要求:\n'
            '- 不要空泛总结(如"行业焦点转向XX"),要指向具体事件\n'
            '- 不要引用编号如[1][3],读者看不到对应关系\n'
            '- 不要建议("开发者应该..."之类删掉)\n'
            '- 每条控制在2-3句话以内\n'
            '- 用大白话,不要学术腔\n'
        ),
        'items': indexed,
        'rule': '只输出观察文本,不要代码块、不要JSON。严格使用【主线】【强信号】【中信号】【待验证】四个标记。'
    }
    try:
        text = llm_call(json.dumps(prompt, ensure_ascii=False), env)
    except Exception as e:
        # The guide is optional; publish without it but leave a trace.
        print(f'llm_generate_guide failed: {type(e).__name__}: {e}', file=sys.stderr)
        return ''
    text = re.sub(r'^```(?:\w+)?\s*\n?', '', text)
    text = re.sub(r'\n?```\s*$', '', text)
    return text.strip().strip('"').strip("'")


# ─── Rendering helpers ─────────────────────────────────────────────────────

def _parse_guide_sections(guide: str):
    """Parse guide text into ``{marker: content}`` by its 【marker】 headings.

    A repeated marker keeps its last content.
    """
    parts = re.split(r'【(主线|强信号|中信号|待验证|建议)】', guide)
    sections = {}
    # With one capture group re.split yields [prefix, key, content, key, content, ...].
    i = 1
    while i < len(parts) - 1:
        sections[parts[i].strip()] = parts[i + 1].strip()
        i += 2
    return sections


def build_guide_items(guide_text: str) -> list:
    """Convert raw guide text into the ``[{'type', 'text'}]`` list used by blog_markdown.

    【主线】 becomes one ``theme`` item; every non-empty line under
    【强信号】/【中信号】/【待验证】 becomes a ``strong``/``medium``/``risk``
    item with its leading "1." style number removed. Other markers (e.g.
    【建议】, which the prompt forbids) are dropped rather than shown as theme.
    """
    structured = []
    if not guide_text:
        return structured
    for key, text in _parse_guide_sections(guide_text).items():
        guide_type = _GUIDE_TYPES.get(key)
        if guide_type is None:
            continue
        if guide_type == 'theme':
            structured.append({'type': 'theme', 'text': text})
            continue
        for raw_line in text.split('\n'):
            line = re.sub(r'^\d+[.、]\s*', '', raw_line.strip())
            if line:
                structured.append({'type': guide_type, 'text': line})
    return structured


def _make_ref_factory(items):
    """Create a [N] → link converter bound to the items list.

    NOTE(review): both branches previously returned the bare ``[N]``; the
    anchor markup appears to have been stripped from the literal. The
    ``<a href>`` form below is a reconstruction — confirm the intended markup.
    """
    def make_ref(m):
        idx = int(m.group(1))
        if 1 <= idx <= len(items):
            url = items[idx - 1].get('url', '')
            if url:
                return f'<a href="{url}">[{idx}]</a>'
            return f'[{idx}]'
        return m.group(0)
    return make_ref


def _render_guide_section(lines, title, text, items, is_quote=False):
    """Render a guide section with title on its own line, content below.

    Appends to *lines* in place; ``[N]`` references are converted via
    ``_make_ref_factory`` and literal ``[N]`` placeholders are removed.
    """
    make_ref = _make_ref_factory(items)
    lines.append(f'**{title}**')
    lines.append('')
    for gline in text.split('\n'):
        gline = gline.strip()
        if not gline:
            continue
        gline = re.sub(r'\[(\d+)\]', make_ref, gline)
        gline = re.sub(r'\[N\]', '', gline).strip()
        if not gline:
            continue
        lines.append(f'> {gline}' if is_quote else gline)
    lines.append('')


def format_source_link(item):
    """Return ``[source ↗](url)`` for an item, or just the source name without a URL."""
    source = item.get('source') or '来源'
    url = item.get('url') or ''
    return f'[{source} ↗]({url})' if url else source


def _clean_guide_text(text: str) -> str:
    """Drop [N] references and the "主线判断:" prefix, collapse whitespace."""
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\[N\]', '', text).strip()
    text = re.sub(r'^主线判断[:：]\s*', '', text)
    return re.sub(r'\s+', ' ', text).strip()


def _split_guide_title(text: str):
    """Split a signal line into (title, content) at the first colon, else first sentence."""
    m = re.search(r'^(.+?)[:：]\s*', text)
    if m and len(m.group(1)) < 60:
        return m.group(1).strip(), text[m.end():].strip()
    first = re.split(r'[。！？!?]', text)[0]
    content = text[len(first):].strip()
    if content and content[0] in '。！？!?':
        content = content[1:].strip()
    return first.strip(), content


def blog_markdown(items, guide=None):
    """Render the post body as Markdown.

    Layout: an optional "导览" block quote built from ``theme`` guide items,
    one ``##`` section per non-empty entry of SECTION_ORDER with continuously
    numbered items, and an optional "总结" list built from
    ``strong``/``medium``/``risk`` guide items. Items whose section is not in
    SECTION_ORDER are not rendered.
    """
    grouped = {k: [] for k in SECTION_ORDER}
    for item in items:
        grouped.setdefault(item['section'], []).append(item)
    guide_items = guide if isinstance(guide, list) else []
    lines = []

    # === Top: 导览 (theme only) ===
    theme_items = [g for g in guide_items if g.get('type') == 'theme']
    if theme_items:
        lines.extend(['## 导览', ''])
        for g in theme_items:
            # The prompt asks for blockquote format, so drop a leading ">"
            # before adding ours to avoid a nested quote.
            text = _clean_guide_text(g.get('text', '')).lstrip('> ').strip()
            if text:
                lines.extend([f'> {text}', ''])

    # === News sections ===
    n = 1
    for sec in SECTION_ORDER:
        sec_items = grouped.get(sec, [])
        if not sec_items:
            continue
        lines.extend([f'## {sec}', ''])
        for item in sec_items:
            summary = item['summary'].strip()
            if len(summary) > 120:
                summary = summary[:120].rstrip() + '…'
            if summary and summary[-1] not in '。！？!?…':
                summary += '。'
            lines.append(f'**{n}. {item["title"]}**')
            lines.append('')
            lines.append(f'> {summary}{format_source_link(item)}')
            lines.append('')
            n += 1

    # === Bottom: 总结 (strong/medium/risk) ===
    type_labels = {'strong': '强信号', 'medium': '中信号', 'risk': '待验证'}
    summary_items = [g for g in guide_items if g.get('type') in type_labels]
    if summary_items:
        lines.extend(['## 总结', ''])
        for t, label in type_labels.items():
            type_items = [g for g in summary_items if g.get('type') == t]
            if not type_items:
                continue
            lines.extend([f'**{label}**', ''])
            for g in type_items:
                text = _clean_guide_text(g.get('text', ''))
                if not text:
                    continue
                title, content = _split_guide_title(text)
                lines.append(f'- **{title}**')
                if content:
                    lines.append(f' {content}')
            lines.append('')
    return '\n'.join(lines).strip()


def short_summary(blog_url):
    """Return the one-line chat notification for a published post."""
    return f'AI日报已发布 👉 {blog_url}'


def blog_api_request(method, path, payload=None, token=None, base_url=None):
    """Call the blog service API and return the decoded JSON response.

    *payload*, when given, is sent as a JSON body. *base_url* is required
    despite its ``None`` default. HTTP errors propagate.
    """
    url = base_url.rstrip('/') + path
    data = None
    headers = {'Authorization': f'Bearer {token}', 'User-Agent': UA}
    if payload is not None:
        data = json.dumps(payload, ensure_ascii=False).encode('utf-8')
        headers['Content-Type'] = 'application/json'
    req = urllib.request.Request(url, data=data, headers=headers, method=method)
    with urllib.request.urlopen(req, timeout=25) as r:
        return json.loads(r.read().decode('utf-8'))


def _build_final_items(items: list) -> list:
    """Convert raw-field items into the final render format, dropping exact title repeats."""
    final_items = []
    seen_titles = set()
    for item in items:
        title = clean_text(item.get('title_raw', ''))
        if not title:
            continue
        norm = _normalize_title(title)
        if norm in seen_titles:
            continue
        seen_titles.add(norm)
        section = item.get('section_hint', '') or '行业与公司'
        if section not in SECTION_ORDER:
            section = '行业与公司'
        summary = clean_text(item.get('summary_raw', ''))[:120]
        final_items.append({
            'title': title,
            'summary': summary or '该条目暂无摘要。',
            'section': section,
            'url': item.get('url') or '',
            'source': item.get('source_label') or item.get('source_group') or '来源',
            'source_group': item.get('source_group') or '未知来源',
            'dedupe_keys': [norm],
        })
    return final_items


# ─── Main pipeline ─────────────────────────────────────────────────────────

def main():
    """Run the pipeline: collect → dedup → rewrite/classify → guide → publish.

    Exits with status 1 when the blog token is missing or draft creation
    returns no slug; returns quietly when today's AI HOT issue is not yet
    available (HTTP 404). With ``AI_DAILY_DRY_RUN`` set, publishing is skipped.
    """
    env = load_env()
    token = env.get('BLOG_SERVICE_TOKEN') or env.get('EPHRON_SERVICE_TOKEN')
    # Normalised so f'{base_url}/posts/...' never contains a double slash.
    base_url = env.get('BLOG_API_BASE_URL', 'https://blog.ephron.ren').rstrip('/')
    if not token:
        print('缺少 blog service token,已停止。')
        sys.exit(1)

    errors = []
    source_counts = {}
    raw_items = []

    # ── Collect raw items ────────────────────────────────────────────────
    try:
        aihot_items, raw_daily = parse_aihot(TODAY)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            print(f'今天({TODAY})的 AI HOT 完整日报还没有生成,暂不发布。')
            return
        raise
    raw_items.extend(aihot_items)
    source_counts['AI HOT'] = len(aihot_items)

    for name, url in RSS_FEEDS.items():
        try:
            parsed = parse_rss(name, url)
            raw_items.extend(parsed)
            source_counts[name] = len(parsed)
        except Exception as e:  # supplementary source: record and continue
            errors.append(f'{name}: {type(e).__name__}')
            source_counts[name] = 0

    juya_items = []
    try:
        juya_items = parse_juya(TODAY)
    except Exception as e:
        errors.append(f'橘鸦AI早报: {type(e).__name__}')
    # If juya returned nothing, wait 2 minutes and retry once.
    if not juya_items:
        print('橘鸦AI早报尚未就绪,等待 2 分钟后重试...')
        time.sleep(120)
        try:
            juya_items = parse_juya(TODAY)
        except Exception as e:
            errors.append(f'橘鸦AI早报(重试): {type(e).__name__}')
    raw_items.extend(juya_items)
    source_counts['橘鸦AI早报'] = len(juya_items)

    (OUT_DIR / 'raw_items.json').write_text(
        json.dumps(raw_items, ensure_ascii=False, indent=2), encoding='utf-8')

    # ── Stage 0: Script dedup ────────────────────────────────────────────
    print(f'Stage 0: Script dedup — {len(raw_items)} raw items')
    items = stage0_script_dedup(raw_items)
    stage0_count = len(items)
    print(f'Stage 0 done — {stage0_count} unique items')

    # ── Stage 1: LLM semantic dedup ──────────────────────────────────────
    print('Stage 1: LLM semantic dedup')
    items, stage1_err = stage1_llm_dedup(items, env)
    if stage1_err:
        errors.append(stage1_err)
    print(f'Stage 1 done — {len(items)} items')

    # ── Stage 2: Parallel summary rewrite + classify ─────────────────────
    print('Stage 2: Parallel summary rewrite + classify')
    items, stage2_errs = stage2_parallel(items, env)
    errors.extend(stage2_errs)
    print(f'Stage 2 done — {len(items)} items')

    # Items still carry raw fields here; convert to the final format.
    final_items = _build_final_items(items)

    # ── Stage 3: LLM guide/observation ───────────────────────────────────
    print('Stage 3: LLM guide generation')
    guide_text = llm_generate_guide(final_items, TODAY, env)
    guide_structured = build_guide_items(guide_text)

    # ── Stage 4: Assemble and publish ────────────────────────────────────
    print('Stage 4: Assemble and publish')
    md = blog_markdown(final_items, guide_structured)
    payload = {
        'title': f'AI日报 · {TODAY}',
        'content': md,
        'tags': ['AI日报', 'AI资讯', '人工智能'],
    }

    dry_run = (env.get('AI_DAILY_DRY_RUN') or '').strip().lower() in ('1', 'true', 'yes')
    if dry_run:
        slug = f'dry-run-{TODAY}'
        blog_url = f'{base_url}/posts/{slug}'
        public_ok = True
        print('AI_DAILY_DRY_RUN=1:已完成组装验证,跳过博客创建/发布。')
    else:
        create_resp = blog_api_request('POST', '/api/service/posts', payload=payload,
                                       token=token, base_url=base_url)
        slug = create_resp.get('slug')
        if not slug:
            print('Blog 草稿创建失败:未返回 slug')
            sys.exit(1)
        blog_api_request('POST', f'/api/service/posts/{slug}/publish', token=token, base_url=base_url)
        blog_url = f'{base_url}/posts/{slug}'
        # Verify that the public page actually serves before announcing it.
        public_ok = False
        try:
            req = urllib.request.Request(blog_url, headers={'User-Agent': UA})
            with urllib.request.urlopen(req, timeout=20) as r:
                public_ok = getattr(r, 'status', None) == 200
        except Exception:
            public_ok = False

    msg = short_summary(blog_url)
    if errors:
        msg += '\n\n注:部分补充源本次采集失败或LLM阶段出错,已自动降级:' + ';'.join(errors)
    if not public_ok:
        msg += '\n\n警告:blog 草稿/发布接口已返回成功,但公开链接暂未验证为 200,请人工复核。'

    digest = {
        'items': final_items,
        'featured_titles': [i['title'] for i in final_items[:6]],
        'guide': guide_structured,
    }
    (OUT_DIR / 'llm_digest.json').write_text(
        json.dumps(digest, ensure_ascii=False, indent=2), encoding='utf-8')
    (OUT_DIR / 'blog_markdown.md').write_text(md, encoding='utf-8')
    (OUT_DIR / 'chat_summary.txt').write_text(msg, encoding='utf-8')
    (OUT_DIR / 'run_meta.json').write_text(json.dumps({
        'date': TODAY,
        'slug': slug,
        'blog_url': blog_url,
        'public_ok': public_ok,
        'errors': errors,
        'aihot_sections': [s.get('label') for s in raw_daily.get('sections', [])],
        'raw_item_count': len(raw_items),
        'stage0_count': stage0_count,
        'final_item_count': len(final_items),
        'has_juya': any(i.get('source_group') == '橘鸦AI早报' for i in raw_items),
        'source_counts': source_counts,
        'featured_titles': digest.get('featured_titles', []),
    }, ensure_ascii=False, indent=2), encoding='utf-8')
    print(msg)


if __name__ == '__main__':
    main()