From 5a986962559352fd58e6f2aff6ed4f9416768e43 Mon Sep 17 00:00:00 2001 From: Mimikko-zeus Date: Thu, 4 Jun 2026 15:21:56 +0800 Subject: [PATCH] Refactor AI daily report pipeline --- .gitignore | 9 + ai_daily_report/__init__.py | 2 + ai_daily_report/assemble.py | 77 ++ ai_daily_report/classify.py | 109 ++ ai_daily_report/cli.py | 40 + ai_daily_report/clients.py | 64 + ai_daily_report/collect.py | 95 ++ ai_daily_report/config.py | 19 + ai_daily_report/dedupe.py | 100 ++ ai_daily_report/env.py | 143 +++ ai_daily_report/guide.py | 113 ++ ai_daily_report/llm.py | 18 + ai_daily_report/models.py | 53 + ai_daily_report/normalize.py | 132 ++ ai_daily_report/pipeline.py | 219 ++++ ai_daily_report/publish.py | 90 ++ ai_daily_report/rewrite.py | 103 ++ ai_daily_report/runner.py | 156 +++ ai_daily_report/semantic_dedupe.py | 167 +++ ai_daily_report/sources/__init__.py | 2 + ai_daily_report/sources/aihot.py | 32 + ai_daily_report/sources/juya.py | 58 + ai_daily_report/sources/labels.py | 78 ++ ai_daily_report/sources/registry.py | 24 + ai_daily_report/sources/rss.py | 51 + ai_daily_report/validate.py | 46 + config/pipeline.json | 16 + config/sources.json | 58 + docs/pipeline-optimization-plan.md | 786 ++++++++++++ .../2026-06-04-local-dry-run-foundation.md | 159 +++ script/ai_daily_blog_pipeline.py | 1115 +---------------- script/blog_markdown.md | 198 --- script/run_meta.json | 35 - skill/scripts/.gitkeep | 1 + skill/scripts/run_daily_report.py | 7 + tests/fixtures/.gitkeep | 1 + tests/test_cli.py | 47 + tests/test_clients.py | 47 + tests/test_config_loading.py | 27 + tests/test_dry_run_config.py | 33 + tests/test_env_config.py | 87 ++ tests/test_env_loading.py | 39 + tests/test_legacy_script_delegation.py | 57 + tests/test_llm_utils.py | 17 + tests/test_markdown_rendering.py | 39 + tests/test_project_structure.py | 33 + tests/test_runner.py | 132 ++ tests/test_source_labels.py | 55 + tests/test_stage0_collect.py | 49 + tests/test_stage0_to_2_pipeline.py | 32 + 
tests/test_stage0_to_4_pipeline.py | 66 + tests/test_stage0_to_5_pipeline.py | 62 + tests/test_stage0_to_6_pipeline.py | 75 ++ tests/test_stage0_to_7_pipeline.py | 76 ++ tests/test_stage0_to_8_pipeline.py | 79 ++ tests/test_stage1_normalize.py | 85 ++ tests/test_stage2_dedupe.py | 63 + tests/test_stage3_semantic_dedupe.py | 129 ++ tests/test_stage4_rewrite.py | 96 ++ tests/test_stage5_classify.py | 61 + tests/test_stage6_guide.py | 77 ++ tests/test_stage7_assemble.py | 65 + tests/test_stage8_publish.py | 76 ++ tests/test_validate.py | 14 + 64 files changed, 4778 insertions(+), 1316 deletions(-) create mode 100644 .gitignore create mode 100644 ai_daily_report/__init__.py create mode 100644 ai_daily_report/assemble.py create mode 100644 ai_daily_report/classify.py create mode 100644 ai_daily_report/cli.py create mode 100644 ai_daily_report/clients.py create mode 100644 ai_daily_report/collect.py create mode 100644 ai_daily_report/config.py create mode 100644 ai_daily_report/dedupe.py create mode 100644 ai_daily_report/env.py create mode 100644 ai_daily_report/guide.py create mode 100644 ai_daily_report/llm.py create mode 100644 ai_daily_report/models.py create mode 100644 ai_daily_report/normalize.py create mode 100644 ai_daily_report/pipeline.py create mode 100644 ai_daily_report/publish.py create mode 100644 ai_daily_report/rewrite.py create mode 100644 ai_daily_report/runner.py create mode 100644 ai_daily_report/semantic_dedupe.py create mode 100644 ai_daily_report/sources/__init__.py create mode 100644 ai_daily_report/sources/aihot.py create mode 100644 ai_daily_report/sources/juya.py create mode 100644 ai_daily_report/sources/labels.py create mode 100644 ai_daily_report/sources/registry.py create mode 100644 ai_daily_report/sources/rss.py create mode 100644 ai_daily_report/validate.py create mode 100644 config/pipeline.json create mode 100644 config/sources.json create mode 100644 docs/pipeline-optimization-plan.md create mode 100644 
docs/plans/2026-06-04-local-dry-run-foundation.md delete mode 100644 script/blog_markdown.md delete mode 100644 script/run_meta.json create mode 100644 skill/scripts/.gitkeep create mode 100644 skill/scripts/run_daily_report.py create mode 100644 tests/fixtures/.gitkeep create mode 100644 tests/test_cli.py create mode 100644 tests/test_clients.py create mode 100644 tests/test_config_loading.py create mode 100644 tests/test_dry_run_config.py create mode 100644 tests/test_env_config.py create mode 100644 tests/test_env_loading.py create mode 100644 tests/test_legacy_script_delegation.py create mode 100644 tests/test_llm_utils.py create mode 100644 tests/test_markdown_rendering.py create mode 100644 tests/test_project_structure.py create mode 100644 tests/test_runner.py create mode 100644 tests/test_source_labels.py create mode 100644 tests/test_stage0_collect.py create mode 100644 tests/test_stage0_to_2_pipeline.py create mode 100644 tests/test_stage0_to_4_pipeline.py create mode 100644 tests/test_stage0_to_5_pipeline.py create mode 100644 tests/test_stage0_to_6_pipeline.py create mode 100644 tests/test_stage0_to_7_pipeline.py create mode 100644 tests/test_stage0_to_8_pipeline.py create mode 100644 tests/test_stage1_normalize.py create mode 100644 tests/test_stage2_dedupe.py create mode 100644 tests/test_stage3_semantic_dedupe.py create mode 100644 tests/test_stage4_rewrite.py create mode 100644 tests/test_stage5_classify.py create mode 100644 tests/test_stage6_guide.py create mode 100644 tests/test_stage7_assemble.py create mode 100644 tests/test_stage8_publish.py create mode 100644 tests/test_validate.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1283968 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +.env +.env.* +!.env.example +__pycache__/ +*.py[cod] +.pytest_cache/ +runs/ +runs-*/ +.idea/ diff --git a/ai_daily_report/__init__.py b/ai_daily_report/__init__.py new file mode 100644 index 0000000..5f84311 --- /dev/null +++ 
b/ai_daily_report/__init__.py @@ -0,0 +1,2 @@ +"""Core package for the AI daily report pipeline.""" + diff --git a/ai_daily_report/assemble.py b/ai_daily_report/assemble.py new file mode 100644 index 0000000..b66e6ea --- /dev/null +++ b/ai_daily_report/assemble.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import re +from typing import Any + +from .classify import SECTION_ORDER +from .models import NewsItem +from .validate import validate_markdown + + +END_PUNCTUATION = "。!?;.!?;" + + +def _clean_text(text: str) -> str: + value = re.sub(r"^```(?:\w+)?\s*\n?", "", (text or "").strip()) + value = re.sub(r"\n?```\s*$", "", value) + value = re.sub(r"^\s*>\s*", "", value) + value = re.sub(r"\[\d+\]|\[N\]", "", value) + value = re.sub(r"主线判断[::]\s*", "", value) + value = re.sub(r"\s+", " ", value).strip() + return value + + +def _ensure_sentence(text: str) -> str: + value = _clean_text(text) + if value and value[-1] not in END_PUNCTUATION: + value += "。" + return value + + +def _source_link(item: NewsItem) -> str: + source = item.source_label or item.source_group or "来源" + if item.url: + return f"[{source} ↗]({item.url})" + return source + + +def assemble_markdown(items: list[NewsItem], guide: dict[str, Any] | None = None) -> tuple[str, dict[str, Any]]: + guide = guide or {"theme": "", "threads": []} + lines: list[str] = [] + + theme = _clean_text(str(guide.get("theme") or "")) + if theme: + lines.extend(["## 导览", "", f"> {theme}", ""]) + + item_number = 1 + for section in SECTION_ORDER: + section_items = [item for item in items if item.section == section] + if not section_items: + continue + lines.extend([f"## {section}", ""]) + for item in section_items: + title = _clean_text(item.title or item.title_raw) + summary = _ensure_sentence(item.summary or item.summary_raw or "该条目暂无摘要。") + lines.extend( + [ + f"**{item_number}. 
{title}**", + "", + f"> {summary}{_source_link(item)}", + "", + ] + ) + item_number += 1 + + threads = guide.get("threads", []) or [] + if threads: + lines.extend(["## 今日脉络", ""]) + for thread in threads: + title = _clean_text(str(thread.get("title") or "")) + text = _ensure_sentence(str(thread.get("text") or "")) + if not title or not text: + continue + lines.extend([f"- **{title}**", f" {text}", ""]) + + markdown = "\n".join(lines).strip() + report = validate_markdown(markdown, items) + return markdown, report diff --git a/ai_daily_report/classify.py b/ai_daily_report/classify.py new file mode 100644 index 0000000..4beca1f --- /dev/null +++ b/ai_daily_report/classify.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +from collections import Counter +from typing import Any + +from .models import NewsItem + + +SECTION_ORDER = [ + "模型与能力", + "产品与应用", + "开发与基础设施", + "公司与资本", + "政策与安全", + "论文与研究", + "观点与教程", + "人物与动态", +] + +SECTION_ALIASES = { + "模型发布/更新": "模型与能力", + "产品发布/更新": "产品与应用", + "产品与工具": "产品与应用", + "开发与工程": "开发与基础设施", + "行业动态": "公司与资本", + "行业与公司": "公司与资本", + "论文研究": "论文与研究", + "论文与研究": "论文与研究", + "技巧与观点": "观点与教程", + "观点与教程": "观点与教程", + "人物与花絮": "人物与动态", +} + + +RULES = [ + ("政策与安全", ("监管", "政策", "安全", "风险", "滥用", "攻击", "合规", "版权")), + ("论文与研究", ("论文", "研究", "arxiv", "cvpr", "benchmark", "评测", "实验")), + ("开发与基础设施", ("sdk", "api", "mcp", "kubernetes", "框架", "开源", "github", "部署", "基础设施")), + ("公司与资本", ("融资", "ipo", "上市", "招股书", "合作", "估值", "收购", "资本")), + ("模型与能力", ("模型", "gpt", "claude", "gemini", "grok", "token", "参数", "多模态", "语音", "推理")), + ("产品与应用", ("agent", "应用", "产品", "平台", "上线", "工具", "智能体")), + ("观点与教程", ("教程", "观点", "方法论", "guide", "实践", "技巧")), + ("人物与动态", ("黄仁勋", "纳德拉", "访谈", "演讲", "人物")), +] + + +def normalize_section_hint(section_hint: str) -> str: + hint = (section_hint or "").strip() + if hint in SECTION_ORDER: + return hint + return SECTION_ALIASES.get(hint, "") + + +def rule_classify(item: NewsItem) -> str: + text = f"{item.title or 
item.title_raw} {item.summary or item.summary_raw}".lower() + for section, keywords in RULES: + if any(keyword.lower() in text for keyword in keywords): + return section + return "公司与资本" + + +def rank_score(item: NewsItem) -> int: + text = f"{item.title or item.title_raw} {item.summary or item.summary_raw}" + score = max(0, 200 - item.source_priority) + if item.source_role == "primary": + score += 10 + if item.canonical_url: + score += 10 + if any(ch.isdigit() for ch in text): + score += 10 + if item.duplicate_sources: + score += min(20, len(item.duplicate_sources) * 5) + score -= len(item.quality_flags) * 10 + return score + + +def classify_and_order_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, Any]]: + hint_classified = 0 + rule_classified = 0 + + for item in items: + mapped = normalize_section_hint(item.section_hint) + if mapped: + item.section = mapped + hint_classified += 1 + else: + item.section = rule_classify(item) + rule_classified += 1 + + section_index = {section: index for index, section in enumerate(SECTION_ORDER)} + ordered = sorted( + items, + key=lambda item: ( + section_index.get(item.section or "", len(SECTION_ORDER)), + -rank_score(item), + item.title or item.title_raw, + ), + ) + section_counts = Counter(item.section for item in ordered if item.section) + report = { + "input_count": len(items), + "section_counts": dict(section_counts), + "hint_classified": hint_classified, + "rule_classified": rule_classified, + "llm_classified": 0, + "fallback_classified": 0, + "invalid_section_count": sum(1 for item in ordered if item.section not in SECTION_ORDER), + } + return ordered, report diff --git a/ai_daily_report/cli.py b/ai_daily_report/cli.py new file mode 100644 index 0000000..539cbce --- /dev/null +++ b/ai_daily_report/cli.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import argparse +from pathlib import Path + +from .runner import run_daily_report + + +def build_parser() -> argparse.ArgumentParser: + parser = 
argparse.ArgumentParser(prog="ai-daily-report")
+    subcommands = parser.add_subparsers(dest="command")
+    run = subcommands.add_parser("run")
+    run.add_argument("--date", default="today")
+    run.add_argument("--mode", choices=["dry-run", "draft", "publish"], default="dry-run")
+    run.add_argument("--source-mode", choices=["mock", "live"], default="mock")
+    run.add_argument("--llm-mode", choices=["mock", "live"], default="mock")
+    run.add_argument("--out-dir", default="runs")
+    run.add_argument("--base-url", default="https://blog.ephron.ren")
+    run.add_argument("--sources-path", default=None)
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    if args.command == "run":
+        run_daily_report(
+            run_date=args.date,
+            mode=args.mode,
+            source_mode=args.source_mode,
+            llm_mode=args.llm_mode,
+            out_dir=Path(args.out_dir),
+            base_url=args.base_url,
+            sources_path=Path(args.sources_path) if args.sources_path else None,
+        )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/ai_daily_report/clients.py b/ai_daily_report/clients.py
new file mode 100644
index 0000000..2fd3359
--- /dev/null
+++ b/ai_daily_report/clients.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+import json
+import urllib.request
+from typing import Any
+from urllib.parse import quote
+
+
+UA = "Mozilla/5.0 (compatible; ai-daily-report/1.0)"
+
+
+def fetch_text(url: str, timeout_seconds: int) -> str:
+    """Fetch ``url`` and return its body decoded as UTF-8, dropping undecodable bytes."""
+    req = urllib.request.Request(url, headers={"User-Agent": UA})
+    with urllib.request.urlopen(req, timeout=timeout_seconds) as response:
+        return response.read().decode("utf-8", "ignore")
+
+
+class OpenAICompatibleClient:
+    """Minimal client for an OpenAI-compatible ``/chat/completions`` endpoint."""
+
+    def __init__(self, *, api_key: str, base_url: str, model: str, timeout_seconds: int = 600):
+        self.api_key = api_key
+        self.base_url = base_url.rstrip("/")
+        self.model = model
+        self.timeout_seconds = timeout_seconds
+
+    def chat(self, prompt: str) -> str:
+        """Send ``prompt`` as a single user message and return the stripped reply text.
+
+        Returns an empty string when the response's ``content`` is null.
+        """
+        payload = json.dumps(
+            {
+                "model": self.model,
+                "messages": [{"role": "user", "content": prompt}],
+                "temperature": 0.2,
+                "max_tokens": 8000,
+            },
+            ensure_ascii=False,
+        ).encode("utf-8")
+        req = urllib.request.Request(
+            f"{self.base_url}/chat/completions",
+            data=payload,
+            headers={
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+                # Send the same User-Agent as fetch_text and BlogApiClient instead of urllib's default.
+                "User-Agent": UA,
+            },
+        )
+        with urllib.request.urlopen(req, timeout=self.timeout_seconds) as response:
+            data = json.loads(response.read().decode("utf-8"))
+        content = data["choices"][0]["message"]["content"]
+        # ``content`` can be JSON null; calling ``.strip()`` on None would raise AttributeError.
+        return (content or "").strip()
+
+
+class BlogApiClient:
+    """Client for the blog service API, authenticated with a bearer token."""
+
+    def __init__(self, *, base_url: str, token: str, timeout_seconds: int = 25):
+        self.base_url = base_url.rstrip("/")
+        self.token = token
+        self.timeout_seconds = timeout_seconds
+
+    def _request(self, method: str, path: str, payload: dict[str, Any] | None = None) -> dict[str, Any]:
+        """Send one request to ``base_url + path`` and return the decoded JSON body.
+
+        An empty response body yields ``{}`` instead of a JSON decode error.
+        """
+        data = None
+        headers = {"Authorization": f"Bearer {self.token}", "User-Agent": UA}
+        if payload is not None:
+            data = json.dumps(payload, ensure_ascii=False).encode("utf-8")
+            headers["Content-Type"] = "application/json"
+        req = urllib.request.Request(f"{self.base_url}{path}", data=data, headers=headers, method=method)
+        with urllib.request.urlopen(req, timeout=self.timeout_seconds) as response:
+            body = response.read()
+        if not body.strip():
+            return {}
+        return json.loads(body.decode("utf-8"))
+
+    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
+        """POST ``payload`` to the posts endpoint and return the service's JSON response."""
+        return self._request("POST", "/api/service/posts", payload)
+
+    def publish_post(self, slug: str) -> None:
+        """Publish the post identified by ``slug``."""
+        # Percent-encode the slug so non-ASCII text, "/", "?" or "#" cannot break or alter the path.
+        self._request("POST", f"/api/service/posts/{quote(slug, safe='')}/publish")
diff --git a/ai_daily_report/collect.py b/ai_daily_report/collect.py
new file mode 100644
index 0000000..b1c947e
--- /dev/null
+++ b/ai_daily_report/collect.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timezone
+from time import perf_counter
+from typing import Callable, Iterable, Any
+
+from .models import SourceConfig, SourceResult
+
+
+Fetcher = Callable[[SourceConfig, str], list[dict[str, Any]]]
+
+
+def _status_from_exception(exc: Exception) -> str:
+    """Map a fetch exception to a ``SourceResult`` status string.
+
+    urllib wraps timeouts raised while connecting in ``URLError`` and keeps the
+    original exception in ``reason``, so that attribute is checked as well.
+    """
+    reason = getattr(exc, "reason", None)
+    if isinstance(exc, TimeoutError) or isinstance(reason, TimeoutError):
+        return "timeout"
+    return "error"
+
+
+def _collect_one(config: SourceConfig, run_date: str, fetcher: Fetcher) -> SourceResult:
+    """Fetch one source; exceptions raised by ``fetcher`` are captured in the returned result."""
+    fetched_at = datetime.now(timezone.utc).isoformat()
+    if not config.enabled:
+        return SourceResult(
+            source=config.name,
+            role=config.role,
+            ok=False,
+            status="disabled",
+            fetched_at=fetched_at,
+        )
+
+    started = perf_counter()
+    try:
+        # A fetcher returning None is treated as empty so len(result.items) keeps working.
+        items = fetcher(config, run_date) or []
+        elapsed_ms = int((perf_counter() - started) * 1000)
+        status = "ok" if items else "empty"
+        return SourceResult(
+            source=config.name,
+            role=config.role,
+            ok=status == "ok",
+            status=status,
+            items=items,
+            elapsed_ms=elapsed_ms,
+            fetched_at=fetched_at,
+        )
+    except Exception as exc:
+        elapsed_ms = int((perf_counter() - started) * 1000)
+        return SourceResult(
+            source=config.name,
+            role=config.role,
+            ok=False,
+            status=_status_from_exception(exc),
+            error=f"{type(exc).__name__}: {exc}",
+            elapsed_ms=elapsed_ms,
+            fetched_at=fetched_at,
+        )
+
+
+def collect_sources(
+    configs: Iterable[SourceConfig],
+    run_date: str,
+    *,
+    fetcher: Fetcher,
+    max_workers: int | None = None,
+) -> tuple[list[SourceResult], dict[str, Any]]:
+    """Collect every source concurrently.
+
+    Returns the per-source results in the same order as ``configs`` together
+    with a summary report dict.
+    """
+    ordered_configs = list(configs)
+    if not ordered_configs:
+        return [], {
+            "input_source_count": 0,
+            "ok_source_count": 0,
+            "failed_source_count": 0,
+            "raw_item_count": 0,
+        }
+
+    workers = max_workers or min(8, len(ordered_configs))
+    # Keyed by position rather than by name, so two configs that share a name
+    # each keep their own result instead of overwriting one another.
+    result_by_index: dict[int, SourceResult] = {}
+
+    with ThreadPoolExecutor(max_workers=workers) as executor:
+        futures = {
+            executor.submit(_collect_one, config, run_date, fetcher): index
+            for index, config in enumerate(ordered_configs)
+        }
+        for future in as_completed(futures):
+            result_by_index[futures[future]] = future.result()
+
+    results = [result_by_index[index] for index in range(len(ordered_configs))]
+    report = {
+        "input_source_count": len(results),
+
"ok_source_count": sum(1 for result in results if result.ok), + "failed_source_count": sum(1 for result in results if not result.ok), + "raw_item_count": sum(len(result.items) for result in results), + "source_counts": {result.source: len(result.items) for result in results}, + "statuses": {result.source: result.status for result in results}, + } + return results, report diff --git a/ai_daily_report/config.py b/ai_daily_report/config.py new file mode 100644 index 0000000..03b426d --- /dev/null +++ b/ai_daily_report/config.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +from .models import SourceConfig +from .pipeline import _source_config_from_dict + + +def load_json(path: Path) -> Any: + return json.loads(path.read_text(encoding="utf-8")) + + +def load_source_configs(path: Path) -> list[SourceConfig]: + raw = load_json(path) + if not isinstance(raw, list): + raise ValueError("sources config must be a list") + return [_source_config_from_dict(item) for item in raw] diff --git a/ai_daily_report/dedupe.py b/ai_daily_report/dedupe.py new file mode 100644 index 0000000..6a9e426 --- /dev/null +++ b/ai_daily_report/dedupe.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +import difflib +from typing import Any + +from .models import NewsItem + + +def _item_score(item: NewsItem) -> int: + score = 0 + score += max(0, 200 - item.source_priority) + if item.canonical_url: + score += 20 + if item.summary_raw: + score += min(40, len(item.summary_raw)) + if item.section_hint: + score += 10 + if item.source_role == "primary": + score += 10 + score -= len(item.quality_flags) * 10 + return score + + +def _merge_group(group: list[NewsItem], reason: str) -> tuple[NewsItem, list[NewsItem], dict[str, Any]]: + keep = max(group, key=_item_score) + removed = [item for item in group if item is not keep] + for removed_item in removed: + keep.duplicate_sources.append( + { + "id": removed_item.id, + 
"source_group": removed_item.source_group, + "source_label": removed_item.source_label, + "url": removed_item.url, + "reason": reason, + } + ) + report_group = { + "reason": reason, + "keep_id": keep.id, + "removed_ids": [item.id for item in removed], + "confidence": "high", + } + return keep, removed, report_group + + +def _group_by_key(items: list[NewsItem], key_name: str) -> dict[str, list[NewsItem]]: + groups: dict[str, list[NewsItem]] = {} + for item in items: + key = getattr(item, key_name) + if key: + groups.setdefault(key, []).append(item) + return {key: group for key, group in groups.items() if len(group) > 1} + + +def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]: + possible: list[dict[str, Any]] = [] + for index, left in enumerate(items): + for right in items[index + 1 :]: + if not left.title_norm or not right.title_norm: + continue + ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio() + if ratio >= 0.65: + possible.append( + { + "item_ids": [left.id, right.id], + "reason": "title_similarity", + "similarity": round(ratio, 3), + "confidence": "medium", + } + ) + return possible + + +def hard_dedup_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, Any]]: + remaining = list(items) + removed_object_ids: set[int] = set() + groups_report: list[dict[str, Any]] = [] + + for key_name, reason in ( + ("canonical_url", "same_canonical_url"), + ("title_norm", "same_title_norm"), + ): + grouped = _group_by_key([item for item in remaining if id(item) not in removed_object_ids], key_name) + for group in grouped.values(): + active_group = [item for item in group if id(item) not in removed_object_ids] + if len(active_group) < 2: + continue + keep, removed, report_group = _merge_group(active_group, reason) + removed_object_ids.update(id(item) for item in removed) + groups_report.append(report_group) + + deduped = [item for item in remaining if id(item) not in removed_object_ids] + report = { + "input_count": 
len(items), + "output_count": len(deduped), + "removed_count": len(removed_object_ids), + "groups": groups_report, + "possible_duplicates": _possible_duplicates(deduped), + } + return deduped, report diff --git a/ai_daily_report/env.py b/ai_daily_report/env.py new file mode 100644 index 0000000..a5697f0 --- /dev/null +++ b/ai_daily_report/env.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +import os +import json +from pathlib import Path + + +PROJECT_ROOT = Path(__file__).resolve().parents[1] + + +def read_env_file(env_path: Path) -> dict[str, str]: + env: dict[str, str] = {} + if not env_path.exists(): + return env + text = env_path.read_text(encoding="utf-8", errors="ignore") + for line in text.splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, value = line.split("=", 1) + env[key.strip()] = value.strip().strip('"').strip("'") + return env + + +def load_env() -> dict[str, str]: + env: dict[str, str] = {} + env.update(read_env_file(PROJECT_ROOT / ".env")) + env.update(read_env_file(Path.home() / ".hermes" / ".env")) + env.update({key: value for key, value in os.environ.items() if value}) + return env + + +def first_env(env: dict[str, str], *names: str) -> str: + for name in names: + value = (env.get(name) or "").strip() + if value: + return value + return "" + + +def _load_simple_yaml(path: Path) -> dict[str, object]: + if not path.exists(): + return {} + root: dict[str, object] = {} + stack: list[tuple[int, dict[str, object]]] = [(-1, root)] + for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines(): + if not raw_line.strip() or raw_line.lstrip().startswith("#") or ":" not in raw_line: + continue + indent = len(raw_line) - len(raw_line.lstrip(" ")) + key, value = raw_line.strip().split(":", 1) + key = key.strip() + value = value.strip().strip('"').strip("'") + while stack and indent <= stack[-1][0]: + stack.pop() + current = stack[-1][1] + if value: + current[key] = 
value + else: + child: dict[str, object] = {} + current[key] = child + stack.append((indent, child)) + return root + + +def _env_with_hermes(env: dict[str, str], hermes_dir: Path) -> dict[str, str]: + merged = dict(read_env_file(hermes_dir / ".env")) + merged.update(env) + return merged + + +def _provider_env_names(provider: str) -> tuple[str, str, str]: + prefix = provider.upper().replace("-", "_") + return f"{prefix}_API_KEY", f"{prefix}_BASE_URL", f"{prefix}_MODEL" + + +def _auth_json_key(env: dict[str, str], hermes_dir: Path, provider: str) -> str: + auth_path = hermes_dir / "auth.json" + if not auth_path.exists() or not provider: + return "" + try: + auth = json.loads(auth_path.read_text(encoding="utf-8")) + except Exception: + return "" + pool = auth.get("credential_pool", {}) or {} + provider_keys = [provider, provider.replace("-", "_")] + for key in provider_keys: + creds = pool.get(key, []) or [] + if not creds: + continue + cred = creds[0] + source = str(cred.get("source") or "") + if source.startswith("env:"): + resolved = first_env(env, source[4:]) + if resolved: + return resolved + token = str(cred.get("access_token") or "").strip() + if token: + return token + return "" + + +def resolve_llm_config(env: dict[str, str], *, hermes_dir: Path | None = None) -> dict[str, str]: + hermes_dir = hermes_dir or Path.home() / ".hermes" + env = _env_with_hermes(env, hermes_dir) + hermes_config = _load_simple_yaml(hermes_dir / "config.yaml") + model_config = hermes_config.get("model", {}) if isinstance(hermes_config.get("model"), dict) else {} + provider = str(model_config.get("provider") or "").strip() + provider_key, provider_base_url, provider_model = _provider_env_names(provider) if provider else ("", "", "") + + api_key = first_env(env, "LLM_API_KEY") + base_url = first_env(env, "LLM_BASE_URL") + model = first_env(env, "LLM_MODEL") + + if not api_key and provider: + api_key = first_env(env, provider_key) or _auth_json_key(env, hermes_dir, provider) + if not 
base_url and provider:
+        base_url = first_env(env, provider_base_url) or str(model_config.get("base_url") or "").strip()
+    if not model and provider:
+        model = first_env(env, provider_model) or str(model_config.get("default") or "").strip()
+
+    if not api_key:
+        api_key = first_env(env, "SUB2API_API_KEY", "XIAOMI_API_KEY", "OPENROUTER_API_KEY")
+    if not base_url:
+        base_url = first_env(env, "SUB2API_BASE_URL", "XIAOMI_BASE_URL", "OPENROUTER_BASE_URL")
+    if not model:
+        model = first_env(env, "SUB2API_MODEL", "XIAOMI_MODEL")
+
+    missing = [
+        name
+        for name, value in (
+            ("LLM_API_KEY", api_key),
+            ("LLM_BASE_URL", base_url),
+            ("LLM_MODEL", model),
+        )
+        if not value
+    ]
+    if missing:
+        raise ValueError("missing_llm_config: " + ",".join(missing))
+    return {"api_key": api_key, "base_url": base_url, "model": model}
+
+
+def resolve_blog_token(env: dict[str, str]) -> str:
+    return first_env(env, "BLOG_SERVICE_TOKEN", "EPHRON_SERVICE_TOKEN")
diff --git a/ai_daily_report/guide.py b/ai_daily_report/guide.py
new file mode 100644
index 0000000..63d8b89
--- /dev/null
+++ b/ai_daily_report/guide.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Callable
+
+from .llm import parse_json_object
+from .models import NewsItem
+
+
+GuideLlmCall = Callable[[str], str]
+
+
+def _clean_text(text: str, limit: int | None = None) -> str:
+    """Drop a leading ``>`` and ``[1]``/``[N]`` tags, collapse whitespace, truncate to ``limit``."""
+    value = re.sub(r"^\s*>\s*", "", text or "").strip()
+    value = re.sub(r"\[\d+\]|\[N\]", "", value)
+    value = re.sub(r"\s+", " ", value).strip()
+    if limit and len(value) > limit:
+        value = value[:limit].rstrip()
+    return value
+
+
+def _build_prompt(items: list[NewsItem]) -> str:
+    """Serialize the task text, the items and the expected output schema as a JSON prompt."""
+    payload = {
+        "task": (
+            "Generate a concise AI daily report guide. Return JSON only. Do not use 强信号/中信号/待验证. "
+            "Use a short theme and 2-4 daily threads. Every thread must reference existing item_ids."
+        ),
+        "items": [
+            {
+                "id": item.id,
+                "title": item.title or item.title_raw,
+                "summary": item.summary or item.summary_raw,
+                "section": item.section,
+                "source": item.source_label,
+            }
+            for item in items
+        ],
+        "output_schema": {
+            "theme": "one sentence under 120 Chinese characters",
+            "threads": [
+                {
+                    "title": "thread title",
+                    "text": "one or two sentences",
+                    "item_ids": ["existing item id"],
+                    "kind": "thread|uncertain",
+                }
+            ],
+        },
+    }
+    return json.dumps(payload, ensure_ascii=False)
+
+
+def generate_guide(
+    items: list[NewsItem],
+    *,
+    llm_call: GuideLlmCall,
+) -> tuple[dict[str, Any], dict[str, Any]]:
+    """Ask the LLM for a daily guide and validate its answer.
+
+    Returns ``(guide, report)``. Threads that are malformed or reference no
+    known item id are dropped and counted in ``dropped_thread_count``. If the
+    LLM call or JSON parsing raises, an empty guide is returned and the report
+    has ``fallback_used`` set with the error recorded in ``errors``.
+    """
+    if not items:
+        return {
+            "theme": "",
+            "threads": [],
+        }, {
+            "input_count": 0,
+            "theme_present": False,
+            "thread_count": 0,
+            "dropped_thread_count": 0,
+            "fallback_used": False,
+            "errors": [],
+        }
+
+    try:
+        obj = parse_json_object(llm_call(_build_prompt(items)))
+    except Exception as exc:
+        return {
+            "theme": "",
+            "threads": [],
+        }, {
+            "input_count": len(items),
+            "theme_present": False,
+            "thread_count": 0,
+            "dropped_thread_count": 0,
+            "fallback_used": True,
+            "errors": [f"{type(exc).__name__}: {exc}"],
+        }
+
+    valid_ids = {item.id for item in items}
+    threads: list[dict[str, Any]] = []
+    dropped = 0
+    # The payload comes from an LLM, so check its shape rather than trust it.
+    raw_threads = obj.get("threads")
+    if not isinstance(raw_threads, list):
+        raw_threads = []
+    for thread in raw_threads:
+        if not isinstance(thread, dict):
+            dropped += 1
+            continue
+        raw_ids = thread.get("item_ids")
+        if not isinstance(raw_ids, list):
+            raw_ids = []
+        # Test the type first: an unhashable id (e.g. a nested list) would make
+        # the set membership check raise TypeError.
+        item_ids = [item_id for item_id in raw_ids if isinstance(item_id, str) and item_id in valid_ids]
+        if not item_ids:
+            dropped += 1
+            continue
+        title = _clean_text(str(thread.get("title") or ""), limit=80)
+        text = _clean_text(str(thread.get("text") or ""), limit=220)
+        if not title or not text:
+            dropped += 1
+            continue
+        kind = thread.get("kind") if thread.get("kind") in ("thread", "uncertain") else "thread"
+        threads.append({"title": title, "text": text, "item_ids": item_ids, "kind": kind})
+
+    theme = _clean_text(str(obj.get("theme") or ""), limit=120)
+    guide = {"theme": theme, "threads": threads}
+    report = {
+        "input_count": len(items),
+        "theme_present": bool(theme),
+        "thread_count": len(threads),
+        "dropped_thread_count": dropped,
+        "fallback_used": False,
+        "errors": [],
+    }
+    return guide, report
diff --git a/ai_daily_report/llm.py b/ai_daily_report/llm.py
new file mode 100644
index 0000000..33c8769
--- /dev/null
+++ b/ai_daily_report/llm.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Callable
+
+
+LlmCall = Callable[[str], str]
+
+
+def parse_json_object(text: str) -> dict[str, Any]:
+    """Extract the JSON object from an LLM reply.
+
+    Text before the first ``{`` (such as a code fence) and anything after the object are ignored.
+
+    Raises:
+        ValueError: if the reply contains no ``{`` or the object is not valid
+            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
+    """
+    text = re.sub(r"^```(?:json)?\s*\n?", "", (text or "").strip())
+    text = re.sub(r"\n?```\s*$", "", text)
+    start = text.find("{")
+    if start == -1:
+        raise ValueError("LLM output does not contain a JSON object")
+    # raw_decode stops at the end of the first complete value, so commentary
+    # that follows the object no longer makes the whole reply unparseable.
+    obj, _ = json.JSONDecoder().raw_decode(text, start)
+    return obj
+
diff --git a/ai_daily_report/models.py b/ai_daily_report/models.py
new file mode 100644
index 0000000..756b629
--- /dev/null
+++ b/ai_daily_report/models.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass(frozen=True)
+class SourceConfig:
+    """Immutable configuration for one news source."""
+    name: str
+    type: str
+    role: str = "supplement"
+    priority: int = 100
+    required: bool = False
+    enabled: bool = True
+    timeout_seconds: int = 25
+    retries: int = 0
+    min_items: int = 0
+    url: str = ""
+
+
+@dataclass
+class SourceResult:
+    """Outcome of collecting one source: status, raw items and timing."""
+    source: str
+    role: str
+    ok: bool
+    status: str
+    items: list[dict[str, Any]] = field(default_factory=list)
+    error: str | None = None
+    elapsed_ms: int = 0
+    retry_count: int = 0
+    fetched_at: str = ""
+
+
+@dataclass
+class NewsItem:
+    """One normalized news entry passed between pipeline stages."""
+    id: str
+    source_group: str
+    source_label: str
+    source_role: str
+    source_priority: int
+    title_raw: str
+    title_norm: str
+    summary_raw: str
+    url: str
+    canonical_url: str
+    published_at: str | None = None
+    collected_at: str = ""
+    origin_type: str = ""
+    section_hint: str = ""
+    language_hint: str = ""
+    title: str | None = None
+    summary: str | None = None
+    section: str | None = None
+    quality_flags: list[str] = field(default_factory=list)
+    duplicate_sources: list[dict[str,
Any]] = field(default_factory=list) diff --git a/ai_daily_report/normalize.py b/ai_daily_report/normalize.py new file mode 100644 index 0000000..dda9dd5 --- /dev/null +++ b/ai_daily_report/normalize.py @@ -0,0 +1,132 @@ +from __future__ import annotations + +import hashlib +import html +import re +import unicodedata +from collections import Counter +from datetime import datetime, timezone +from typing import Any +from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse + +from .models import NewsItem, SourceResult + + +TRACKING_QUERY_PREFIXES = ("utm_",) +TRACKING_QUERY_KEYS = {"fbclid", "gclid", "spm", "from", "ref"} + + +def clean_text(value: str) -> str: + text = html.unescape(value or "") + text = re.sub(r"<[^>]+>", " ", text) + text = re.sub(r"\s+", " ", text).strip() + return text + + +def canonicalize_url(url: str) -> str: + if not url: + return "" + parsed = urlparse(url.strip()) + scheme = (parsed.scheme or "https").lower() + host = (parsed.netloc or "").lower() + if host.startswith("www."): + host = host[4:] + if host == "twitter.com": + host = "x.com" + + query = [] + for key, value in parse_qsl(parsed.query, keep_blank_values=True): + key_lower = key.lower() + if key_lower in TRACKING_QUERY_KEYS: + continue + if any(key_lower.startswith(prefix) for prefix in TRACKING_QUERY_PREFIXES): + continue + query.append((key, value)) + + path = parsed.path or "" + if len(path) > 1: + path = path.rstrip("/") + + return urlunparse((scheme, host, path, "", urlencode(query), "")) + + +def normalize_title(title: str) -> str: + text = unicodedata.normalize("NFKC", title or "").lower() + text = re.sub(r"[^\w\u4e00-\u9fff]+", "", text) + return text + + +def _item_id(canonical_url: str, source_group: str, title_norm: str, published_at: str | None) -> str: + seed = canonical_url or "|".join([source_group, title_norm, published_at or ""]) + digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()[:16] + return f"item_{digest}" + + +def _quality_flags(title: str, 
summary: str, url: str) -> list[str]: + flags: list[str] = [] + if not url: + flags.append("missing_url") + if not summary: + flags.append("missing_summary") + if len(normalize_title(title)) < 3: + flags.append("short_title") + return flags + + +def normalize_items( + source_results: list[SourceResult], + *, + run_date: str, + source_priorities: dict[str, int] | None = None, +) -> tuple[list[NewsItem], dict[str, Any]]: + source_priorities = source_priorities or {} + collected_at = datetime.now(timezone.utc).isoformat() + items: list[NewsItem] = [] + flag_counts: Counter[str] = Counter() + id_counts: Counter[str] = Counter() + input_count = 0 + + for source_result in source_results: + for raw in source_result.items: + input_count += 1 + title = clean_text(str(raw.get("title_raw") or raw.get("title") or "")) + summary = clean_text(str(raw.get("summary_raw") or raw.get("summary") or "")) + url = str(raw.get("url") or "").strip() + canonical_url = canonicalize_url(url) + title_norm = normalize_title(title) + flags = _quality_flags(title, summary, canonical_url) + flag_counts.update(flags) + source_label = clean_text(str(raw.get("source_label") or source_result.source)) + published_at = raw.get("published_at") + base_id = _item_id(canonical_url, source_result.source, title_norm, published_at) + id_counts[base_id] += 1 + item_id = base_id if id_counts[base_id] == 1 else f"{base_id}_{id_counts[base_id]}" + + items.append( + NewsItem( + id=item_id, + source_group=source_result.source, + source_label=source_label, + source_role=source_result.role, + source_priority=source_priorities.get(source_result.source, 100), + title_raw=title, + title_norm=title_norm, + summary_raw=summary, + url=url, + canonical_url=canonical_url, + published_at=published_at, + collected_at=collected_at, + origin_type=str(raw.get("origin_type") or ""), + section_hint=str(raw.get("section_hint") or ""), + language_hint=str(raw.get("language_hint") or ""), + quality_flags=flags, + ) + ) + + report = { 
+ "run_date": run_date, + "input_count": input_count, + "output_count": len(items), + "quality_flag_counts": dict(flag_counts), + } + return items, report diff --git a/ai_daily_report/pipeline.py b/ai_daily_report/pipeline.py new file mode 100644 index 0000000..e2bc8a9 --- /dev/null +++ b/ai_daily_report/pipeline.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +from typing import Any + +from .assemble import assemble_markdown +from .classify import classify_and_order_items +from .collect import Fetcher, collect_sources +from .dedupe import hard_dedup_items +from .guide import GuideLlmCall, generate_guide +from .models import SourceConfig +from .normalize import normalize_items +from .publish import BlogClient, publish_markdown +from .rewrite import RewriteLlmCall, rewrite_items +from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items + + +def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig: + return SourceConfig( + name=value["name"], + type=value["type"], + role=value.get("role", "supplement"), + priority=int(value.get("priority", 100)), + required=bool(value.get("required", False)), + enabled=bool(value.get("enabled", True)), + timeout_seconds=int(value.get("timeout_seconds", 25)), + retries=int(value.get("retries", 0)), + min_items=int(value.get("min_items", 0)), + url=value.get("url", ""), + ) + + +def run_stage0_to_stage2( + source_configs: list[dict[str, Any] | SourceConfig], + run_date: str, + *, + fetcher: Fetcher, +) -> dict[str, Any]: + configs = [ + config if isinstance(config, SourceConfig) else _source_config_from_dict(config) + for config in source_configs + ] + source_results, stage0_report = collect_sources(configs, run_date, fetcher=fetcher) + source_priorities = {config.name: config.priority for config in configs} + normalized_items, stage1_report = normalize_items( + source_results, + run_date=run_date, + source_priorities=source_priorities, + ) + deduped_items, stage2_report = 
hard_dedup_items(normalized_items) + return { + "source_results": source_results, + "items": deduped_items, + "reports": { + "stage0": stage0_report, + "stage1": stage1_report, + "stage2": stage2_report, + }, + } + + +def run_stage0_to_stage4( + source_configs: list[dict[str, Any] | SourceConfig], + run_date: str, + *, + fetcher: Fetcher, + semantic_llm_call: SemanticLlmCall, + rewrite_llm_call: RewriteLlmCall, +) -> dict[str, Any]: + stage2_result = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher) + items = stage2_result["items"] + candidates = stage2_result["reports"]["stage2"].get("possible_duplicates", []) + semantic_items, stage3_report = semantic_dedup_items( + items, + candidates, + llm_call=semantic_llm_call, + ) + rewritten_items, stage4_report = rewrite_items( + semantic_items, + llm_call=rewrite_llm_call, + ) + reports = dict(stage2_result["reports"]) + reports["stage3"] = stage3_report + reports["stage4"] = stage4_report + return { + "source_results": stage2_result["source_results"], + "items": rewritten_items, + "reports": reports, + } + + +def run_stage0_to_stage5( + source_configs: list[dict[str, Any] | SourceConfig], + run_date: str, + *, + fetcher: Fetcher, + semantic_llm_call: SemanticLlmCall, + rewrite_llm_call: RewriteLlmCall, +) -> dict[str, Any]: + stage4_result = run_stage0_to_stage4( + source_configs, + run_date, + fetcher=fetcher, + semantic_llm_call=semantic_llm_call, + rewrite_llm_call=rewrite_llm_call, + ) + classified_items, stage5_report = classify_and_order_items(stage4_result["items"]) + reports = dict(stage4_result["reports"]) + reports["stage5"] = stage5_report + return { + "source_results": stage4_result["source_results"], + "items": classified_items, + "reports": reports, + } + + +def run_stage0_to_stage6( + source_configs: list[dict[str, Any] | SourceConfig], + run_date: str, + *, + fetcher: Fetcher, + semantic_llm_call: SemanticLlmCall, + rewrite_llm_call: RewriteLlmCall, + guide_llm_call: GuideLlmCall, +) -> 
dict[str, Any]: + stage5_result = run_stage0_to_stage5( + source_configs, + run_date, + fetcher=fetcher, + semantic_llm_call=semantic_llm_call, + rewrite_llm_call=rewrite_llm_call, + ) + guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call) + reports = dict(stage5_result["reports"]) + reports["stage6"] = stage6_report + return { + "source_results": stage5_result["source_results"], + "items": stage5_result["items"], + "guide": guide, + "reports": reports, + } + + +def run_stage0_to_stage7( + source_configs: list[dict[str, Any] | SourceConfig], + run_date: str, + *, + fetcher: Fetcher, + semantic_llm_call: SemanticLlmCall, + rewrite_llm_call: RewriteLlmCall, + guide_llm_call: GuideLlmCall, +) -> dict[str, Any]: + stage6_result = run_stage0_to_stage6( + source_configs, + run_date, + fetcher=fetcher, + semantic_llm_call=semantic_llm_call, + rewrite_llm_call=rewrite_llm_call, + guide_llm_call=guide_llm_call, + ) + markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"]) + reports = dict(stage6_result["reports"]) + reports["stage7"] = stage7_report + return { + "source_results": stage6_result["source_results"], + "items": stage6_result["items"], + "guide": stage6_result["guide"], + "markdown": markdown, + "reports": reports, + } + + +def run_stage0_to_stage8( + source_configs: list[dict[str, Any] | SourceConfig], + run_date: str, + *, + fetcher: Fetcher, + semantic_llm_call: SemanticLlmCall, + rewrite_llm_call: RewriteLlmCall, + guide_llm_call: GuideLlmCall, + mode: str, + base_url: str, + client: BlogClient | None, +) -> dict[str, Any]: + stage7_result = run_stage0_to_stage7( + source_configs, + run_date, + fetcher=fetcher, + semantic_llm_call=semantic_llm_call, + rewrite_llm_call=rewrite_llm_call, + guide_llm_call=guide_llm_call, + ) + slug = f"ai-{run_date}" + publish_result = publish_markdown( + title=f"AI日报 · {run_date}", + markdown=stage7_result["markdown"], + tags=["AI日报", "AI资讯", "人工智能"], + 
@dataclass
class PublishResult:
    """Outcome of the publish stage."""

    # Publish mode that was requested (this module special-cases "dry-run"
    # and "publish"; any other mode only creates the post).
    mode: str
    # "ok", "blocked" (validation errors present) or "failed".
    status: str
    # Slug of the post; the server-assigned slug when the create call returned one.
    slug: str
    # Public URL built as "<base_url>/posts/<slug>".
    blog_url: str
    # True for a dry run and for a successful "publish" run.
    public_ok: bool = False
    # Error description when status is "blocked" or "failed".
    error: str | None = None


class BlogClient(Protocol):
    """Structural interface of the blog API client used by the publish stage."""

    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Create a post from *payload*; the response may carry a "slug" key."""
        ...

    def publish_post(self, slug: str) -> None:
        """Publish the post identified by *slug*."""
        ...


def dry_run_publish(slug: str, base_url: str) -> PublishResult:
    """Build the result reported for a dry run; no client is called."""
    return PublishResult(
        mode="dry-run",
        status="ok",
        slug=slug,
        blog_url=f"{base_url.rstrip('/')}/posts/{slug}",
        public_ok=True,
    )


def publish_markdown(
    *,
    title: str,
    markdown: str,
    tags: list[str],
    slug: str,
    base_url: str,
    mode: str,
    markdown_report: dict[str, Any],
    client: BlogClient | None,
) -> PublishResult:
    """Create (and optionally publish) the report post.

    Args:
        title: Post title.
        markdown: Post body.
        tags: Post tags.
        slug: Requested slug.
        base_url: Blog base URL; a trailing slash is ignored.
        mode: "dry-run" skips the client entirely; "publish" creates and then
            publishes; any other mode only creates the post.
        markdown_report: Validation report; a non-empty "blocking_errors"
            entry blocks publishing.
        client: Blog client, required unless the run is blocked or a dry run.

    Returns:
        A PublishResult. Client errors are captured in ``error`` rather than
        raised.
    """
    root = base_url.rstrip("/")
    blog_url = f"{root}/posts/{slug}"

    blocking_errors = markdown_report.get("blocking_errors", []) or []
    if blocking_errors:
        return PublishResult(
            mode=mode,
            status="blocked",
            slug=slug,
            blog_url=blog_url,
            public_ok=False,
            # str() keeps a non-string entry from raising inside join().
            error=";".join(str(entry) for entry in blocking_errors),
        )
    if mode == "dry-run":
        return dry_run_publish(slug, base_url)
    if client is None:
        return PublishResult(
            mode=mode,
            status="failed",
            slug=slug,
            blog_url=blog_url,
            public_ok=False,
            error="missing_blog_client",
        )

    payload = {"title": title, "content": markdown, "tags": tags, "slug": slug}
    # Tracked outside the try block so that a failure in publish_post still
    # reports the slug of the post that create_post already created.
    created_slug = slug
    try:
        create_resp = client.create_post(payload)
        if isinstance(create_resp, dict):
            created_slug = create_resp.get("slug") or slug
        if mode == "publish":
            client.publish_post(created_slug)
    except Exception as exc:
        # Best-effort boundary: the pipeline records the failure in its report.
        return PublishResult(
            mode=mode,
            status="failed",
            slug=created_slug,
            blog_url=f"{root}/posts/{created_slug}",
            public_ok=False,
            error=f"{type(exc).__name__}: {exc}",
        )

    return PublishResult(
        mode=mode,
        status="ok",
        slug=created_slug,
        blog_url=f"{root}/posts/{created_slug}",
        public_ok=mode == "publish",
    )
[items[index : index + size] for index in range(0, len(items), size)] + + +def _build_prompt(batch: list[NewsItem]) -> str: + payload = { + "task": ( + "Rewrite AI news titles and summaries into concise Chinese. Preserve brand/model/API names " + "such as GPT-5, Codex, Gemini, Claude, API, MCP. Do not add facts." + ), + "items": [ + { + "id": item.id, + "title_raw": item.title_raw, + "summary_raw": item.summary_raw, + "source": item.source_label, + "language_hint": item.language_hint, + } + for item in batch + ], + "output_schema": { + "rewrites": [ + { + "id": "item id", + "title": "display title", + "summary": "display summary", + "flags": [], + } + ] + }, + } + return json.dumps(payload, ensure_ascii=False) + + +def _fallback(item: NewsItem) -> None: + item.title = item.title_raw + item.summary = item.summary_raw or "该条目暂无摘要。" + + +def _apply_rewrite_batch(batch: list[NewsItem], llm_call: RewriteLlmCall) -> int: + obj = parse_json_object(llm_call(_build_prompt(batch))) + rewrites = obj.get("rewrites", []) + if not isinstance(rewrites, list): + raise ValueError("rewrites is not a list") + by_id = {item.id: item for item in batch} + seen_ids: set[str] = set() + for entry in rewrites: + item_id = entry.get("id") + title = str(entry.get("title") or "").strip() + summary = str(entry.get("summary") or "").strip() + if item_id in by_id and title and summary: + by_id[item_id].title = title + by_id[item_id].summary = summary + seen_ids.add(item_id) + for item in batch: + if item.id not in seen_ids: + raise ValueError(f"missing_rewrite_for_item: {item.id}") + return len(seen_ids) + + +def rewrite_items( + items: list[NewsItem], + *, + llm_call: RewriteLlmCall, + batch_size: int = 10, +) -> tuple[list[NewsItem], dict[str, Any]]: + rewritten_count = 0 + fallback_count = 0 + errors: list[str] = [] + + for batch in _chunks(items, max(1, batch_size)): + try: + rewritten_count += _apply_rewrite_batch(batch, llm_call) + except Exception as exc: + 
errors.append(f"batch:{type(exc).__name__}: {exc}") + for item in batch: + try: + rewritten_count += _apply_rewrite_batch([item], llm_call) + except Exception as item_exc: + errors.append(f"item:{item.id}:{type(item_exc).__name__}: {item_exc}") + _fallback(item) + fallback_count += 1 + + report = { + "input_count": len(items), + "rewritten_count": rewritten_count, + "fallback_count": fallback_count, + "batch_count": len(_chunks(items, max(1, batch_size))), + "errors": errors, + } + return items, report diff --git a/ai_daily_report/runner.py b/ai_daily_report/runner.py new file mode 100644 index 0000000..295316c --- /dev/null +++ b/ai_daily_report/runner.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +import json +from dataclasses import asdict, is_dataclass +from pathlib import Path +from typing import Any + +from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text +from .config import load_source_configs +from .env import load_env, resolve_blog_token, resolve_llm_config +from .models import SourceConfig +from .pipeline import run_stage0_to_stage8 +from .sources.registry import get_source_fetcher + + +def _json_default(value: Any): + if is_dataclass(value): + return asdict(value) + raise TypeError(f"Object is not JSON serializable: {type(value).__name__}") + + +def _mock_source_configs() -> list[SourceConfig]: + return [SourceConfig(name="Mock AI HOT", type="mock", role="primary", priority=10)] + + +def _mock_fetcher(config: SourceConfig, run_date: str) -> list[dict[str, Any]]: + return [ + { + "title_raw": "GPT-5 API 发布", + "summary_raw": "OpenAI 发布 GPT-5 API,用于本地 mock 测试。", + "url": "https://example.com/gpt5", + "source_label": "OpenAI:Blog", + "section_hint": "模型发布/更新", + "origin_type": "mock", + "language_hint": "zh", + } + ] + + +def _mock_semantic_llm(prompt: str) -> str: + return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []}, ensure_ascii=False) + + +def _mock_rewrite_llm(prompt: 
str) -> str: + payload = json.loads(prompt) + return json.dumps( + { + "rewrites": [ + { + "id": item["id"], + "title": item["title_raw"], + "summary": item["summary_raw"], + "flags": [], + } + for item in payload["items"] + ] + }, + ensure_ascii=False, + ) + + +def _mock_guide_llm(prompt: str) -> str: + payload = json.loads(prompt) + item_ids = [item["id"] for item in payload["items"][:3]] + return json.dumps( + { + "theme": "本地 mock 模式已生成 AI 日报,用于验证流水线。", + "threads": [ + { + "title": "本地链路验证", + "text": "采集、改写、分类、导览、Markdown 和发布报告都已通过 mock 数据串联。", + "item_ids": item_ids, + "kind": "thread", + } + ], + }, + ensure_ascii=False, + ) + + +def run_daily_report( + *, + run_date: str, + mode: str, + source_mode: str, + llm_mode: str, + out_dir: Path, + base_url: str, + sources_path: Path | None = None, + fetch_text=None, + env: dict[str, str] | None = None, + llm_client_factory=OpenAICompatibleClient, + blog_client_factory=BlogApiClient, +) -> dict[str, Any]: + fetch_text = fetch_text or default_fetch_text + env = env if env is not None else load_env() + + if source_mode == "mock": + source_configs = _mock_source_configs() + fetcher = _mock_fetcher + elif source_mode == "live": + if sources_path is None: + sources_path = Path("config") / "sources.json" + source_configs = load_source_configs(sources_path) + + def fetcher(config: SourceConfig, current_date: str) -> list[dict[str, Any]]: + source_fetcher = get_source_fetcher(config.type) + return source_fetcher(config, current_date, fetch_text) + + else: + raise ValueError("source_mode must be 'mock' or 'live'") + + if llm_mode == "mock": + semantic_llm_call = _mock_semantic_llm + rewrite_llm_call = _mock_rewrite_llm + guide_llm_call = _mock_guide_llm + elif llm_mode == "live": + llm_client = llm_client_factory(**resolve_llm_config(env)) + semantic_llm_call = llm_client.chat + rewrite_llm_call = llm_client.chat + guide_llm_call = llm_client.chat + else: + raise ValueError("llm_mode must be 'mock' or 'live'") + + 
blog_client = None + if mode in ("draft", "publish"): + token = resolve_blog_token(env) + if not token: + raise ValueError("missing_blog_token: set BLOG_SERVICE_TOKEN or EPHRON_SERVICE_TOKEN") + blog_client = blog_client_factory(base_url=base_url, token=token) + + result = run_stage0_to_stage8( + source_configs, + run_date, + fetcher=fetcher, + semantic_llm_call=semantic_llm_call, + rewrite_llm_call=rewrite_llm_call, + guide_llm_call=guide_llm_call, + mode=mode, + base_url=base_url, + client=blog_client, + ) + + run_dir = out_dir / run_date + run_dir.mkdir(parents=True, exist_ok=True) + (run_dir / "blog_markdown.md").write_text(result["markdown"], encoding="utf-8") + (run_dir / "run_report.json").write_text( + json.dumps(result["reports"], ensure_ascii=False, indent=2, default=_json_default), + encoding="utf-8", + ) + return { + "run_dir": str(run_dir), + "markdown": result["markdown"], + "reports": result["reports"], + "publish": result["publish"], + } diff --git a/ai_daily_report/semantic_dedupe.py b/ai_daily_report/semantic_dedupe.py new file mode 100644 index 0000000..815d298 --- /dev/null +++ b/ai_daily_report/semantic_dedupe.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +import json +from typing import Any, Callable + +from .llm import parse_json_object +from .models import NewsItem + + +SemanticLlmCall = Callable[[str], str] + + +def _build_prompt(items: list[NewsItem], candidates: list[dict[str, Any]]) -> str: + item_payload = [ + { + "id": item.id, + "title": item.title or item.title_raw, + "summary": item.summary or item.summary_raw, + "source": item.source_label, + "section_hint": item.section_hint, + } + for item in items + ] + prompt = { + "task": "Identify only high-confidence semantic duplicates. 
Do not curate or remove by importance.", + "items": item_payload, + "candidates": candidates, + "output_schema": { + "duplicate_groups": [ + { + "keep_id": "item id", + "remove_ids": ["item id"], + "confidence": "high|medium|low", + "reason": "same concrete event reason", + } + ], + "not_duplicates": [], + "uncertain": [], + }, + } + return json.dumps(prompt, ensure_ascii=False) + + +def _score(item: NewsItem) -> int: + score = max(0, 200 - item.source_priority) + if item.source_role == "primary": + score += 10 + if item.summary_raw: + score += min(40, len(item.summary_raw)) + if item.canonical_url: + score += 20 + score -= len(item.quality_flags) * 10 + return score + + +def _choose_keep(group_items: list[NewsItem], suggested_keep_id: str) -> NewsItem: + suggested = [item for item in group_items if item.id == suggested_keep_id] + if suggested: + best = max(group_items, key=_score) + if _score(suggested[0]) >= _score(best) - 10: + return suggested[0] + return max(group_items, key=_score) + + +def semantic_dedup_items( + items: list[NewsItem], + candidates: list[dict[str, Any]], + *, + llm_call: SemanticLlmCall, + max_deletion_ratio: float = 0.5, +) -> tuple[list[NewsItem], dict[str, Any]]: + if not items or not candidates: + return items, { + "input_count": len(items), + "candidate_group_count": len(candidates), + "removed_count": 0, + "duplicate_groups": [], + "uncertain": [], + "errors": [], + "skipped_for_deletion_ratio": False, + } + + errors: list[str] = [] + try: + obj = parse_json_object(llm_call(_build_prompt(items, candidates))) + except Exception as exc: + return items, { + "input_count": len(items), + "candidate_group_count": len(candidates), + "removed_count": 0, + "duplicate_groups": [], + "uncertain": [], + "errors": [f"{type(exc).__name__}: {exc}"], + "skipped_for_deletion_ratio": False, + } + + by_id = {item.id: item for item in items} + candidate_sets = { + frozenset(item_id for item_id in candidate.get("item_ids", []) if isinstance(item_id, str)) 
+ for candidate in candidates + } + candidate_removals: set[str] = set() + valid_groups: list[dict[str, Any]] = [] + + for group in obj.get("duplicate_groups", []) or []: + if group.get("confidence") != "high": + continue + ids = [group.get("keep_id")] + list(group.get("remove_ids") or []) + if any(not isinstance(item_id, str) or item_id not in by_id for item_id in ids): + errors.append(f"invalid_ids_in_group: {group}") + continue + group_set = frozenset(ids) + if not any(group_set.issubset(candidate_set) for candidate_set in candidate_sets): + errors.append(f"group_outside_candidates: {group}") + continue + group_items = [by_id[item_id] for item_id in ids] + keep = _choose_keep(group_items, str(group.get("keep_id"))) + remove_items = [item for item in group_items if item is not keep] + candidate_removals.update(item.id for item in remove_items) + valid_groups.append( + { + "keep_id": keep.id, + "remove_ids": [item.id for item in remove_items], + "confidence": "high", + "reason": str(group.get("reason") or "semantic_duplicate"), + } + ) + + deletion_ratio = len(candidate_removals) / len(items) if items else 0 + if deletion_ratio > max_deletion_ratio: + return items, { + "input_count": len(items), + "candidate_group_count": len(candidates), + "removed_count": 0, + "duplicate_groups": valid_groups, + "uncertain": obj.get("uncertain", []) or [], + "errors": errors, + "skipped_for_deletion_ratio": True, + } + + removed_ids: set[str] = set() + for group in valid_groups: + keep = by_id[group["keep_id"]] + for remove_id in group["remove_ids"]: + removed = by_id[remove_id] + keep.duplicate_sources.append( + { + "id": removed.id, + "source_group": removed.source_group, + "source_label": removed.source_label, + "url": removed.url, + "reason": group["reason"], + } + ) + removed_ids.add(remove_id) + + deduped = [item for item in items if item.id not in removed_ids] + report = { + "input_count": len(items), + "candidate_group_count": len(candidates), + "removed_count": 
len(removed_ids), + "duplicate_groups": valid_groups, + "uncertain": obj.get("uncertain", []) or [], + "errors": errors, + "skipped_for_deletion_ratio": False, + } + return deduped, report diff --git a/ai_daily_report/sources/__init__.py b/ai_daily_report/sources/__init__.py new file mode 100644 index 0000000..54ac9e1 --- /dev/null +++ b/ai_daily_report/sources/__init__.py @@ -0,0 +1,2 @@ +"""Source adapters for the AI daily report pipeline.""" + diff --git a/ai_daily_report/sources/aihot.py b/ai_daily_report/sources/aihot.py new file mode 100644 index 0000000..9c13d55 --- /dev/null +++ b/ai_daily_report/sources/aihot.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import json +from typing import Any, Callable + +from ai_daily_report.models import SourceConfig + + +FetchText = Callable[[str, int], str] + + +def fetch_aihot(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]: + data = json.loads(fetch_text(f"https://aihot.virxact.com/api/public/daily/{run_date}", config.timeout_seconds)) + items: list[dict[str, Any]] = [] + generated = data.get("generatedAt") + for section in data.get("sections", []) or []: + for raw in section.get("items", []) or []: + items.append( + { + "source_group": config.name, + "source_label": raw.get("sourceName") or config.name, + "title_raw": raw.get("title") or "", + "summary_raw": raw.get("summary") or "", + "url": raw.get("sourceUrl") or "", + "published_at": generated, + "origin_type": "aihot_json", + "section_hint": section.get("label") or "", + "language_hint": "zh", + } + ) + return items + diff --git a/ai_daily_report/sources/juya.py b/ai_daily_report/sources/juya.py new file mode 100644 index 0000000..533fbbf --- /dev/null +++ b/ai_daily_report/sources/juya.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import re +import xml.etree.ElementTree as ET +from typing import Any, Callable + +from ai_daily_report.models import SourceConfig +from ai_daily_report.normalize import 
clean_text +from ai_daily_report.sources.labels import source_label_from_url + + +FetchText = Callable[[str, int], str] + + +def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]: + root = ET.fromstring(xml_text) + channel = root.find("channel") + raw_items = channel.findall("item") if channel is not None else [] + article_html = "" + for raw in raw_items: + if (raw.findtext("title") or "").strip() != run_date: + continue + content_el = raw.find("{http://purl.org/rss/1.0/modules/content/}encoded") + article_html = content_el.text if content_el is not None and content_el.text else "" + break + if not article_html: + return [] + + block_pattern = re.compile( + r']*>\s*(?:]*href="(?P[^"]+)"[^>]*>)?(?P[^<]*?)?\s*#(?P\d+)\s*(?P.*?)(?=\s*提示|$)', + re.S | re.I, + ) + items: list[dict[str, Any]] = [] + for match in block_pattern.finditer(article_html): + title = clean_text(match.group("title_html") or "") + body_html = match.group("body") or "" + links = re.findall(r']*href="([^"]+)"[^>]*>', body_html, re.I) + url = links[0].replace("&", "&").strip() if links else (match.group("title_url") or "") + summary = clean_text(re.sub(r"<[^>]+>", " ", body_html)) + if title: + items.append( + { + "source_group": config.name, + "source_label": source_label_from_url(url, fallback=config.name), + "title_raw": title, + "summary_raw": summary[:500], + "url": url, + "published_at": None, + "origin_type": "juya_issue", + "section_hint": "", + "language_hint": "zh", + } + ) + return items + + +def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]: + return parse_juya_rss(config, fetch_text(config.url, config.timeout_seconds), run_date) diff --git a/ai_daily_report/sources/labels.py b/ai_daily_report/sources/labels.py new file mode 100644 index 0000000..26122dc --- /dev/null +++ b/ai_daily_report/sources/labels.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from urllib.parse import urlparse + + 
# Display label for a host; a key also matches any of its subdomains.
DOMAIN_LABELS = {
    "anthropic.com": "Anthropic",
    "arxiv.org": "arXiv",
    "bloomberg.com": "Bloomberg",
    "deepseek.com": "DeepSeek",
    "github.blog": "GitHub Blog",
    "github.com": "GitHub",
    "huggingface.co": "Hugging Face",
    "infoq.com": "InfoQ",
    "mp.weixin.qq.com": "微信公众号",
    "openai.com": "OpenAI",
    "platform.minimaxi.com": "MiniMax:Docs",
    "qbitai.com": "量子位",
    "techcrunch.com": "TechCrunch",
    "technologyreview.com": "MIT科技评论AI",
    "theverge.com": "The Verge",
    "x.com": "X",
    "twitter.com": "X",
}

# Display names for known X accounts, keyed by handle.
X_DISPLAY_NAMES = {
    "MiniMax_AI": "MiniMax",
    "OpenAIDevs": "OpenAI Developers",
    "openai": "OpenAI",
    "openclaw": "OpenClaw",
    "xai": "xAI",
    "krea_ai": "Krea AI",
    "nvidia": "NVIDIA",
    "NVIDIAAI": "NVIDIA AI",
    "alibaba_cloud": "阿里云 / Alibaba Cloud",
    "cb_doge": "cb_doge",
}

# First path segments on X that name site pages rather than accounts.
_X_RESERVED_PATHS = frozenset(
    {"i", "search", "explore", "settings", "notifications", "home", "compose"}
)


def _host(url: str) -> str:
    """Return the lower-cased host of *url* without port, userinfo or "www."."""
    # .hostname (unlike .netloc) excludes "user:pass@" and ":port", which
    # would otherwise defeat the exact-match lookups below.
    host = urlparse(url).hostname or ""
    return host[4:] if host.startswith("www.") else host


def _domain_label(host: str) -> str:
    """Return the configured label for *host* or a parent domain, else *host*."""
    for domain, label in DOMAIN_LABELS.items():
        if host == domain or host.endswith("." + domain):
            return label
    return host


def _x_handle(url: str) -> str:
    """Return the account handle from an X URL path, or "" if there is none."""
    parts = [part for part in urlparse(url).path.split("/") if part]
    if not parts:
        return ""
    handle = parts[0]
    # Compared case-insensitively so "/Home" is not mistaken for an account.
    if handle.lower() in _X_RESERVED_PATHS:
        return ""
    return handle


def _x_display_name(handle: str) -> str:
    """Return the display name for *handle*, matching case-insensitively."""
    exact = X_DISPLAY_NAMES.get(handle)
    if exact:
        return exact
    lowered = handle.lower()
    for known, name in X_DISPLAY_NAMES.items():
        if known.lower() == lowered:
            return name
    return handle


def source_label_from_url(url: str, *, fallback: str = "来源") -> str:
    """Derive a human-readable source label from an item URL.

    Args:
        url: Item URL; may be empty.
        fallback: Label returned when the URL is empty or has no host.

    Returns:
        "X:<display> (@<handle>)" for X/Twitter account URLs, "X" for other
        X/Twitter URLs, "<label>:Blog" when the host or path looks like a
        blog or research page, otherwise the domain label (or the bare host
        when the domain is not configured).
    """
    if not url:
        return fallback

    host = _host(url)
    if host in {"x.com", "twitter.com"}:
        handle = _x_handle(url)
        if handle:
            return f"X:{_x_display_name(handle)} (@{handle})"
        return "X"

    label = _domain_label(host)
    path = (urlparse(url).path or "").lower()
    if label and ("blog" in host or "/blog" in path or "/research" in path):
        return f"{label}:Blog"
    return label or fallback
def validate_markdown(markdown: str, items: list[NewsItem]) -> dict[str, Any]:
    """Check the assembled Markdown and its items before publishing.

    Args:
        markdown: Assembled report text; ``None`` is treated as empty.
        items: Items that were rendered; ``None`` is treated as empty.

    Returns:
        A report dict with "item_count", "section_count", "markdown_length",
        "auto_fixes", "warnings" and "blocking_errors". A non-empty
        "blocking_errors" list means the report must not be published.
    """
    text = markdown or ""
    items = items or []
    blocking_errors: list[str] = []
    auto_fixes: list[str] = []
    warnings: list[dict[str, str]] = []

    if not items:
        blocking_errors.append("no_items")
    if len(text.strip()) < 80:
        blocking_errors.append("markdown_too_short")
    if items and "## " not in text:
        blocking_errors.append("no_sections")
    # A brace pair with no nested braces is treated as leaked JSON output.
    if re.search(r"\{[^{}]*\}", text):
        blocking_errors.append("json_fragment_detected")
    if "> >" in text:
        auto_fixes.append("double_blockquote_detected")
    if re.search(r"\[\d+\]|\[N\]", text):
        auto_fixes.append("reference_marker_detected")

    invalid_section_reported = False
    for item in items:
        if not item.url:
            warnings.append({"type": "missing_url", "item_id": item.id})
        # Report an invalid section only once, but keep iterating so that
        # missing-URL warnings for the remaining items are not lost.
        if not invalid_section_reported and item.section not in SECTION_ORDER:
            blocking_errors.append("invalid_section")
            invalid_section_reported = True

    return {
        "item_count": len(items),
        "section_count": len({item.section for item in items if item.section}),
        "markdown_length": len(text),
        "auto_fixes": auto_fixes,
        "warnings": warnings,
        "blocking_errors": blocking_errors,
    }
"https://www.technologyreview.com/topic/artificial-intelligence/feed", + "role": "supplement", + "required": false, + "priority": 50, + "timeout_seconds": 25, + "retries": 1, + "enabled": true + }, + { + "name": "量子位", + "type": "rss", + "url": "https://www.qbitai.com/feed", + "role": "supplement", + "required": false, + "priority": 30, + "timeout_seconds": 25, + "retries": 1, + "enabled": true + }, + { + "name": "橘鸦AI早报", + "type": "juya_rss", + "url": "https://imjuya.github.io/juya-ai-daily/rss.xml", + "role": "supplement", + "required": false, + "priority": 20, + "timeout_seconds": 45, + "retries": 2, + "enabled": true + } +] + diff --git a/docs/pipeline-optimization-plan.md b/docs/pipeline-optimization-plan.md new file mode 100644 index 0000000..00158bb --- /dev/null +++ b/docs/pipeline-optimization-plan.md @@ -0,0 +1,786 @@ +# AI Daily Report Pipeline Optimization Plan + +## Objective + +This project should become a stable, long-running AI daily report system for Hermes, OpenClaw, and similar agents. The goal is not only to keep the current script runnable, but to make the whole pipeline observable, replayable, maintainable, and safe to run on a daily schedule. + +The recommended direction is: + +```text +stable core library + CLI + skill wrapper +``` + +Core business logic should live in deterministic code. The skill should describe how agents run, diagnose, replay, publish, and extend the pipeline. + +## Stage Model + +Use this stage model going forward: + +```text +Stage 0: Collect Sources +Stage 1: Normalize Items +Stage 2: Hard Dedup +Stage 3: Semantic Dedup +Stage 4: Rewrite Titles and Summaries +Stage 5: Classify and Order +Stage 6: Guide and Daily Threads +Stage 7: Assemble and Validate Markdown +Stage 8: Publish and Deliver +``` + +The current script names script-level deduplication as Stage 0. That should be treated as old terminology. In the long-term pipeline, the first stage is source collection. 
+ +## Architecture + +Recommended structure: + +```text +ai-daily-report/ +├── ai_daily_report/ +│ ├── models.py +│ ├── sources/ +│ │ ├── aihot.py +│ │ ├── rss.py +│ │ ├── juya.py +│ │ └── registry.py +│ ├── collect.py +│ ├── normalize.py +│ ├── dedupe.py +│ ├── llm.py +│ ├── rewrite.py +│ ├── classify.py +│ ├── assemble.py +│ ├── validate.py +│ ├── publish.py +│ └── cli.py +├── config/ +│ ├── sources.json +│ └── pipeline.json +├── docs/ +├── skill/ +│ ├── SKILL.md +│ ├── scripts/ +│ └── references/ +├── tests/ +│ └── fixtures/ +└── script/ + └── ai_daily_blog_pipeline.py +``` + +Keep `script/ai_daily_blog_pipeline.py` as a compatibility entrypoint during migration, but move implementation into importable modules. + +## Data Model + +### SourceResult + +Every data source should return a structured result: + +```json +{ + "source": "AI HOT", + "role": "primary", + "ok": true, + "status": "ok", + "items": [], + "error": null, + "elapsed_ms": 820, + "retry_count": 0, + "fetched_at": "2026-06-04T10:00:00+08:00" +} +``` + +Supported statuses: + +```text +ok +empty +not_ready +timeout +http_error +parse_error +disabled +``` + +### NewsItem + +All raw source items should be normalized into one structure: + +```json +{ + "id": "item_...", + "source_group": "AI HOT", + "source_label": "OpenAI: Blog", + "source_role": "primary", + "source_priority": 10, + "title_raw": "...", + "title_norm": "...", + "summary_raw": "...", + "title": null, + "summary": null, + "url": "...", + "canonical_url": "...", + "published_at": "...", + "collected_at": "...", + "origin_type": "aihot_json", + "section_hint": "...", + "section": null, + "language_hint": "zh", + "quality_flags": [], + "duplicate_sources": [] +} +``` + +Do not overwrite raw fields with LLM output. Keep display fields separate. + +## Stage 0: Collect Sources + +### Goal + +Collect candidate news from all configured sources in a stable, observable, and recoverable way. 
+ +### Design + +Use a primary-plus-supplement model at the quality layer, and parallel execution at the execution layer. + +```text +Quality layer: +AI HOT = primary source +RSS / Juya / InfoQ / QbitAI / MIT = supplement sources + +Execution layer: +start all sources concurrently with per-source timeout, retry, and reporting +``` + +### Source Config + +Example: + +```json +{ + "name": "AI HOT", + "type": "aihot", + "role": "primary", + "required": true, + "priority": 10, + "timeout_seconds": 20, + "retries": 2, + "min_items": 10, + "enabled": true +} +``` + +Supplement source example: + +```json +{ + "name": "Juya AI Daily", + "type": "juya_rss", + "url": "https://imjuya.github.io/juya-ai-daily/rss.xml", + "role": "supplement", + "required": false, + "priority": 20, + "timeout_seconds": 45, + "retries": 2, + "enabled": true +} +``` + +### Optimizations + +- Run supplement sources concurrently. +- Do not let one slow source block the whole pipeline. +- Replace the fixed Juya `sleep(120)` with bounded short retries and a clear `not_ready` or `timeout` status. +- Treat AI HOT 404 as "not ready" rather than a generic failure. +- Allow degraded generation if the primary source has a temporary network failure and supplement sources are usable. +- Persist raw source results for replay. + +### Artifacts + +```text +source_results.json +raw_items.json +stage0_collect_report.json +``` + +## Stage 1: Normalize Items + +### Goal + +Convert heterogeneous source output into clean, comparable, traceable `NewsItem` objects. + +### Optimizations + +- Normalize text with HTML stripping, entity decoding, whitespace cleanup, and RSS boilerplate removal. +- Generate stable `id` values from canonical URL when possible, otherwise from source, normalized title, and date. +- Canonicalize URLs: + - Lowercase scheme and host. + - Remove `utm_*`, `fbclid`, `gclid`, `spm`, `from`, and fragments. + - Normalize trailing slashes. + - Normalize `twitter.com` and `x.com` URLs.
+- Generate `title_norm`: + - Unicode NFKC normalization. + - Lowercase English text. + - Normalize whitespace and weak punctuation. + - Preserve numbers, versions, model names, and product names. +- Standardize source labels: + - X links as `X:@username`. + - Official blogs as `OpenAI: Blog`, `Google Research: Blog`, etc. + - Avoid generic labels such as "technology media" when a domain label is available. +- Add `quality_flags` instead of silently dropping items: + - `missing_url` + - `missing_summary` + - `short_title` + - `bad_url` + - `old_item` + - `parse_suspect` + +### Non-goals + +- Do not dedupe. +- Do not rewrite content. +- Do not call the LLM. +- Do not remove items based on importance. + +### Artifacts + +```text +normalized_items.json +stage1_normalize_report.json +``` + +## Stage 2: Hard Dedup + +### Goal + +Remove only high-confidence duplicates with deterministic rules. Mark uncertain similarities for Stage 3. + +### Rules + +High-confidence removal: + +- Same canonical URL. +- Same normalized title. +- Same platform entity, such as the same X status ID. +- Same source and same exact normalized title. + +Uncertain cases: + +- Similar title but different URL. +- Same company or model, but unclear whether the event is identical. +- Same topic across multiple sources with different factual details. + +Uncertain cases should go to `possible_duplicates`, not be removed. + +### Replacement for Current Logic + +The current `SequenceMatcher > 0.7` direct deletion is too risky. Replace it with: + +- Exact deterministic deletion. +- Similarity-based candidate marking only. + +### Keep Item Selection + +When merging a duplicate group, choose the item with a local score: + +```text +official source bonus ++ primary source bonus ++ source priority ++ has URL ++ has summary ++ has section hint ++ newer published_at +- quality flag penalty +``` + +Attach removed items to `duplicate_sources` on the kept item. 
+ +### Artifacts + +```text +deduped_items.json +stage2_dedupe_report.json +``` + +## Stage 3: Semantic Dedup + +### Goal + +Use the LLM to identify semantic duplicates that deterministic rules cannot safely remove. + +### Principles + +- The LLM judges duplicate candidates; local code enforces safety. +- The LLM must not select, curate, or remove items by importance. +- Only remove `confidence = high` duplicate groups. +- Treat medium or uncertain results as non-removal. + +### Input + +Prefer candidate groups from Stage 2. Avoid sending all items at once unless the item count is small. + +Example item payload: + +```json +{ + "id": "item_123", + "title": "...", + "summary": "...", + "source": "QbitAI", + "url_host": "qbitai.com", + "published_at": "...", + "section_hint": "Company and Capital" +} +``` + +### Output Schema + +```json +{ + "duplicate_groups": [ + { + "keep_id": "item_123", + "remove_ids": ["item_456"], + "confidence": "high", + "reason": "Both items report the same concrete event." + } + ], + "not_duplicates": [], + "uncertain": [] +} +``` + +### Safety Checks + +- Validate all IDs exist. +- Validate confidence values. +- Apply local keep-item scoring instead of blindly trusting `keep_id`. +- Skip deletion if the deletion ratio exceeds a configured threshold. +- Skip deletion when versions, product names, or dates conflict. + +### Failure Behavior + +If timeout, JSON parse failure, or schema validation failure occurs, keep Stage 2 output and continue. + +### Artifacts + +```text +semantic_dedup_input.json +semantic_dedup_output.json +stage3_semantic_dedup_report.json +``` + +## Stage 4: Rewrite Titles and Summaries + +### Goal + +Produce concise, accurate Chinese display titles and summaries. + +### Rules + +- Keep `title_raw` and `summary_raw` unchanged. +- Write display fields to `title` and `summary`. +- Preserve brand names, model names, API names, and common technical acronyms in English. +- Translate the rest into natural Chinese. 
+- Avoid marketing words such as "heavyweight", "explosive", or "just now" unless they are factual and necessary. +- Summaries should be factual, concise, and usually 80-140 Chinese characters. +- Do not add facts not present in the raw title or summary. +- Do not write advice or commentary. + +### Batch Strategy + +- Process 8-12 items per batch. +- Allow limited parallel batches. +- Retry a failed batch once. +- Fall back per item or per batch if needed. + +### Validation + +Check: + +- Non-empty title and summary. +- No markdown links in title. +- No URL in summary. +- No `[N]` or reference markers. +- No emoji. +- Summary length under limit. +- Key numbers, versions, and model names are preserved when present in raw input. + +### Artifacts + +```text +rewritten_items.json +rewrite_llm_outputs.json +stage4_rewrite_report.json +``` + +## Stage 5: Classify and Order + +### Goal + +Place each item into a stable section and order items for readable scanning. + +### Recommended Sections + +Use a fixed section whitelist: + +```text +模型与能力 +产品与应用 +开发与基础设施 +公司与资本 +政策与安全 +论文与研究 +观点与教程 +人物与动态 +``` + +Hide empty sections. Do not create dynamic section names. + +### Classification Strategy + +Use a three-layer approach: + +1. Source hint mapping. +2. Local rule fallback. +3. LLM classification for ambiguous items only. + +Example alias mapping: + +```text +模型发布/更新 -> 模型与能力 +产品发布/更新 -> 产品与应用 +产品与工具 -> 产品与应用 +开发与工程 -> 开发与基础设施 +行业动态 -> 公司与资本 +行业与公司 -> 公司与资本 +论文研究 -> 论文与研究 +技巧与观点 -> 观点与教程 +人物与花絮 -> 人物与动态 +``` + +### Ordering Strategy + +Do not let the LLM freely order all items. Use local scoring: + +```text +rank_score = + source priority + + official source bonus + + primary source bonus + + recency score + + key metric bonus + + duplicate source bonus + - quality flag penalty +``` + +Ordering is for readability only. It must not remove items. 
+ +### Artifacts + +```text +classified_items.json +stage5_classify_order_report.json +``` + +## Stage 6: Guide and Daily Threads + +### Goal + +Generate a concise top guide and a bottom "daily threads" section that helps readers understand the day's shape without turning the report into an investment memo. + +### Replace Current Summary Style + +Do not use: + +```text +强信号 / 中信号 / 待验证 +``` + +This style feels too much like an industry rating or investment brief. + +Use: + +```text +导览 +今日脉络 +仍待确认, when needed +``` + +### Output Schema + +The LLM should output structured JSON, not Markdown: + +```json +{ + "theme": "One concise daily theme.", + "threads": [ + { + "title": "模型能力继续向长上下文、实时语音、多模态生成推进", + "text": "MiniMax M3、Miso One、Ideogram v4.0 分别从长上下文解码、语音克隆和图像生成质量上更新能力边界。", + "item_ids": ["item_1", "item_2", "item_3"], + "kind": "thread" + }, + { + "title": "仍待确认", + "text": "融资传闻、排行榜和单源爆料类消息需要等待官方或更多来源确认。", + "item_ids": ["item_8"], + "kind": "uncertain" + } + ] +} +``` + +### Rules + +- Theme should be one paragraph under 120 Chinese characters. +- The `threads` list should contain 2-4 entries. +- Each thread must bind to existing `item_ids`. +- Do not add facts absent from the item list. +- Do not write advice. +- Do not include reference numbers. +- Do not include Markdown blockquote syntax. Stage 7 will render Markdown. + +### Failure Behavior + +- If theme generation fails, omit the guide or use a conservative fallback. +- If threads fail, omit `今日脉络`. +- A thread that references unknown `item_ids` should be dropped. + +### Artifacts + +```text +guide_input.json +guide_output.json +stage6_guide_report.json +``` + +## Stage 7: Assemble and Validate Markdown + +### Goal + +Render final Markdown deterministically and validate it before publishing. + +### Recommended Structure + +```markdown +## 导览 + +> 一句话主线。 + +## 模型与能力 + +**1. 新闻标题** + +> 新闻摘要。[来源 ↗](https://example.com) + +## 今日脉络 + +- **主题** + 说明... +``` + +### Rendering Rules + +- Render Markdown in code only.
+- Use global continuous numbering. +- Hide empty sections. +- Add blockquote syntax for the guide in code. +- Strip any leading `>` from LLM-provided theme text before rendering. +- Use source links consistently: + +```markdown +[OpenAI: Blog ↗](https://example.com) +``` + +If URL is unavailable, render the source label without a link. + +### Auto-fixes + +- Remove `> >`. +- Remove `[N]` and numeric reference markers. +- Remove code fences from guide/thread text. +- Normalize extra blank lines. +- Add missing Chinese punctuation to summaries. +- Remove `主线判断:` prefixes if present. + +### Blocking Checks + +Block publish or downgrade to draft when: + +- Item count is zero. +- No sections are rendered. +- Markdown is abnormally short. +- Section name is outside the whitelist. +- JSON fragments remain in Markdown. +- Link formatting is broadly broken. +- Forbidden advisory language appears in guide/thread text. + +### Artifacts + +```text +blog_markdown.md +stage7_markdown_report.json +``` + +## Stage 8: Publish and Deliver + +### Goal + +Publish only validated Markdown, verify the public page, and make the operation idempotent and recoverable. + +### Modes + +```text +dry-run +draft +publish +``` + +### Requirements + +- Do not publish when Stage 7 has blocking errors. +- Use a deterministic slug such as `ai-YYYY-MM-DD`. +- Check whether the slug already exists before creating a new post. +- Support existence strategies: + - `skip` + - `update-draft` + - `replace` + - `republish` +- Verify the public URL with retries. +- Preserve Markdown and reports when publishing fails. +- Support publishing from an existing run directory. 
+ +### Artifacts + +```text +stage8_publish_report.json +run_report.json +``` + +## Run Directory + +Every run should write to an isolated directory: + +```text +runs/2026-06-04/ + source_results.json + raw_items.json + stage0_collect_report.json + normalized_items.json + stage1_normalize_report.json + deduped_items.json + stage2_dedupe_report.json + semantic_dedup_output.json + stage3_semantic_dedup_report.json + rewritten_items.json + stage4_rewrite_report.json + classified_items.json + stage5_classify_order_report.json + guide_output.json + stage6_guide_report.json + blog_markdown.md + stage7_markdown_report.json + stage8_publish_report.json + run_report.json +``` + +This makes the pipeline replayable and debuggable. + +## CLI + +Provide agent-friendly commands: + +```bash +ai-daily-report run --date today --mode publish +ai-daily-report run --date today --mode dry-run +ai-daily-report run --date 2026-06-04 --mode draft +ai-daily-report replay --run-id 2026-06-04 --from-stage 4 +ai-daily-report publish --from-run 2026-06-04 +ai-daily-report status --date 2026-06-04 +``` + +The current cron can keep invoking the compatibility script, which should delegate to the CLI. + +## Skill Strategy + +Create or update an `ai-daily-report` skill for Hermes/OpenClaw. The skill should not contain business logic. It should provide: + +- How to run daily generation. +- How to dry-run. +- How to replay from an existing run. +- How to publish already generated Markdown. +- How to diagnose source, LLM, Markdown, or publish failures. +- How to add a new RSS source. +- How to adjust output style without breaking the pipeline. + +Suggested skill references: + +```text +skill/references/sources.md +skill/references/output-style.md +skill/references/troubleshooting.md +skill/references/llm-config.md +``` + +## Testing + +Add fixtures and tests for: + +- AI HOT sample parsing. +- RSS parsing. +- Juya `content:encoded` parsing. +- URL canonicalization. +- Title normalization. 
+- Deterministic deduplication. +- LLM JSON schema validation. +- Rewrite output validation. +- Section alias mapping. +- Markdown rendering. +- Markdown validation. +- Publish dry-run behavior. + +Start with local fixture tests. They will give most of the stability benefit without needing live network calls. + +## Migration Plan + +### Phase 1: Stabilize Current Script + +- Add run directories. +- Add SourceResult and stage reports. +- Add URL canonicalization. +- Replace risky Stage 0 dedupe with hard dedup. +- Add Markdown validation and auto-fixes. + +### Phase 2: Improve Quality + +- Add semantic dedup schema and safety checks. +- Batch rewrite title and summary. +- Add section alias mapping and rule-first classification. +- Replace the current summary with `今日脉络`. + +### Phase 3: Modularize + +- Extract modules under `ai_daily_report/`. +- Add CLI. +- Keep old script as compatibility entrypoint. +- Add fixture tests. + +### Phase 4: Skill Integration + +- Update `skill/SKILL.md`. +- Add references for sources, style, troubleshooting, and LLM config. +- Make Hermes/OpenClaw call the CLI. + +## Success Criteria + +The optimized pipeline should satisfy: + +- A usable Markdown report is generated whenever enough source data exists. +- Optional source failures degrade the run but do not stop it. +- LLM failures degrade individual stages but do not destroy the whole report. +- No non-duplicate item is removed by importance or editorial selection. +- Every removed duplicate has a reason. +- Every stage writes inspectable artifacts. +- A failed publish can be retried from an existing run. +- Agents can run, diagnose, replay, and publish via stable commands. 
diff --git a/docs/plans/2026-06-04-local-dry-run-foundation.md b/docs/plans/2026-06-04-local-dry-run-foundation.md new file mode 100644 index 0000000..a3d0359 --- /dev/null +++ b/docs/plans/2026-06-04-local-dry-run-foundation.md @@ -0,0 +1,159 @@ +# Local Dry-Run Foundation Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Make the current pipeline testable on a local machine without Hermes credentials, blog credentials, or live LLM calls. + +**Architecture:** Keep the existing single script as the compatibility entrypoint. Add small, tested helpers for project `.env` loading, dry-run token behavior, and mock LLM responses. This creates a safe base for later Stage 0-8 modularization. + +**Tech Stack:** Python standard library, `unittest`, current `script/ai_daily_blog_pipeline.py`. + +--- + +### Task 1: Add Local `.env` Loading + +**Files:** +- Modify: `script/ai_daily_blog_pipeline.py` +- Create: `tests/test_env_loading.py` + +**Step 1: Write the failing test** + +Test that `load_env()` reads project-root `.env` values when Hermes env is absent, and that real process environment variables override file values. + +**Step 2: Run test to verify it fails** + +Run: `python -m unittest tests.test_env_loading -v` + +Expected: FAIL because the script currently only reads `~/.hermes/.env`. + +**Step 3: Implement minimal code** + +Add a helper to parse env files and update `load_env()` to read: + +1. Project `.env` +2. `~/.hermes/.env` +3. process environment + +Later sources override earlier ones. + +**Step 4: Run test to verify it passes** + +Run: `python -m unittest tests.test_env_loading -v` + +Expected: PASS. 
+ +### Task 2: Let Dry-Run Skip Blog Token Requirement + +**Files:** +- Modify: `script/ai_daily_blog_pipeline.py` +- Create: `tests/test_dry_run_config.py` + +**Step 1: Write the failing test** + +Extract small helpers such as `is_dry_run(env)` and `require_blog_token(env)`, then test: + +- `AI_DAILY_DRY_RUN=1` does not require `BLOG_SERVICE_TOKEN`. +- normal publish mode still requires a token. + +**Step 2: Run test to verify it fails** + +Run: `python -m unittest tests.test_dry_run_config -v` + +Expected: FAIL because the helpers do not exist yet and `main()` checks the token before dry-run. + +**Step 3: Implement minimal code** + +Move dry-run detection before token validation in `main()`. + +**Step 4: Run test to verify it passes** + +Run: `python -m unittest tests.test_dry_run_config -v` + +Expected: PASS. + +### Task 3: Add Mock LLM Mode + +**Files:** +- Modify: `script/ai_daily_blog_pipeline.py` +- Create: `tests/test_mock_llm.py` + +**Step 1: Write the failing test** + +Test that `llm_call(prompt, {"AI_DAILY_LLM_MODE": "mock"})` returns valid JSON for: + +- semantic dedup prompts +- summary rewrite prompts +- classify prompts + +Also test that guide generation can get a non-empty mock response. + +**Step 2: Run test to verify it fails** + +Run: `python -m unittest tests.test_mock_llm -v` + +Expected: FAIL because mock mode does not exist. + +**Step 3: Implement minimal code** + +Add `AI_DAILY_LLM_MODE=mock` support in `llm_call()`. + +**Step 4: Run test to verify it passes** + +Run: `python -m unittest tests.test_mock_llm -v` + +Expected: PASS. + +### Task 4: Add Markdown Smoke Test + +**Files:** +- Create: `tests/test_markdown_rendering.py` +- Modify: `script/ai_daily_blog_pipeline.py` only if necessary.
+ +**Step 1: Write the failing or characterization test** + +Test that `blog_markdown()` renders: + +- `## 导览` +- at least one section +- source links +- no `> >` +- no `[N]` + +**Step 2: Run test** + +Run: `python -m unittest tests.test_markdown_rendering -v` + +Expected: If it already passes, keep it as characterization coverage. If it fails because of `> >`, implement a focused fix. + +**Step 3: Implement minimal fix if needed** + +Strip leading `>` from guide text before adding blockquote syntax. + +**Step 4: Run test to verify it passes** + +Run: `python -m unittest tests.test_markdown_rendering -v` + +Expected: PASS. + +### Task 5: Run Full Verification + +**Files:** +- No new files. + +**Step 1: Run unit tests** + +Run: `python -m unittest discover -s tests -v` + +Expected: PASS. + +**Step 2: Run compile check** + +Run: `python -m py_compile script/ai_daily_blog_pipeline.py` + +Expected: exit code 0. + +**Step 3: Check git status** + +Run: `git status --short` + +Expected: only intended files are modified or added. 
diff --git a/script/ai_daily_blog_pipeline.py b/script/ai_daily_blog_pipeline.py index 2700366..b71003e 100644 --- a/script/ai_daily_blog_pipeline.py +++ b/script/ai_daily_blog_pipeline.py @@ -1,1104 +1,53 @@ #!/usr/bin/env python3 -import difflib -import json +from __future__ import annotations + import os -import re import sys -import time -import urllib.request -import urllib.error -import xml.etree.ElementTree as ET -from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime, timedelta, timezone -from email.utils import parsedate_to_datetime from pathlib import Path -from urllib.parse import urlparse -UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36' -CST = timezone(timedelta(hours=8)) -NOW = datetime.now(CST) -TODAY = NOW.date().isoformat() -SINCE = NOW - timedelta(hours=30) -SCRIPT_DIR = Path.home() / '.hermes' / 'scripts' -OUT_DIR = SCRIPT_DIR / 'ai_morning_out' -OUT_DIR.mkdir(parents=True, exist_ok=True) +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) -RSS_FEEDS = { - 'InfoQ AI': 'https://feed.infoq.com/ai-ml-data-eng/', - 'MIT科技评论AI': 'https://www.technologyreview.com/topic/artificial-intelligence/feed', - '量子位': 'https://www.qbitai.com/feed', -} -JUYA_RSS = 'https://imjuya.github.io/juya-ai-daily/rss.xml' -SECTION_ORDER = ['模型发布/更新', '产品与工具', '开发与工程', '行业与公司', '论文与研究', '人物与花絮', '观点与教程'] +from ai_daily_report.env import read_env_file + +PROJECT_ENV_PATH = PROJECT_ROOT / ".env" +OUT_DIR = Path.home() / ".hermes" / "scripts" / "ai_morning_out" -# ─── Data collection (unchanged) ──────────────────────────────────────────── - -def fetch_text(url: str) -> str: - req = urllib.request.Request(url, headers={'User-Agent': UA}) - with urllib.request.urlopen(req, timeout=25) as r: - return r.read().decode('utf-8', 'ignore') - - -def parse_pubdate(text: str): - if not text: - return None - 
try: - dt = parsedate_to_datetime(text) - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - return dt.astimezone(CST) - except Exception: - return None - - -def clean_text(s: str) -> str: - s = re.sub(r'<[^>]+>', ' ', s or '') - s = s.replace(' ', ' ').replace('&', '&') - s = re.sub(r'\s+', ' ', s).strip() - return s - - -def source_name_from_url(url: str, fallback: str = '来源') -> str: - if not url: - return fallback - host = (urlparse(url).netloc or '').lower() - if host.startswith('www.'): - host = host[4:] - mapping = { - 'x.com': 'X', 'twitter.com': 'X', 'github.com': 'GitHub', 'github.blog': 'GitHub Blog', - 'openrouter.ai': 'OpenRouter', 'anthropic.com': 'Anthropic', 'cursor.com': 'Cursor', - 'technologyreview.com': 'MIT科技评论AI', 'the-decoder.com': 'The Decoder', 'xiaohongshu.com': '小红书', - 'mp.weixin.qq.com': '微信文章', 'qbitai.com': '量子位', 'ithome.com': 'IT之家', 'browse.sh': 'Browse.sh', - 'huggingface.co': 'Hugging Face', 'openai.com': 'OpenAI', 'claude.com': 'Claude', - 'theverge.com': 'The Verge', 'infoq.com': 'InfoQ', 'research.google': 'Google Research', - 'simonwillison.net': 'Simon Willison', 'runwayml.com': 'Runway', 'perplexity.ai': 'Perplexity', - 'venturebeat.com': 'VentureBeat', 'arxiv.org': 'arXiv', 'reuters.com': '路透社', - 'bloomberg.com': 'Bloomberg', 'techcrunch.com': 'TechCrunch', 'wired.com': 'Wired', - 'deepseek.com': 'DeepSeek', 'baidu.com': '百度', 'alibaba.com': '阿里', - } - for domain, name in mapping.items(): - if host == domain or host.endswith('.' 
+ domain): - return name - return host or fallback - - -def x_username_from_url(url: str) -> str: - """Extract X/Twitter username from URL like https://x.com/OpenAIDevs/status/...""" - if not url: - return '' - host = (urlparse(url).netloc or '').lower() - if host.startswith('www.'): - host = host[4:] - if host not in ('x.com', 'twitter.com'): - return '' - parts = [p for p in urlparse(url).path.split('/') if p] - if len(parts) >= 1 and parts[0] not in ('i', 'search', 'explore', 'settings', 'notifications', 'home', 'compose'): - return parts[0] - return '' - - -def smart_source_label(url: str, api_source_name: str = '') -> str: - """Generate a descriptive source label from URL, preferring specific names over generic API labels.""" - x_user = x_username_from_url(url) - if x_user: - return f'X:@{x_user}' - url_name = source_name_from_url(url, '') - if url_name and url_name not in ('来源', ''): - host = (urlparse(url).netloc or '').lower() - path = (urlparse(url).path or '').lower() - if 'blog' in host or '/blog' in path or '/research' in path: - return f'{url_name}:Blog' - if '/index' in path or path.rstrip('/') in ('', '/about', '/products'): - return f'{url_name}:官网动态' - return url_name - if api_source_name and api_source_name not in ('AI HOT', '社交媒体/博客', '科技媒体', '公司官网', '公司博客', '社区/博客', '个人博客', '技术媒体'): - return api_source_name - return api_source_name or 'AI HOT' - - -def parse_aihot(today: str): - url = f'https://aihot.virxact.com/api/public/daily/{today}' - data = json.loads(fetch_text(url)) - items = [] - generated = data.get('generatedAt') - for sec in data.get('sections', []): - for it in sec.get('items', []): - item_url = (it.get('sourceUrl') or '').strip() - api_src = clean_text(it.get('sourceName', '')) or '' - items.append({ - 'source_group': 'AI HOT', - 'source_label': smart_source_label(item_url, api_src), - 'title_raw': clean_text(it.get('title', '')), - 'summary_raw': clean_text(it.get('summary', '')), - 'url': item_url, - 'published_at': generated, - 
'origin_type': 'aihot_json', - 'section_hint': sec.get('label') or '', - 'language_hint': 'zh', - }) - for flash in data.get('flashes', []) or []: - flash_url = (flash.get('sourceUrl') or '').strip() - api_src = clean_text(flash.get('sourceName', '')) or '' - items.append({ - 'source_group': 'AI HOT', - 'source_label': smart_source_label(flash_url, api_src), - 'title_raw': clean_text(flash.get('title', '')), - 'summary_raw': clean_text(flash.get('summary', '')), - 'url': flash_url, - 'published_at': generated, - 'origin_type': 'aihot_flash', - 'section_hint': '快讯', - 'language_hint': 'zh', - }) - return items, data - - -def parse_rss(name: str, url: str): - xml = fetch_text(url) - root = ET.fromstring(xml) - channel = root.find('channel') - items = channel.findall('item') if channel is not None else [] - out = [] - for it in items[:20]: - pub = parse_pubdate(it.findtext('pubDate') or '') - if pub and pub < SINCE: - continue - link = (it.findtext('link') or '').strip() - title = clean_text(it.findtext('title') or '') - summary = clean_text(it.findtext('description') or '') - if not title: - continue - out.append({ - 'source_group': name, - 'source_label': name, - 'title_raw': title, - 'summary_raw': summary, - 'url': link, - 'published_at': pub.isoformat() if pub else None, - 'origin_type': 'rss', - 'section_hint': '', - 'language_hint': 'en' if len(re.findall(r'[A-Za-z]', title + ' ' + summary)) > len(re.findall(r'[\u4e00-\u9fff]', title + ' ' + summary)) else 'zh', - }) - return out - - -def fetch_juya_rss(today: str): - """Fetch 橘鸦 RSS and return (target_url, pub_date, html_content). - html_content is from content:encoded if available, else None. 
- Uses a longer timeout (45s) since GitHub Pages can be slow.""" - req = urllib.request.Request(JUYA_RSS, headers={'User-Agent': UA}) - with urllib.request.urlopen(req, timeout=45) as r: - xml = r.read().decode('utf-8', 'ignore') - root = ET.fromstring(xml) - channel = root.find('channel') - items = channel.findall('item') if channel is not None else [] - target = None - pub = None - html_content = None - for it in items: - title = (it.findtext('title') or '').strip() - if title == today: - target = (it.findtext('link') or '').strip() - pub = parse_pubdate(it.findtext('pubDate') or '') - # Parse from RSS content:encoded to avoid a second HTTP request - ns = {'content': 'http://purl.org/rss/1.0/modules/content/'} - content_el = it.find('content:encoded', ns) - if content_el is not None and content_el.text: - html_content = content_el.text - break - return target, pub, html_content - - -def parse_juya(today: str): - target, pub, html_content = fetch_juya_rss(today) - if not target: - return [] - - # Try RSS content:encoded first; fall back to fetching the article page - if html_content is None: - try: - req = urllib.request.Request(target, headers={'User-Agent': UA}) - with urllib.request.urlopen(req, timeout=45) as r: - html = r.read().decode('utf-8', 'ignore') - except Exception: - return [] - m = re.search(r']*>(.*?)', html, re.S | re.I) - if not m: - return [] - article_html = m.group(1) - else: - article_html = html_content - - block_pattern = re.compile( - r']*>\s*(?:]*href="(?P[^"]+)"[^>]*>)?(?P[^<]*?)?\s*#(?P\d+)\s*(?P.*?)(?=\s*提示|$)', - re.S | re.I, - ) - - results = [] - for m in block_pattern.finditer(article_html): - title_html = m.group('title_html') or '' - title = clean_text(re.sub(r'<[^>]+>', ' ', title_html)) - title_url = (m.group('title_url') or '').strip() - body_html = m.group('body') or '' - - links = re.findall(r']*href="([^"]+)"[^>]*>', body_html, re.I) - clean_links = [] - for link in links: - link = link.replace('&', '&').strip() - if not 
link or 'imjuya.github.io/juya-ai-daily' in link: - continue - if link not in clean_links: - clean_links.append(link) - url = clean_links[0] if clean_links else (title_url if title_url and 'imjuya.github.io/juya-ai-daily' not in title_url else target) - - body_text = body_html - body_text = re.sub(r']*>|', '\n', body_text, flags=re.I) - body_text = re.sub(r'

|||', '\n', body_text, flags=re.I) - body_text = re.sub(r']*>', '', body_text, flags=re.I) - body_text = re.sub(r']+>.*?', ' ', body_text, flags=re.S | re.I) - body_text = re.sub(r']*>', ' ', body_text, flags=re.I) - body_text = re.sub(r'<[^>]+>', ' ', body_text) - lines = [clean_text(x) for x in body_text.split('\n') if clean_text(x)] - summary_lines = [] - for line in lines: - if line.startswith('相关链接'): - break - if line == title: - continue - summary_lines.append(line) - summary = ' '.join(summary_lines[:4]).strip() - if not title: - continue - results.append({ - 'source_group': '橘鸦AI早报', - 'source_label': source_name_from_url(url, '橘鸦AI早报') if url and 'imjuya.github.io/juya-ai-daily' not in url else '橘鸦AI早报', - 'title_raw': title, - 'summary_raw': summary, - 'url': url, - 'published_at': pub.isoformat() if pub else None, - 'origin_type': 'juya_issue', - 'section_hint': '', - 'language_hint': 'zh', - }) - return results - - -# ─── LLM infrastructure (unchanged) ───────────────────────────────────────── - -def load_env(): - env = {} - env_path = Path.home() / '.hermes' / '.env' - if env_path.exists(): - text = env_path.read_text(errors='ignore') - for line in text.splitlines(): - if '=' in line and not line.strip().startswith('#'): - k, v = line.split('=', 1) - env[k.strip()] = v.strip() - env.update({k: v for k, v in os.environ.items() if v}) +def load_env() -> dict[str, str]: + env: dict[str, str] = {} + env.update(read_env_file(PROJECT_ENV_PATH)) + env.update(read_env_file(Path.home() / ".hermes" / ".env")) + env.update({key: value for key, value in os.environ.items() if value}) return env -def resolve_llm_config(env: dict): - """Read Hermes config to get the active provider's API key, base_url, and model. 
+def is_dry_run(env: dict[str, str]) -> bool: + return (env.get("AI_DAILY_DRY_RUN") or "").strip().lower() in {"1", "true", "yes"} - Priority: - 1) Explicit environment overrides for this pipeline (SUB2API / LLM_* / XIAOMI_* / XIAOMI_MIMO_*) - 2) Hermes model config (config.yaml) - 3) auth.json credential pool - 4) Legacy env fallbacks - """ - import yaml - hermes_dir = Path.home() / '.hermes' +def requires_blog_token(env: dict[str, str]) -> bool: + return not is_dry_run(env) - def first_env(*names: str) -> str: - for name in names: - val = (env.get(name) or '').strip() - if val: - return val - return '' - # Allow this script to be pinned to the current Hermes model config. - cfg_path = hermes_dir / 'config.yaml' - cfg = {} - if cfg_path.exists(): - with open(cfg_path) as f: - cfg = yaml.safe_load(f) or {} +def main() -> None: + from ai_daily_report.runner import run_daily_report - model_cfg = cfg.get('model', {}) or {} - provider = (model_cfg.get('provider') or '').strip() - base_url = (model_cfg.get('base_url') or '').rstrip('/') - model_name = (model_cfg.get('default') or '').strip() - - # 1) Explicit overrides for this pipeline take precedence, but keep endpoint/key/model - # from the same provider family. Mixing SUB2API_API_KEY with XIAOMI_BASE_URL causes - # 401 after switching Hermes to a Sub2API model. 
- explicit_api_key = first_env('LLM_API_KEY') - explicit_base_url = first_env('LLM_BASE_URL') - explicit_model = first_env('LLM_MODEL') - - if not explicit_api_key: - if provider == 'sub2api' or first_env('SUB2API_API_KEY', 'SUB2API_BASE_URL', 'SUB2API_MODEL'): - explicit_api_key = first_env('SUB2API_API_KEY') - explicit_base_url = first_env('SUB2API_BASE_URL') or base_url - explicit_model = first_env('SUB2API_MODEL') or model_name - elif first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL', 'XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL'): - explicit_api_key = first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY') - explicit_base_url = first_env('XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL') - explicit_model = first_env('XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL') - - if explicit_base_url: - base_url = explicit_base_url.rstrip('/') - if explicit_model: - model_name = explicit_model - - provider_def = (cfg.get('providers', {}) or {}).get(provider, {}) or {} - if not base_url and provider_def.get('base_url'): - base_url = str(provider_def.get('base_url')).rstrip('/') - if not explicit_api_key and provider_def.get('key_env'): - explicit_api_key = first_env(str(provider_def.get('key_env'))) - - # Fast fallback chain: if the active provider has no credentials, use a known-good - # provider/model from auth.json so the daily cron keeps publishing. - fallback_provider = first_env('LLM_FALLBACK_PROVIDER', 'XIAOMI_FALLBACK_PROVIDER') or 'openrouter' - - api_key = explicit_api_key - auth_path = hermes_dir / 'auth.json' - if not api_key and auth_path.exists(): - with open(auth_path) as f: - auth = json.load(f) - pool = auth.get('credential_pool', {}) or {} - provider_keys = [] - if provider: - provider_keys.extend([provider, provider.replace('-', '_')]) - # Known aliases for this environment. 
- provider_keys.extend(['sub2api', 'xiaomi', 'xiaomi_mimo', 'sensenova']) - for pkey in provider_keys: - creds = pool.get(pkey, []) - if creds: - cred = creds[0] - source = cred.get('source', '') - if source.startswith('env:'): - env_var = source[4:] - api_key = env.get(env_var, '') or api_key - if not api_key: - api_key = cred.get('access_token', '') or api_key - if not base_url: - base_url = (cred.get('base_url') or '').rstrip('/') - if not model_name: - model_name = cred.get('model', '') or model_name - break - - # 3) Legacy env fallbacks. - if not api_key: - api_key = first_env('LLM_API_KEY', 'XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'OPENROUTER_API_KEY') - if not base_url: - base_url = first_env('LLM_BASE_URL', 'XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL', 'OPENROUTER_BASE_URL').rstrip('/') - if not model_name: - model_name = first_env('LLM_MODEL') or 'mimo-v2.5-pro' - - if not api_key and fallback_provider and auth_path.exists(): - with open(auth_path) as f: - auth = json.load(f) - pool = auth.get('credential_pool', {}) or {} - for pkey in [fallback_provider, fallback_provider.replace('-', '_')]: - creds = pool.get(pkey, []) - if creds: - cred = creds[0] - source = cred.get('source', '') - if source.startswith('env:'): - env_var = source[4:] - api_key = env.get(env_var, '') or api_key - if not api_key: - api_key = cred.get('access_token', '') or api_key - if not base_url: - base_url = (cred.get('base_url') or '').rstrip('/') - if not model_name: - model_name = cred.get('model', '') or model_name - provider = fallback_provider - break - - if not api_key: - raise RuntimeError( - f'No API key found for provider "{provider}" or fallback "{fallback_provider}". ' - 'Set SUB2API_API_KEY / XIAOMI_API_KEY / LLM_API_KEY or fix ~/.hermes/auth.json' - ) - if not base_url: - raise RuntimeError( - f'No base_url found for provider "{provider}" or fallback "{fallback_provider}". 
' - 'Set SUB2API_BASE_URL / XIAOMI_BASE_URL / LLM_BASE_URL or fix ~/.hermes/auth.json' - ) - - return api_key, base_url, model_name - - -def _try_llm_request(base_url: str, api_key: str, model: str, prompt_text: str, auth_mode: str, api_key_header: str = 'Authorization'): - payload = json.dumps({ - 'model': model, - 'messages': [{'role': 'user', 'content': prompt_text}], - 'temperature': 0.2, - 'max_tokens': 8000, - }, ensure_ascii=False).encode('utf-8') - headers = {'Content-Type': 'application/json'} - if api_key_header == 'Authorization': - headers[api_key_header] = f'Bearer {api_key}' if auth_mode == 'bearer' else api_key - else: - headers[api_key_header] = api_key - req = urllib.request.Request(f'{base_url}/chat/completions', data=payload, headers=headers) - with urllib.request.urlopen(req, timeout=600) as r: - resp = json.loads(r.read().decode('utf-8')) - return resp['choices'][0]['message']['content'].strip() - - -def llm_call(prompt_text: str, env: dict) -> str: - api_key, base_url, model = resolve_llm_config(env) - - # Use a single, explicit path so cron behavior is easy to debug. - # The earlier auth-matrix/fallback logic was making failures harder to reason about. 
- payload = json.dumps({ - 'model': model, - 'messages': [{'role': 'user', 'content': prompt_text}], - 'temperature': 0.2, - 'max_tokens': 8000, - }, ensure_ascii=False).encode('utf-8') - - req = urllib.request.Request( - f'{base_url}/chat/completions', - data=payload, - headers={'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}, - ) - print(f'llm_call request: base_url={base_url}; model={model}', file=sys.stderr) - try: - with urllib.request.urlopen(req, timeout=600) as r: - resp = json.loads(r.read().decode('utf-8')) - return resp['choices'][0]['message']['content'].strip() - except urllib.error.HTTPError as e: - body = '' - try: - body = e.read().decode('utf-8', 'ignore') - except Exception: - pass - print(f'llm_call failed: HTTP {e.code} {e.reason}; base_url={base_url}; model={model}; body={body[:500]}', file=sys.stderr) - raise - - -def _parse_json_from_llm(text: str): - """Strip markdown code blocks and extract a JSON object from LLM output.""" - text = re.sub(r'^```(?:json)?\s*\n?', '', text) - text = re.sub(r'\n?```\s*$', '', text) - text = text.strip() - m = re.search(r'\{.*\}\s*$', text, re.S) - if not m: - raise ValueError('LLM 输出中未找到 JSON 对象') - raw_json = m.group(0) - raw_json = re.sub(r',\s*([}\]])', r'\1', raw_json) - return json.loads(raw_json) - - -def _normalize_title(title: str) -> str: - """Normalize a title for dedup comparison: strip non-alphanumeric, lowercase.""" - return re.sub(r'[^\w\u4e00-\u9fff]+', '', (title or '').lower()) - - -# ─── Stage 0: Script dedup (no LLM) ──────────────────────────────────────── - -def stage0_script_dedup(raw_items: list) -> list: - """Deduplicate using difflib.SequenceMatcher on normalized titles. 
- Similarity > 0.7 means same event; keep the one with longer summary.""" - if not raw_items: - return [] - - # Build list of (normalized_title, item) - normed = [] - for item in raw_items: - nt = _normalize_title(item.get('title_raw', '')) - if nt and len(nt) >= 3: - normed.append((nt, item)) - - keep = [] # list of (nt, item) to keep - for nt, item in normed: - merged = False - for i, (knt, kitem) in enumerate(keep): - ratio = difflib.SequenceMatcher(None, nt, knt).ratio() - if ratio > 0.7: - # Same event — keep the one with longer summary - if len(item.get('summary_raw', '')) > len(kitem.get('summary_raw', '')): - keep[i] = (nt, item) - merged = True - break - if not merged: - keep.append((nt, item)) - - return [item for _, item in keep] - - -# ─── Stage 1: LLM semantic dedup ─────────────────────────────────────────── - -def stage1_llm_dedup(items: list, env: dict): - """Use LLM to identify semantic duplicates. Returns (filtered_items, error).""" - if not items: - return items, None - - indexed = [] - for i, item in enumerate(items): - indexed.append({ - 'index': i, - 'title': item.get('title_raw', '')[:80], - 'summary': item.get('summary_raw', '')[:120], - }) - - prompt = ( - '以下是AI领域的新闻条目。有些条目虽然措辞不同,但描述的是同一个事件。' - '请识别重复项,输出要保留的条目索引列表。只有描述完全相同的具体事件才视为重复。\n\n' - f'{json.dumps(indexed, ensure_ascii=False)}\n\n' - '请严格按以下JSON格式输出,不要包含任何其他内容:\n' - '{"keep_indices": [0, 1, 3, 5]}' - ) - - try: - raw = llm_call(prompt, env) - obj = _parse_json_from_llm(raw) - indices = obj.get('keep_indices', []) - if not isinstance(indices, list): - raise ValueError('keep_indices is not a list') - # Filter valid indices - valid = sorted(set(i for i in indices if isinstance(i, int) and 0 <= i < len(items))) - if not valid: - raise ValueError('No valid indices in keep_indices') - return [items[i] for i in valid], None - except Exception as e: - err = f'stage1_llm_dedup failed: {type(e).__name__}: {e}' - print(err) - return items, err # Fallback: return all items unchanged - - -# ─── 
Stage 2a: LLM summary rewrite (parallel) ────────────────────────────── - -def stage2a_rewrite_summaries(items: list, env: dict): - """Rewrite summaries in concise Chinese. Returns (updated_items, error).""" - if not items: - return items, None - - indexed = [] - for i, item in enumerate(items): - indexed.append({ - 'index': i, - 'title': item.get('title_raw', '')[:80], - 'summary': item.get('summary_raw', '')[:200], - }) - - prompt = ( - '请将以下新闻条目的标题和摘要改写为简洁中文。' - '标题:英文品牌名/模型名保留原样(如GPT-5、Codex),其余翻译为中文。' - '摘要:每条最多120字,保留核心事实。\n\n' - f'{json.dumps(indexed, ensure_ascii=False)}\n\n' - '请严格按以下JSON格式输出:\n' - '{"summaries": [{"index": 0, "title": "中文标题", "summary": "改写后的摘要"}, ...]}' - ) - - try: - raw = llm_call(prompt, env) - obj = _parse_json_from_llm(raw) - summaries = obj.get('summaries', []) - if not isinstance(summaries, list): - raise ValueError('summaries is not a list') - - result = [dict(item) for item in items] # shallow copy - for entry in summaries: - idx = entry.get('index') - s = entry.get('summary', '') - t = entry.get('title', '') - if isinstance(idx, int) and 0 <= idx < len(result): - if t: - result[idx] = dict(result[idx], title_raw=t) - if s: - result[idx] = dict(result[idx], summary_raw=s) - - return result, None - except Exception as e: - err = f'stage2a_rewrite_summaries failed: {type(e).__name__}: {e}' - print(err) - return items, err # Fallback: return items unchanged - - -# ─── Stage 2b: LLM classify (parallel) ────────────────────────────────────── - -def stage2b_classify(items: list, env: dict): - """Classify each item into a section. 
Returns (updated_items, error).""" - if not items: - return items, None - - indexed = [] - for i, item in enumerate(items): - indexed.append({ - 'index': i, - 'title': item.get('title_raw', '')[:80], - 'summary': item.get('summary_raw', '')[:120], - }) - - sections_str = '、'.join(SECTION_ORDER) - prompt = ( - f'请将以下AI新闻条目分类到对应板块。\n' - f'可选板块:{sections_str}\n\n' - f'{json.dumps(indexed, ensure_ascii=False)}\n\n' - '请严格按以下JSON格式输出:\n' - '{"sections": [{"index": 0, "section": "模型发布/更新"}, ...]}' - ) - - try: - raw = llm_call(prompt, env) - obj = _parse_json_from_llm(raw) - sections = obj.get('sections', []) - if not isinstance(sections, list): - raise ValueError('sections is not a list') - - result = [dict(item) for item in items] # shallow copy - for entry in sections: - idx = entry.get('index') - sec = entry.get('section', '') - if isinstance(idx, int) and 0 <= idx < len(result) and sec: - if sec in SECTION_ORDER: - result[idx] = dict(result[idx], section_hint=sec) - - return result, None - except Exception as e: - err = f'stage2b_classify failed: {type(e).__name__}: {e}' - print(err) - return items, err # Fallback: return items unchanged - - -# ─── Stage 2 parallel execution ───────────────────────────────────────────── - -def stage2_parallel(items: list, env: dict): - """Run stage2a (summary rewrite) and stage2b (classify) in parallel. 
- Returns (merged_items, errors_list).""" - errors = [] - summaries_result = items - classify_result = items - - with ThreadPoolExecutor(max_workers=2) as executor: - future_summaries = executor.submit(stage2a_rewrite_summaries, items, env) - future_classify = executor.submit(stage2b_classify, items, env) - - # Wait for summary rewrite - try: - summaries_result, err = future_summaries.result() - if err: - errors.append(err) - except Exception as e: - errors.append(f'stage2a exception: {type(e).__name__}: {e}') - - # Wait for classify - try: - classify_result, err = future_classify.result() - if err: - errors.append(err) - except Exception as e: - errors.append(f'stage2b exception: {type(e).__name__}: {e}') - - # Merge: take summaries from stage2a, sections from stage2b - merged = [] - for i in range(len(items)): - new_item = dict(summaries_result[i]) if i < len(summaries_result) else dict(items[i]) - # Apply section from classify result if available - if i < len(classify_result) and classify_result[i].get('section_hint'): - new_item['section_hint'] = classify_result[i]['section_hint'] - merged.append(new_item) - - return merged, errors - - -# ─── Stage 3: LLM guide/observation ──────────────────────────────────────── - -def llm_generate_guide(items, today: str, env: dict) -> str: - """Generate editorial judgment section: main theme + signals + risk.""" - indexed = [] - for i, item in enumerate(items, 1): - indexed.append({ - 'n': i, - 'title': item['title'], - 'summary': item['summary'][:100], - 'section': item['section'], - 'source': item.get('source', ''), - }) - prompt = { - 'date': today, - 'task': ( - '你是AI行业编辑。根据以下已经分类和摘要改写好的条目,写「今日观察」。\n\n' - '格式要求:\n' - '【主线】blockquote格式,一句话概括今天最值得关注的趋势(不要套话,要具体)\n' - '【强信号】2-3条,每条格式:编号. 
标题(一句话)+ 一两句说明为什么重要\n' - '【中信号】1-2条,格式同上\n' - '【待验证】1-2条,格式同上,说明为什么存疑\n\n' - '写作要求:\n' - '- 不要空泛总结(如"行业焦点转向XX"),要指向具体事件\n' - '- 不要引用编号如[1][3],读者看不到对应关系\n' - '- 不要建议("开发者应该..."之类删掉)\n' - '- 每条控制在2-3句话以内\n' - '- 用大白话,不要学术腔\n' - ), - 'items': indexed, - 'rule': '只输出观察文本,不要代码块、不要JSON。严格使用【主线】【强信号】【中信号】【待验证】四个标记。' - } - query = json.dumps(prompt, ensure_ascii=False) - try: - text = llm_call(query, env) - text = re.sub(r'^```(?:\w+)?\s*\n?', '', text) - text = re.sub(r'\n?```\s*$', '', text) - text = text.strip().strip('"').strip("'") - return text - except Exception: - return '' - - -# ─── Rendering helpers (unchanged) ────────────────────────────────────────── - -def _parse_guide_sections(guide: str): - """Parse guide text into structured sections by 【markers】.""" - sections = {} - parts = re.split(r'【(主线|强信号|中信号|待验证|建议)】', guide) - i = 1 - while i < len(parts) - 1: - key = parts[i].strip() - content = parts[i + 1].strip() - sections[key] = content - i += 2 - return sections - - -def _make_ref_factory(items): - """Create a [N] → link converter bound to the items list.""" - def make_ref(m): - idx = int(m.group(1)) - if 1 <= idx <= len(items): - item = items[idx - 1] - url = item.get('url', '') - if url: - return f'[{idx}]' - return f'[{idx}]' - return m.group(0) - return make_ref - - -def _render_guide_section(lines, title, text, items, is_quote=False): - """Render a guide section with title on its own line, content below.""" - make_ref = _make_ref_factory(items) - lines.append(f'**{title}**') - lines.append('') - for gline in text.split('\n'): - gline = gline.strip() - if not gline: - continue - gline = re.sub(r'\[(\d+)\]', make_ref, gline) - gline = re.sub(r'\[N\]', '', gline) - gline = gline.strip() - if not gline: - continue - if is_quote: - lines.append(f'> {gline}') - else: - lines.append(gline) - lines.append('') - - -def format_source_link(item): - source = item.get('source') or '来源' - url = item.get('url') or '' - if url: - return f'[{source} ↗]({url})' - 
return source - - -def blog_markdown(items, guide=None): - grouped = {k: [] for k in SECTION_ORDER} - for item in items: - grouped.setdefault(item['section'], []).append(item) - n = 1 - lines = [] - - guide_items = guide if isinstance(guide, list) else [] - make_ref = _make_ref_factory(items) - - def clean_guide_text(text): - text = re.sub(r'\[\d+\]', '', text) - text = re.sub(r'\[N\]', '', text).strip() - text = re.sub(r'^主线判断[::]\s*', '', text) - text = re.sub(r'\s+', ' ', text).strip() - return text - - # === Top: 导览 (theme only) === - theme_items = [g for g in guide_items if g.get('type') == 'theme'] - if theme_items: - lines.append('## 导览') - lines.append('') - for g in theme_items: - text = clean_guide_text(g.get('text', '')) - if text: - for para in text.split('\n'): - para = para.strip() - if para: - lines.append(f'> {para}') - lines.append('') - - # === News sections === - for sec in SECTION_ORDER: - sec_items = grouped.get(sec, []) - if not sec_items: - continue - lines.append(f'## {sec}') - lines.append('') - for item in sec_items: - summary = item['summary'].strip() - if len(summary) > 120: - summary = summary[:120].rstrip() + '…' - source_link = format_source_link(item) - if summary and summary[-1] not in '。!?…': - summary += '。' - lines.append(f'**{n}. 
{item["title"]}**') - lines.append('') - lines.append(f'> {summary}{source_link}') - lines.append('') - n += 1 - - # === Bottom: 总结 (strong/medium/risk) === - type_labels = {'strong': '强信号', 'medium': '中信号', 'risk': '待验证'} - summary_types = ['strong', 'medium', 'risk'] - summary_items = [g for g in guide_items if g.get('type') in summary_types] - if summary_items: - lines.append('## 总结') - lines.append('') - for t in summary_types: - type_items = [g for g in summary_items if g.get('type') == t] - if not type_items: - continue - label = type_labels.get(t, t) - lines.append(f'**{label}**') - lines.append('') - for g in type_items: - text = clean_guide_text(g.get('text', '')) - if not text: - continue - title_match = re.search(r'^(.+?)[::]\s*', text) - if title_match and len(title_match.group(1)) < 60: - title = title_match.group(1).strip() - content = text[title_match.end():].strip() - else: - sentences = re.split(r'[。!?]', text) - title = sentences[0].strip() if sentences else text[:40] - content = text[len(sentences[0]):].strip() - if content and content[0] in '。!?': - content = content[1:].strip() - lines.append(f'- **{title}**') - if content: - lines.append(f' {content}') - lines.append('') - - return '\n'.join(lines).strip() - - -def short_summary(blog_url): - return f'AI日报已发布 👉 {blog_url}' - - -def blog_api_request(method, path, payload=None, token=None, base_url=None): - url = base_url.rstrip('/') + path - data = None - headers = {'Authorization': f'Bearer {token}', 'User-Agent': UA} - if payload is not None: - data = json.dumps(payload, ensure_ascii=False).encode('utf-8') - headers['Content-Type'] = 'application/json' - req = urllib.request.Request(url, data=data, headers=headers, method=method) - with urllib.request.urlopen(req, timeout=25) as r: - return json.loads(r.read().decode('utf-8')) - - -# ─── Main pipeline ────────────────────────────────────────────────────────── - -def main(): env = load_env() - token = env.get('BLOG_SERVICE_TOKEN') or 
env.get('EPHRON_SERVICE_TOKEN') - base_url = env.get('BLOG_API_BASE_URL', 'https://blog.ephron.ren') - if not token: - print('缺少 blog service token,已停止。') - sys.exit(1) - - errors = [] - source_counts = {} - raw_items = [] - - # ── Collect raw items (unchanged) ──────────────────────────────────────── - try: - aihot_items, raw_daily = parse_aihot(TODAY) - raw_items.extend(aihot_items) - source_counts['AI HOT'] = len(aihot_items) - except urllib.error.HTTPError as e: - if e.code == 404: - print(f'今天({TODAY})的 AI HOT 完整日报还没有生成,暂不发布。') - return - raise - - for name, url in RSS_FEEDS.items(): - try: - parsed = parse_rss(name, url) - raw_items.extend(parsed) - source_counts[name] = len(parsed) - except Exception as e: - errors.append(f'{name}: {type(e).__name__}') - source_counts[name] = 0 - - juya_items = [] - try: - juya_items = parse_juya(TODAY) - except Exception as e: - errors.append(f'橘鸦AI早报: {type(e).__name__}') - - # If juya returned nothing, wait 2 minutes and retry once - if not juya_items: - print('橘鸦AI早报尚未就绪,等待 2 分钟后重试...') - time.sleep(120) - try: - juya_items = parse_juya(TODAY) - except Exception as e: - errors.append(f'橘鸦AI早报(重试): {type(e).__name__}') - - raw_items.extend(juya_items) - source_counts['橘鸦AI早报'] = len(juya_items) - - raw_path = OUT_DIR / 'raw_items.json' - raw_path.write_text(json.dumps(raw_items, ensure_ascii=False, indent=2), encoding='utf-8') - - # ── Stage 0: Script dedup ──────────────────────────────────────────────── - print(f'Stage 0: Script dedup — {len(raw_items)} raw items') - items = stage0_script_dedup(raw_items) - stage0_count = len(items) - print(f'Stage 0 done — {stage0_count} unique items') - - # ── Stage 1: LLM semantic dedup ───────────────────────────────────────── - print(f'Stage 1: LLM semantic dedup') - items, stage1_err = stage1_llm_dedup(items, env) - if stage1_err: - errors.append(stage1_err) - print(f'Stage 1 done — {len(items)} items') - - # ── Stage 2: Parallel summary rewrite + classify ──────────────────────── 
- print(f'Stage 2: Parallel summary rewrite + classify') - items, stage2_errs = stage2_parallel(items, env) - errors.extend(stage2_errs) - print(f'Stage 2 done — {len(items)} items') - - # ── Build final items with title/source fields ────────────────────────── - # At this point items still have raw fields; convert to final format - final_items = [] - seen_titles = set() - for item in items: - title = clean_text(item.get('title_raw', '')) - summary = clean_text(item.get('summary_raw', ''))[:120] - if not title: - continue - norm = _normalize_title(title) - if norm in seen_titles: - continue - seen_titles.add(norm) - section = item.get('section_hint', '') or '行业与公司' - if section not in SECTION_ORDER: - section = '行业与公司' - final_items.append({ - 'title': title, - 'summary': summary or '该条目暂无摘要。', - 'section': section, - 'url': item.get('url') or '', - 'source': item.get('source_label') or item.get('source_group') or '来源', - 'source_group': item.get('source_group') or '未知来源', - 'dedupe_keys': [norm], - }) - - # ── Stage 3: LLM guide/observation ────────────────────────────────────── - print(f'Stage 3: LLM guide generation') - guide_text = llm_generate_guide(final_items, TODAY, env) - - # Parse guide into structured format for blog_markdown - guide_structured = [] - if guide_text: - parsed = _parse_guide_sections(guide_text) - type_map = {'主线': 'theme', '强信号': 'strong', '中信号': 'medium', '待验证': 'risk'} - for key, text in parsed.items(): - guide_type = type_map.get(key, 'theme') - if guide_type == 'theme': - guide_structured.append({'type': 'theme', 'text': text}) - else: - # Split into individual items by numbered lines - lines = [l.strip() for l in text.split('\n') if l.strip()] - for line in lines: - # Remove leading number like "1. 
" - line = re.sub(r'^\d+[\.\、]\s*', '', line) - if line: - guide_structured.append({'type': guide_type, 'text': line}) - - # ── Stage 4: Assemble and publish ─────────────────────────────────────── - print(f'Stage 4: Assemble and publish') - md = blog_markdown(final_items, guide_structured) - title = f'AI日报 · {TODAY}' - tags = ['AI日报', 'AI资讯', '人工智能'] - payload = {'title': title, 'content': md, 'tags': tags} - - dry_run = (env.get('AI_DAILY_DRY_RUN') or '').strip().lower() in ('1', 'true', 'yes') - if dry_run: - slug = f'dry-run-{TODAY}' - blog_url = f'{base_url}/posts/{slug}' - public_ok = True - print('AI_DAILY_DRY_RUN=1:已完成组装验证,跳过博客创建/发布。') - else: - create_resp = blog_api_request('POST', '/api/service/posts', payload=payload, token=token, base_url=base_url) - slug = create_resp.get('slug') - if not slug: - print('Blog 草稿创建失败:未返回 slug') - sys.exit(1) - blog_api_request('POST', f'/api/service/posts/{slug}/publish', token=token, base_url=base_url) - blog_url = f'{base_url}/posts/{slug}' - - public_ok = False - try: - req = urllib.request.Request(blog_url, headers={'User-Agent': UA}) - with urllib.request.urlopen(req, timeout=20) as r: - public_ok = getattr(r, 'status', None) == 200 - except Exception: - public_ok = False - - msg = short_summary(blog_url) - if errors: - msg += '\n\n注:部分补充源本次采集失败或LLM阶段出错,已自动降级:' + ';'.join(errors) - if not public_ok: - msg += '\n\n警告:blog 草稿/发布接口已返回成功,但公开链接暂未验证为 200,请人工复核。' - - # Build digest for JSON output - digest = { - 'items': final_items, - 'featured_titles': [i['title'] for i in final_items[:6]], - 'guide': guide_structured, - } - - (OUT_DIR / 'llm_digest.json').write_text(json.dumps(digest, ensure_ascii=False, indent=2), encoding='utf-8') - (OUT_DIR / 'blog_markdown.md').write_text(md, encoding='utf-8') - (OUT_DIR / 'chat_summary.txt').write_text(msg, encoding='utf-8') - (OUT_DIR / 'run_meta.json').write_text(json.dumps({ - 'date': TODAY, - 'slug': slug, - 'blog_url': blog_url, - 'public_ok': public_ok, - 'errors': errors, - 
'aihot_sections': [s.get('label') for s in raw_daily.get('sections', [])], - 'raw_item_count': len(raw_items), - 'stage0_count': stage0_count, - 'final_item_count': len(final_items), - 'has_juya': any(i.get('source_group') == '橘鸦AI早报' for i in raw_items), - 'source_counts': source_counts, - 'featured_titles': digest.get('featured_titles', []), - }, ensure_ascii=False, indent=2), encoding='utf-8') - - print(msg) + dry_run = is_dry_run(env) + run_daily_report( + run_date=env.get("AI_DAILY_RUN_DATE") or "today", + mode="dry-run" if dry_run else env.get("AI_DAILY_MODE", "publish"), + source_mode=env.get("AI_DAILY_SOURCE_MODE", "live"), + llm_mode=env.get("AI_DAILY_LLM_MODE", "live"), + out_dir=Path(env.get("AI_DAILY_OUT_DIR") or OUT_DIR), + base_url=env.get("BLOG_API_BASE_URL", "https://blog.ephron.ren"), + sources_path=Path(env["AI_DAILY_SOURCES_PATH"]) if env.get("AI_DAILY_SOURCES_PATH") else None, + env=env, + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/script/blog_markdown.md b/script/blog_markdown.md deleted file mode 100644 index 2d77cda..0000000 --- a/script/blog_markdown.md +++ /dev/null @@ -1,198 +0,0 @@ -## 导览 - -> > 微软与OpenAI正式分家、Anthropic提交招股书、DeepSeek计划融500亿——AI行业正在从“联盟军”转向“诸侯争霸”。 - -## 模型发布/更新 - -**1. Grok Imagine 1.5 预览版发布** - -> Grok Imagine 1.5 预览版即日起在 API 中上线,SpaceXAI 持续发力。[X:@cb_doge ↗](https://x.com/cb_doge/status/2062242490745594085) - -**2. MiniMax M3 1M token 解码加速 15.6 倍** - -> MiniMax M3 在 1M token 下解码加速 15.6 倍,FireworksAI_HQ 提供推理支持。[X:@MiniMax_AI ↗](https://x.com/MiniMax_AI/status/2062316914618388758) - -**3. Miso One 开源语音模型:8B 参数、110ms 延迟、一次语音克隆** - -> Miso One 发布 8B 参数开源语音模型,支持一次语音克隆(短样本),推理延迟 110ms,权重已开源,可自托管,API 即将推出,演示已上线。[X:@kimmonismus ↗](https://x.com/kimmonismus/status/2062210845308780639) - -**4. Ideogram v4.0 发布:2K 分辨率和 JSON 提示支持** - -> Ideogram v4.0 发布,原生 2K 分辨率,文字渲染出色,支持 JSON 提示词,可在 Krea 中体验。[X:@krea_ai ↗](https://x.com/krea_ai/status/2062227837130887567) - -## 产品与工具 - -**5. 
Meta 面向 WhatsApp Business 的 AI 智能体现已全球上线** - -> Meta 为 WhatsApp Business 推出的 AI 智能体面向全球商家开放,按模型 token 使用量收费。[TechCrunch ↗](https://techcrunch.com/2026/06/03/metas-ai-agent-for-whatsapp-business-is-now-available-globally) - -**6. NousResearch 发布 Hermes Agent 桌面应用公测版** - -> NousResearch 推出 Hermes Agent 桌面应用公测版。[X:@SiliconFlowAI ↗](https://x.com/SiliconFlowAI/status/2062042813852995899) - -**7. xAI Grok 语音模型上线 Vapi 平台** - -> xAI 的 Grok STT 和 TTS 语音模型登陆企业语音 AI 平台 Vapi,可用于构建自定义语音智能体。[X:@xai ↗](https://x.com/xai/status/2062209374039499178) - -**8. Grok 模型登陆 Cloudflare AI Gateway** - -> Grok 模型现已可在 Cloudflare AI Gateway 上试用。[X:@xai ↗](https://x.com/xai/status/2062294202625696081) - -**9. OpenShell v0.0.55 发布:新增 Vertex AI 推理支持** - -> OpenShell v0.0.55 发布,新增 Google Vertex AI 推理支持,改进策略可见性、Podman 检测和 GPU 沙箱行为。[X:@NVIDIAAI ↗](https://x.com/NVIDIAAI/status/2062210034109677665) - -**10. Replit 上线 SEO Agent 助应用被发现** - -> Replit 推出 SEO Agent,扫描应用并提供修复建议,帮助应用在网页和 AI 搜索中被发现。[X:@Replit ↗](https://x.com/Replit/status/2062211976995188871) - -**11. OpenClaw 2026.6.1 发布:新增 Windows 节点与技能工坊** - -> OpenClaw 2026.6.1 发布,新增原生 Windows 节点主机、技能工坊和工作板编排,支持 MiniMax M3。[X:@openclaw ↗](https://x.com/openclaw/status/2062288421406785710) - -**12. Reachy Mini 添加 MCP 工具** - -> Reachy Mini 推出公开 MCP canary Space,支持远程工具调用。[Hugging Face:Blog ↗](https://huggingface.co/blog/adding-mcp-tools-to-reachy-mini) - -**13. 刚刚,Meta Skill 来了** - -> GitHub 热门仓库 OpenSquilla 发布,代表 Meta Skill 新动向。[量子位 ↗](https://www.qbitai.com/2026/06/428335.html) - -## 开发与工程 - -**14. Qwen Cloud 全球 AI 黑客马拉松启动** - -> 首届 Qwen Cloud 全球 AI 黑客马拉松启动,5 大赛道,总奖金超 7 万美元(赛道冠军 1 万美元),Devpost 报名。[X:@alibaba_cloud ↗](https://x.com/alibaba_cloud/status/2062113338994172169) - -**15. 洪水韧性新篇章:Google 开源水文建模框架** - -> Google Research 开源基于 PyTorch 的水文建模框架,采用 Flood Hub 相同架构,允许各国气象部门在本地训练 AI 洪水预报模型。[Google Research:Blog ↗](https://research.google/blog/the-next-chapter-in-flood-resilience-open-sourcing-googles-hydrology-framework) - -**16. 
文章:导致 Spark 在 Kubernetes 上 OOM 失败的两个错误配置** - -> 迁移 Spark 到 AKS 后,两个配置交互导致 OOM:spark.kubernetes.local.dirs.tmpfs 使 shuffle spill 改用 RAM 而非磁盘。[InfoQ AI ↗](https://www.infoq.com/articles/spark-oom-kubernetes-misconfigurations/?utm_campaign=infoq_content&utm_source=infoq&utm_medium=feed&utm_term=AI%2C+ML+%26+Data+Engineering) - -## 行业与公司 - -**17. 微软与 OpenAI 分道扬镳——如今双方准备正面交锋** - -> 微软与 OpenAI 合作关系破裂,进入直接竞争。微软 AI 主管 Mustafa Suleyman 称微软需独立证明能力。[The Verge ↗](https://www.theverge.com/ai-artificial-intelligence/942242/microsoft-build-ai-agents-openai-competition) - -**18. 欧盟公布全面技术主权计划,推动芯片与 AI 自主发展** - -> 欧盟推出技术主权计划,扩大本土半导体、AI 和云计算供应链,减少对美亚依赖。[Bloomberg ↗](https://www.bloomberg.com/news/articles/2026-06-03/europe-unveils-sweeping-tech-sovereignty-plan-to-boost-chips-ai) - -**19. Sensor Tower:OpenAI 旗下 ChatGPT 月活已破 10 亿,史上最快** - -> Sensor Tower 估计 ChatGPT 月活于 2025 年 5 月突破 10 亿,增速史上最快;Claude 月活 5600 万,同比增 640%。[IT之家 ↗](https://www.ithome.com/0/959/083.htm) - -**20. 消息称 DeepSeek 首轮融资拟筹集 500 亿元,腾讯、宁德时代等参投** - -> DeepSeek 首轮拟融资 500 亿元,投后估值 3500-4000 亿元。创始人梁文峰出资 200 亿,腾讯拟投 100 亿,宁德时代 50 亿。[IT之家 ↗](https://www.ithome.com/0/959/249.htm) - -**21. Suno 完成 4 亿美元 D 轮融资** - -> Suno 完成 4 亿美元 D 轮融资,估值 54 亿美元,致力于让更多人体验音乐制作。[X:@suno ↗](https://x.com/suno/status/2062183524887675243) - -**22. 宏利香港与阿里云达成 AI 战略合作** - -> 宏利香港与阿里云建立战略合作,共建负责任 AI 创新框架,加速 AI 部署。[X:@alibaba_cloud ↗](https://x.com/alibaba_cloud/status/2062006591377829922) - -**23. 优步每月 1,500 美元的 AI 使用上限为 AI 工具定价提供参考** - -> 优步将 AI 工具月使用上限设为 1500 美元,为行业 AI 定价提供参考信号。[Simon Willison ↗](https://simonwillison.net/2026/Jun/3/uber-caps-usage) - -**24. 世界模型榜首易主!跨维智能登顶 WorldArena** - -> 跨维智能在 WorldArena 上登顶,成为世界模型新榜首。[量子位 ↗](https://www.qbitai.com/2026/06/428435.html) - -**25. 刚刚,Anthropic 提交了招股书!** - -> Anthropic 已提交招股书,预计最快 Q4 上市。[量子位 ↗](https://www.qbitai.com/2026/06/428407.html) - -## 论文与研究 - -**26. 
斯坦福大学法学院研究:人工智能的表现优于法学教授** - -> 斯坦福大学法学院研究显示,AI 表现优于法学教授,该结果在 Hacker News 获 104 个 Points。[law.stanford.edu ↗](https://law.stanford.edu/press/ai-outperforms-law-professors-in-stanford-law-study) - -**27. NVIDIA Research 在 CVPR 2026 发表三篇论文:规模化训练实现抓取、自动驾驶与智能体泛化** - -> NVIDIA Research 在 CVPR 2026 发表三篇论文:零样本抓取模型 GraspGen-X、自动驾驶 LCDrive、具身智能体 NitroGen,均基于大规模训练。[blogs.nvidia.com:Blog ↗](https://blogs.nvidia.com/blog/cvpr-research-grasping-driving-agent-training) - -**28. Anthropic 分析 832 个 AI 恶意账户:中高风险攻击者半年从 33% 跃至 56%** - -> Anthropic 分析 832 个被封恶意账户,67.3% 使用 AI 编写恶意软件,中高风险占比半年内从 33% 升至 56%,传统威胁评估失效。[Anthropic ↗](https://www.anthropic.com/news/AI-enabled-cyber-threats-mitre-attack) - -**29. 微软研究:装瓶厂 AI 从聊天到决策** - -> 微软在中西部装瓶厂试点三个月显示,AI 超越聊天进入决策领域,需应对真实风险和可靠性要求。[X:@MSFTResearch ↗](https://x.com/MSFTResearch/status/2062204914223169635) - -**30. 世界模型的功能分类** - -> World Labs 与李飞飞发文梳理“世界模型”概念,基于 POMDP 框架分类,指出当前所谓世界模型本质是同一循环的不同投影(如渲染器)。[X:@drfeifei ↗](https://x.com/drfeifei/status/2062247238143996275) - -**31. 从看懂世界到做对动作,卧安机器人 OneModel 1.7 用一条「隐式通路」打通了具身智能的关键断层** - -> 卧安机器人 OneModel 1.7 通过隐式通路在潜在空间完成信息传导,打通具身智能关键断层。[量子位 ↗](https://www.qbitai.com/2026/06/428703.html) - -## 人物与花絮 - -**32. 黄仁勋与纳德拉共议智能体 AI 时代** - -> 黄仁勋与纳德拉在台北 MSBuild 同台,展示 NVIDIA 与微软从 Windows 到 AI 工厂的协作。[X:@nvidia ↗](https://x.com/nvidia/status/2062228974273716457) - -**33. Satya Nadella 谈微软 Build 大会主旨演讲** - -> Satya Nadella 在 Microsoft Build 主旨演讲,强调共同构建前沿智能生态系统。[X:@satyanadella ↗](https://x.com/satyanadella/status/2062022060176801826) - -**34. Karpathy 的 llm-wiki 项目获超五千星** - -> @karpathy 的 llm-wiki 项目几周内获 5000+ 星,理念是让 LLM 构建并维护可持续进化的维基知识库。[X:@SiliconFlowAI ↗](https://x.com/SiliconFlowAI/status/2062054848762450324) - -## 观点与教程 - -**35. 智能体工程实战窍门全录** - -> @mvanhorn 分享智能体工程方法论:人主导方向、智能体执行,核心为 plan.md 约束行为,总结 22 条实战技巧及完整工具栈。[X:@shao__meng ↗](https://x.com/shao__meng/status/2061974983094755575) - -**36. 
Anthropic 用 Claude 赋能自助数据分析** - -> Anthropic 用 Claude 自动化 95% 业务分析查询,准确率约 95%,通过智能体分析栈解决概念-实体歧义等三大错误来源。[Claude:Blog ↗](https://claude.com/blog/how-anthropic-enables-self-service-data-analytics-with-claude) - -**37. 超越聊天机器人的直接偏好优化** - -> Dharma-AI 在 Hugging Face 博客发文,探讨直接偏好优化(DPO)在聊天机器人之外的广泛应用。[Hugging Face:Blog ↗](https://huggingface.co/blog/Dharma-AI/direct-preference-optimization-beyond-chatbots) - -**38. 演讲:选择你的 AI 副驾驶:最大化开发效率** - -> Sepehr Khosravi 探讨开发效率工具演变,评估 Cursor 和 Claude Code 等优势,为高级工程师提供可行技巧。[InfoQ AI ↗](https://www.infoq.com/presentations/choosing-ai-copilot/?utm_campaign=infoq_content&utm_source=infoq&utm_medium=feed&utm_term=AI%2C+ML+%26+Data+Engineering) - -## 总结 - -**强信号** - -- **微软与OpenAl分道扬镳,双方开始正面竞争** - 合作终结后,微软AI主管Mustafa Suleyman称公司必须独立证明能力,这意味着微软将不再依赖OpenAI的模型,而是全力押注自研,OpenAI也失去最大云盟友。 - -- **Anthropic提交招股书,预计最快Q4上市** - 这标志着安全派AI公司正式进入资本市场,与OpenAI争夺投资者注意,Claude的月活同比增长640%也为其估值提供了底气。 - -- **ChatGPT月活突破10亿,成为史上增长最快的应用** - Sensor Tower数据显示ChatGPT在2025年5月达到这一里程碑,Claude月活5600万,两家头部消费级AI应用的用户粘性正在拉开差距。 - -**中信号** - -- **Miso One发布8B开源语音模型,支持一次语音克隆且延迟仅110ms** - 权重已开放、可自托管,意味着实时语音克隆的门槛从专有API降到了个人部署,可能加速语音交互在开发者中的普及。 - -- **欧盟公布全面技术主权计划,推动芯片与AI自主发展** - 计划扩大本土半导体、AI和云计算供应链,目标减少对美亚依赖——这将对全球AI公司的合规、市场准入和数据主权产生实质影响。 - -**待验证** - -- **DeepSeek首轮融资拟筹500亿元,腾讯、宁德时代参投** - 投后估值高达3500-4000亿元,但融资消息来源为IT之家,未见官方确认。如此大体量的AI融资在国内市场是否顺利落地,存在不确定性。 - -- **跨维智能登顶WorldArena世界模型榜首** - WorldArena的评测权威性尚未被广泛验证,且“世界模型”概念本身缺乏统一标准,需要看后续是否有独立第三方复现其能力。 \ No newline at end of file diff --git a/script/run_meta.json b/script/run_meta.json deleted file mode 100644 index eba646f..0000000 --- a/script/run_meta.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "date": "2026-06-04", - "slug": "ai-2026-06-04", - "blog_url": "https://blog.ephron.ren/posts/ai-2026-06-04", - "public_ok": true, - "errors": [ - "橘鸦AI早报(重试): TimeoutError" - ], - "aihot_sections": [ - "模型发布/更新", - "产品发布/更新", - "行业动态", - "论文研究", - "技巧与观点" - ], - "raw_item_count": 39, - "stage0_count": 39, - "final_item_count": 38, - 
"has_juya": false, - "source_counts": { - "AI HOT": 32, - "InfoQ AI": 2, - "MIT科技评论AI": 0, - "量子位": 5, - "橘鸦AI早报": 0 - }, - "featured_titles": [ - "Grok Imagine 1.5 预览版发布", - "MiniMax M3 1M token 解码加速 15.6 倍", - "Miso One 开源语音模型:8B 参数、110ms 延迟、一次语音克隆", - "Ideogram v4.0 发布:2K 分辨率和 JSON 提示支持", - "Meta 面向 WhatsApp Business 的 AI 智能体现已全球上线", - "NousResearch 发布 Hermes Agent 桌面应用公测版" - ] -} \ No newline at end of file diff --git a/skill/scripts/.gitkeep b/skill/scripts/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/skill/scripts/.gitkeep @@ -0,0 +1 @@ + diff --git a/skill/scripts/run_daily_report.py b/skill/scripts/run_daily_report.py new file mode 100644 index 0000000..033bcda --- /dev/null +++ b/skill/scripts/run_daily_report.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +from ai_daily_report.cli import main + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/tests/fixtures/.gitkeep b/tests/fixtures/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/fixtures/.gitkeep @@ -0,0 +1 @@ + diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..3372679 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,47 @@ +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory + +from ai_daily_report.cli import build_parser, main + + +class CliTests(unittest.TestCase): + def test_run_command_parses_date_and_mode(self): + parser = build_parser() + + args = parser.parse_args(["run", "--date", "2026-06-04", "--mode", "dry-run", "--source-mode", "live", "--llm-mode", "live", "--sources-path", "config/sources.json"]) + + self.assertEqual(args.command, "run") + self.assertEqual(args.date, "2026-06-04") + self.assertEqual(args.mode, "dry-run") + self.assertEqual(args.source_mode, "live") + self.assertEqual(args.llm_mode, "live") + self.assertEqual(args.sources_path, "config/sources.json") + + def test_main_returns_zero_for_parseable_command(self): + 
self.assertEqual(main(["run", "--date", "2026-06-04", "--mode", "dry-run"]), 0) + + def test_main_mock_run_writes_outputs(self): + with TemporaryDirectory() as temp_dir: + exit_code = main( + [ + "run", + "--date", + "2026-06-04", + "--mode", + "dry-run", + "--source-mode", + "mock", + "--llm-mode", + "mock", + "--out-dir", + temp_dir, + ] + ) + + self.assertEqual(exit_code, 0) + self.assertTrue((Path(temp_dir) / "2026-06-04" / "blog_markdown.md").exists()) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_clients.py b/tests/test_clients.py new file mode 100644 index 0000000..ccf9e9d --- /dev/null +++ b/tests/test_clients.py @@ -0,0 +1,47 @@ +import json +import unittest +from unittest.mock import patch + +from ai_daily_report.clients import BlogApiClient, OpenAICompatibleClient, fetch_text + + +class FakeResponse: + status = 200 + + def __init__(self, body): + self.body = body + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def read(self): + return self.body + + +class ClientTests(unittest.TestCase): + def test_fetch_text_decodes_response(self): + with patch("urllib.request.urlopen", return_value=FakeResponse("ok".encode("utf-8"))): + self.assertEqual(fetch_text("https://example.com", 1), "ok") + + def test_openai_compatible_client_returns_message_content(self): + body = json.dumps({"choices": [{"message": {"content": "hello"}}]}).encode("utf-8") + with patch("urllib.request.urlopen", return_value=FakeResponse(body)): + client = OpenAICompatibleClient(api_key="key", base_url="https://llm.example/v1", model="model") + self.assertEqual(client.chat("prompt"), "hello") + + def test_blog_api_client_create_and_publish(self): + responses = [ + FakeResponse(json.dumps({"slug": "ai-2026-06-04"}).encode("utf-8")), + FakeResponse(json.dumps({"ok": True}).encode("utf-8")), + ] + with patch("urllib.request.urlopen", side_effect=responses): + client = BlogApiClient(base_url="https://blog.example", 
token="token") + self.assertEqual(client.create_post({"title": "t"})["slug"], "ai-2026-06-04") + client.publish_post("ai-2026-06-04") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_config_loading.py b/tests/test_config_loading.py new file mode 100644 index 0000000..cf80a19 --- /dev/null +++ b/tests/test_config_loading.py @@ -0,0 +1,27 @@ +import unittest +from pathlib import Path + +from ai_daily_report.config import load_source_configs +from ai_daily_report.sources.registry import get_source_fetcher + + +ROOT = Path(__file__).resolve().parents[1] + + +class ConfigLoadingTests(unittest.TestCase): + def test_load_source_configs_from_json(self): + configs = load_source_configs(ROOT / "config" / "sources.json") + + self.assertGreaterEqual(len(configs), 5) + self.assertEqual(configs[0].name, "AI HOT") + self.assertEqual(configs[0].type, "aihot") + + def test_all_configured_source_types_are_registered(self): + configs = load_source_configs(ROOT / "config" / "sources.json") + + for config in configs: + self.assertTrue(callable(get_source_fetcher(config.type))) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_dry_run_config.py b/tests/test_dry_run_config.py new file mode 100644 index 0000000..bc32cd6 --- /dev/null +++ b/tests/test_dry_run_config.py @@ -0,0 +1,33 @@ +import importlib.util +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py" + + +def load_pipeline_module(): + spec = importlib.util.spec_from_file_location("ai_daily_blog_pipeline", SCRIPT) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +class DryRunConfigTests(unittest.TestCase): + def test_dry_run_does_not_require_blog_token(self): + module = load_pipeline_module() + + self.assertTrue(module.is_dry_run({"AI_DAILY_DRY_RUN": "1"})) + self.assertFalse(module.requires_blog_token({"AI_DAILY_DRY_RUN": "1"})) + + def 
test_publish_mode_requires_blog_token(self): + module = load_pipeline_module() + + self.assertFalse(module.is_dry_run({})) + self.assertTrue(module.requires_blog_token({})) + + +if __name__ == "__main__": + unittest.main() + diff --git a/tests/test_env_config.py b/tests/test_env_config.py new file mode 100644 index 0000000..cc452f6 --- /dev/null +++ b/tests/test_env_config.py @@ -0,0 +1,87 @@ +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory + +from ai_daily_report.env import resolve_blog_token, resolve_llm_config + + +class EnvConfigTests(unittest.TestCase): + def test_resolve_llm_config_prefers_generic_values(self): + config = resolve_llm_config( + { + "LLM_API_KEY": "generic-key", + "LLM_BASE_URL": "https://generic.example/v1", + "LLM_MODEL": "generic-model", + "SUB2API_API_KEY": "sub-key", + "SUB2API_BASE_URL": "https://sub.example/v1", + "SUB2API_MODEL": "sub-model", + } + ) + + self.assertEqual( + config, + { + "api_key": "generic-key", + "base_url": "https://generic.example/v1", + "model": "generic-model", + }, + ) + + def test_resolve_llm_config_reports_missing_fields(self): + with self.assertRaisesRegex(ValueError, "missing_llm_config: LLM_BASE_URL,LLM_MODEL"): + resolve_llm_config({"LLM_API_KEY": "key"}) + + def test_resolve_llm_config_follows_hermes_provider_config(self): + with TemporaryDirectory() as temp_dir: + hermes_dir = Path(temp_dir) + (hermes_dir / "config.yaml").write_text( + """ +model: + provider: sub2api + default: findmini/gpt-5.5 + base_url: http://sub2api.example/v1 +""".strip(), + encoding="utf-8", + ) + (hermes_dir / ".env").write_text("SUB2API_API_KEY=hermes-key\n", encoding="utf-8") + + config = resolve_llm_config({}, hermes_dir=hermes_dir) + + self.assertEqual( + config, + { + "api_key": "hermes-key", + "base_url": "http://sub2api.example/v1", + "model": "findmini/gpt-5.5", + }, + ) + + def test_resolve_llm_config_uses_hermes_auth_json_env_source(self): + with TemporaryDirectory() as temp_dir: + 
hermes_dir = Path(temp_dir) + (hermes_dir / "config.yaml").write_text( + """ +model: + provider: sub2api + default: findmini/gpt-5.5 + base_url: http://sub2api.example/v1 +""".strip(), + encoding="utf-8", + ) + (hermes_dir / "auth.json").write_text( + '{"credential_pool": {"sub2api": [{"source": "env:SUB2API_API_KEY"}]}}', + encoding="utf-8", + ) + + config = resolve_llm_config({"SUB2API_API_KEY": "auth-env-key"}, hermes_dir=hermes_dir) + + self.assertEqual(config["api_key"], "auth-env-key") + self.assertEqual(config["base_url"], "http://sub2api.example/v1") + self.assertEqual(config["model"], "findmini/gpt-5.5") + + def test_resolve_blog_token_uses_supported_names(self): + self.assertEqual(resolve_blog_token({"EPHRON_SERVICE_TOKEN": "token"}), "token") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_env_loading.py b/tests/test_env_loading.py new file mode 100644 index 0000000..38d28f8 --- /dev/null +++ b/tests/test_env_loading.py @@ -0,0 +1,39 @@ +import importlib.util +import os +import unittest +from pathlib import Path +from unittest.mock import patch + + +ROOT = Path(__file__).resolve().parents[1] +SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py" + + +def load_pipeline_module(): + spec = importlib.util.spec_from_file_location("ai_daily_blog_pipeline", SCRIPT) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +class EnvLoadingTests(unittest.TestCase): + def test_project_env_is_loaded_and_process_env_wins(self): + module = load_pipeline_module() + env_text = "LLM_MODEL=file-model\nLLM_BASE_URL=https://file.example/v1\n" + + with patch.object(module.Path, "home", return_value=ROOT / "missing-home"): + with patch.dict(os.environ, {"LLM_MODEL": "process-model"}, clear=False): + with patch.object(module, "PROJECT_ENV_PATH", ROOT / ".env.test"): + (ROOT / ".env.test").write_text(env_text, encoding="utf-8") + try: + env = module.load_env() + finally: + (ROOT / 
".env.test").unlink(missing_ok=True) + + self.assertEqual(env["LLM_BASE_URL"], "https://file.example/v1") + self.assertEqual(env["LLM_MODEL"], "process-model") + + +if __name__ == "__main__": + unittest.main() + diff --git a/tests/test_legacy_script_delegation.py b/tests/test_legacy_script_delegation.py new file mode 100644 index 0000000..7c24e61 --- /dev/null +++ b/tests/test_legacy_script_delegation.py @@ -0,0 +1,57 @@ +import importlib.util +import unittest +from pathlib import Path +from unittest.mock import patch + + +ROOT = Path(__file__).resolve().parents[1] +SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py" + + +def load_pipeline_module(): + spec = importlib.util.spec_from_file_location("ai_daily_blog_pipeline", SCRIPT) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +class LegacyScriptDelegationTests(unittest.TestCase): + def test_main_delegates_to_new_pipeline_by_default(self): + module = load_pipeline_module() + calls = [] + + def fake_run_daily_report(**kwargs): + calls.append(kwargs) + return {"reports": {"stage8": {"status": "ok"}}} + + with patch.object(module, "load_env", return_value={"AI_DAILY_DRY_RUN": "1"}): + with patch("ai_daily_report.runner.run_daily_report", side_effect=fake_run_daily_report): + module.main() + + self.assertEqual(len(calls), 1) + self.assertEqual(calls[0]["mode"], "dry-run") + self.assertEqual(calls[0]["source_mode"], "live") + self.assertEqual(calls[0]["llm_mode"], "live") + + def test_main_allows_mock_modes_for_local_test(self): + module = load_pipeline_module() + calls = [] + + def fake_run_daily_report(**kwargs): + calls.append(kwargs) + return {"reports": {"stage8": {"status": "ok"}}} + + with patch.object( + module, + "load_env", + return_value={"AI_DAILY_DRY_RUN": "1", "AI_DAILY_SOURCE_MODE": "mock", "AI_DAILY_LLM_MODE": "mock"}, + ): + with patch("ai_daily_report.runner.run_daily_report", side_effect=fake_run_daily_report): + module.main() + + 
self.assertEqual(calls[0]["source_mode"], "mock") + self.assertEqual(calls[0]["llm_mode"], "mock") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_llm_utils.py b/tests/test_llm_utils.py new file mode 100644 index 0000000..fa73cd3 --- /dev/null +++ b/tests/test_llm_utils.py @@ -0,0 +1,17 @@ +import unittest + +from ai_daily_report.llm import parse_json_object + + +class LlmUtilsTests(unittest.TestCase): + def test_parse_json_object_strips_markdown_fence(self): + self.assertEqual(parse_json_object('```json\n{"ok": true}\n```'), {"ok": True}) + + def test_parse_json_object_raises_without_json(self): + with self.assertRaises(ValueError): + parse_json_object("not json") + + +if __name__ == "__main__": + unittest.main() + diff --git a/tests/test_markdown_rendering.py b/tests/test_markdown_rendering.py new file mode 100644 index 0000000..205f379 --- /dev/null +++ b/tests/test_markdown_rendering.py @@ -0,0 +1,39 @@ +import unittest + +from ai_daily_report.assemble import assemble_markdown +from ai_daily_report.models import NewsItem + + +class MarkdownRenderingTests(unittest.TestCase): + def test_blog_markdown_strips_double_blockquote_and_reference_markers(self): + items = [ + NewsItem( + id="a", + source_group="AI HOT", + source_label="OpenAI:Blog", + source_role="primary", + source_priority=10, + title_raw="测试模型发布", + title_norm="测试模型发布", + summary_raw="测试摘要", + title="测试模型发布", + summary="测试摘要", + url="https://openai.com/blog/test", + canonical_url="https://openai.com/blog/test", + section="模型与能力", + ) + ] + guide = {"theme": "> 主线判断:测试主线[1]", "threads": []} + + md, _ = assemble_markdown(items, guide) + + self.assertIn("## 导览", md) + self.assertIn("## 模型与能力", md) + self.assertIn("[OpenAI:Blog ↗](https://openai.com/blog/test)", md) + self.assertNotIn("> >", md) + self.assertNotIn("[1]", md) + self.assertNotIn("主线判断", md) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_project_structure.py 
b/tests/test_project_structure.py new file mode 100644 index 0000000..47a71c7 --- /dev/null +++ b/tests/test_project_structure.py @@ -0,0 +1,33 @@ +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] + + +class ProjectStructureTests(unittest.TestCase): + def test_pipeline_plan_structure_exists(self): + expected_paths = [ + "ai_daily_report/sources/__init__.py", + "ai_daily_report/sources/aihot.py", + "ai_daily_report/sources/rss.py", + "ai_daily_report/sources/juya.py", + "ai_daily_report/sources/registry.py", + "ai_daily_report/llm.py", + "ai_daily_report/validate.py", + "ai_daily_report/publish.py", + "ai_daily_report/cli.py", + "config/sources.json", + "config/pipeline.json", + "tests/fixtures/.gitkeep", + "skill/scripts/.gitkeep", + "skill/scripts/run_daily_report.py", + ] + + missing = [path for path in expected_paths if not (ROOT / path).exists()] + + self.assertEqual(missing, []) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_runner.py b/tests/test_runner.py new file mode 100644 index 0000000..5086f91 --- /dev/null +++ b/tests/test_runner.py @@ -0,0 +1,132 @@ +import unittest +import json +from pathlib import Path +from tempfile import TemporaryDirectory + +from ai_daily_report.runner import run_daily_report + + +class RunnerTests(unittest.TestCase): + def test_run_daily_report_mock_mode_writes_markdown_and_reports(self): + with TemporaryDirectory() as temp_dir: + result = run_daily_report( + run_date="2026-06-04", + mode="dry-run", + source_mode="mock", + llm_mode="mock", + out_dir=Path(temp_dir), + base_url="https://blog.example", + ) + + run_dir = Path(result["run_dir"]) + self.assertTrue((run_dir / "blog_markdown.md").exists()) + self.assertTrue((run_dir / "run_report.json").exists()) + self.assertEqual(result["reports"]["stage8"]["status"], "ok") + + def test_run_daily_report_live_sources_can_use_config_and_fetch_text(self): + with TemporaryDirectory() as temp_dir: + out_dir = 
Path(temp_dir) / "out" + source_config = Path(temp_dir) / "sources.json" + source_config.write_text( + json.dumps( + [ + { + "name": "InfoQ AI", + "type": "rss", + "url": "https://feed.example/rss", + "role": "supplement", + "priority": 40, + "enabled": True, + } + ] + ), + encoding="utf-8", + ) + + def fetch_text(url, timeout): + return """GPT-5 API 发布https://example.com/gpt5OpenAI 发布 GPT-5 API。""" + + result = run_daily_report( + run_date="2026-06-04", + mode="dry-run", + source_mode="live", + llm_mode="mock", + out_dir=out_dir, + base_url="https://blog.example", + sources_path=source_config, + fetch_text=fetch_text, + ) + + self.assertEqual(result["reports"]["stage0"]["raw_item_count"], 1) + self.assertTrue((out_dir / "2026-06-04" / "blog_markdown.md").exists()) + + def test_run_daily_report_live_llm_uses_env_config_in_dry_run(self): + class FakeLlmClient: + def __init__(self): + self.prompts = [] + + def chat(self, prompt): + self.prompts.append(prompt) + if "duplicate_groups" in prompt: + return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []}) + if "rewrites" in prompt: + payload = json.loads(prompt) + return json.dumps( + { + "rewrites": [ + { + "id": item["id"], + "title": item["title_raw"], + "summary": item["summary_raw"], + "flags": [], + } + for item in payload["items"] + ] + } + ) + return json.dumps( + { + "theme": "模型能力继续进入产品入口。", + "threads": [ + { + "title": "模型 API 更新", + "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。", + "item_ids": [json.loads(prompt)["items"][0]["id"]], + "kind": "thread", + } + ], + } + ) + + fake_client = FakeLlmClient() + captured_config = {} + + def llm_client_factory(**config): + captured_config.update(config) + return fake_client + + with TemporaryDirectory() as temp_dir: + result = run_daily_report( + run_date="2026-06-04", + mode="dry-run", + source_mode="mock", + llm_mode="live", + out_dir=Path(temp_dir), + base_url="https://blog.example", + env={ + "LLM_API_KEY": "test-key", + "LLM_BASE_URL": 
"https://llm.example/v1", + "LLM_MODEL": "test-model", + }, + llm_client_factory=llm_client_factory, + ) + + self.assertEqual(captured_config["api_key"], "test-key") + self.assertEqual(captured_config["base_url"], "https://llm.example/v1") + self.assertEqual(captured_config["model"], "test-model") + self.assertGreaterEqual(len(fake_client.prompts), 2) + self.assertEqual(result["reports"]["stage8"]["status"], "ok") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_source_labels.py b/tests/test_source_labels.py new file mode 100644 index 0000000..9652691 --- /dev/null +++ b/tests/test_source_labels.py @@ -0,0 +1,55 @@ +import unittest + +from ai_daily_report.models import SourceConfig +from ai_daily_report.sources.juya import parse_juya_rss +from ai_daily_report.sources.labels import source_label_from_url + + +class SourceLabelTests(unittest.TestCase): + def test_source_label_from_x_url_includes_handle(self): + self.assertEqual( + source_label_from_url("https://x.com/MiniMax_AI/status/123", fallback="橘鸦AI早报"), + "X:MiniMax (@MiniMax_AI)", + ) + + def test_source_label_from_blog_url_marks_blog(self): + self.assertEqual( + source_label_from_url("https://openai.com/blog/example", fallback="橘鸦AI早报"), + "OpenAI:Blog", + ) + + def test_source_label_from_known_non_blog_domains(self): + self.assertEqual( + source_label_from_url("https://mp.weixin.qq.com/s/example", fallback="橘鸦AI早报"), + "微信公众号", + ) + self.assertEqual( + source_label_from_url("https://platform.minimaxi.com/docs/token-plan/migration", fallback="橘鸦AI早报"), + "MiniMax:Docs", + ) + + def test_parse_juya_rss_uses_item_url_as_source_label(self): + config = SourceConfig(name="橘鸦AI早报", type="juya_rss", url="https://juya.example/rss") + xml = """ + + + + 2026-06-04 + MiniMax M3 加速 #1 +

MiniMax M3 加速。

+

来源

+
+ ]]>
+
+
+
""" + + items = parse_juya_rss(config, xml, "2026-06-04") + + self.assertEqual(items[0]["source_label"], "X:MiniMax (@MiniMax_AI)") + self.assertNotEqual(items[0]["source_label"], "橘鸦AI早报") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage0_collect.py b/tests/test_stage0_collect.py new file mode 100644 index 0000000..7d31c20 --- /dev/null +++ b/tests/test_stage0_collect.py @@ -0,0 +1,49 @@ +import unittest + +from ai_daily_report.collect import collect_sources +from ai_daily_report.models import SourceConfig + + +class Stage0CollectTests(unittest.TestCase): + def test_collect_sources_returns_structured_results_for_each_source(self): + configs = [ + SourceConfig(name="Primary", type="fake", role="primary", priority=10), + SourceConfig(name="Supplement", type="fake", role="supplement", priority=20), + ] + + def fetcher(config, run_date): + return [{"title_raw": f"{config.name} item", "url": f"https://example.com/{config.name}"}] + + results, report = collect_sources(configs, "2026-06-04", fetcher=fetcher) + + self.assertEqual([r.source for r in results], ["Primary", "Supplement"]) + self.assertTrue(all(r.ok for r in results)) + self.assertEqual(sum(len(r.items) for r in results), 2) + self.assertEqual(report["input_source_count"], 2) + self.assertEqual(report["ok_source_count"], 2) + self.assertEqual(report["raw_item_count"], 2) + + def test_collect_sources_records_failed_source_without_blocking_others(self): + configs = [ + SourceConfig(name="Broken", type="fake", role="supplement", priority=20), + SourceConfig(name="Healthy", type="fake", role="supplement", priority=30), + ] + + def fetcher(config, run_date): + if config.name == "Broken": + raise TimeoutError("timed out") + return [{"title_raw": "healthy item", "url": "https://example.com/healthy"}] + + results, report = collect_sources(configs, "2026-06-04", fetcher=fetcher) + + by_source = {r.source: r for r in results} + self.assertFalse(by_source["Broken"].ok) + 
self.assertEqual(by_source["Broken"].status, "timeout") + self.assertIn("TimeoutError", by_source["Broken"].error) + self.assertTrue(by_source["Healthy"].ok) + self.assertEqual(report["failed_source_count"], 1) + self.assertEqual(report["raw_item_count"], 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage0_to_2_pipeline.py b/tests/test_stage0_to_2_pipeline.py new file mode 100644 index 0000000..03469e3 --- /dev/null +++ b/tests/test_stage0_to_2_pipeline.py @@ -0,0 +1,32 @@ +import unittest + +from ai_daily_report.pipeline import run_stage0_to_stage2 + + +class Stage0To2PipelineTests(unittest.TestCase): + def test_run_stage0_to_stage2_returns_deduped_items_and_reports(self): + configs = [ + {"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}, + {"name": "RSS", "type": "fake", "role": "supplement", "priority": 50}, + ] + + def fetcher(config, run_date): + return [ + { + "title_raw": "OpenAI 发布 GPT-5", + "summary_raw": f"{config.name} summary", + "url": "https://openai.com/blog/gpt-5?utm_source=test", + "source_label": config.name, + } + ] + + result = run_stage0_to_stage2(configs, "2026-06-04", fetcher=fetcher) + + self.assertEqual(len(result["items"]), 1) + self.assertEqual(result["reports"]["stage0"]["raw_item_count"], 2) + self.assertEqual(result["reports"]["stage1"]["output_count"], 2) + self.assertEqual(result["reports"]["stage2"]["removed_count"], 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage0_to_4_pipeline.py b/tests/test_stage0_to_4_pipeline.py new file mode 100644 index 0000000..334c09a --- /dev/null +++ b/tests/test_stage0_to_4_pipeline.py @@ -0,0 +1,66 @@ +import json +import unittest + +from ai_daily_report.pipeline import run_stage0_to_stage4 + + +class Stage0To4PipelineTests(unittest.TestCase): + def test_run_stage0_to_stage4_semantic_dedupes_and_rewrites(self): + configs = [ + {"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}, + {"name": "RSS", 
"type": "fake", "role": "supplement", "priority": 50}, + ] + + def fetcher(config, run_date): + return [ + { + "title_raw": f"{config.name} Anthropic IPO", + "summary_raw": f"{config.name} reports Anthropic IPO filing.", + "url": f"https://example.com/{config.name}", + "source_label": config.name, + } + ] + + def semantic_llm_call(prompt): + return json.dumps( + { + "duplicate_groups": [], + "not_duplicates": [], + "uncertain": [], + } + ) + + def rewrite_llm_call(prompt): + payload = json.loads(prompt) + return json.dumps( + { + "rewrites": [ + { + "id": entry["id"], + "title": "Anthropic 提交 IPO 文件", + "summary": "Anthropic 被报道提交 IPO 文件。", + "flags": [], + } + for entry in payload["items"] + ] + }, + ensure_ascii=False, + ) + + result = run_stage0_to_stage4( + configs, + "2026-06-04", + fetcher=fetcher, + semantic_llm_call=semantic_llm_call, + rewrite_llm_call=rewrite_llm_call, + ) + + self.assertEqual(len(result["items"]), 2) + self.assertEqual(result["items"][0].title, "Anthropic 提交 IPO 文件") + self.assertIn("stage3", result["reports"]) + self.assertIn("stage4", result["reports"]) + self.assertEqual(result["reports"]["stage4"]["rewritten_count"], 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage0_to_5_pipeline.py b/tests/test_stage0_to_5_pipeline.py new file mode 100644 index 0000000..2df7038 --- /dev/null +++ b/tests/test_stage0_to_5_pipeline.py @@ -0,0 +1,62 @@ +import json +import unittest + +from ai_daily_report.pipeline import run_stage0_to_stage5 + + +class Stage0To5PipelineTests(unittest.TestCase): + def test_run_stage0_to_stage5_classifies_and_orders_items(self): + configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}] + + def fetcher(config, run_date): + return [ + { + "title_raw": "Anthropic 提交 IPO 文件", + "summary_raw": "Anthropic 被报道提交 IPO 文件。", + "url": "https://example.com/ipo", + "source_label": config.name, + }, + { + "title_raw": "GPT-5 API 发布,延迟降低 30%", + "summary_raw": "OpenAI 发布 GPT-5 
API。", + "url": "https://example.com/gpt5", + "source_label": config.name, + "section_hint": "模型发布/更新", + }, + ] + + def semantic_llm_call(prompt): + return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []}) + + def rewrite_llm_call(prompt): + payload = json.loads(prompt) + return json.dumps( + { + "rewrites": [ + { + "id": entry["id"], + "title": entry["title_raw"], + "summary": entry["summary_raw"], + "flags": [], + } + for entry in payload["items"] + ] + }, + ensure_ascii=False, + ) + + result = run_stage0_to_stage5( + configs, + "2026-06-04", + fetcher=fetcher, + semantic_llm_call=semantic_llm_call, + rewrite_llm_call=rewrite_llm_call, + ) + + self.assertEqual([item.section for item in result["items"]], ["模型与能力", "公司与资本"]) + self.assertEqual(result["reports"]["stage5"]["section_counts"]["模型与能力"], 1) + self.assertEqual(result["reports"]["stage5"]["section_counts"]["公司与资本"], 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage0_to_6_pipeline.py b/tests/test_stage0_to_6_pipeline.py new file mode 100644 index 0000000..4be2807 --- /dev/null +++ b/tests/test_stage0_to_6_pipeline.py @@ -0,0 +1,75 @@ +import json +import unittest + +from ai_daily_report.pipeline import run_stage0_to_stage6 + + +class Stage0To6PipelineTests(unittest.TestCase): + def test_run_stage0_to_stage6_generates_guide(self): + configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}] + + def fetcher(config, run_date): + return [ + { + "title_raw": "GPT-5 API 发布", + "summary_raw": "OpenAI 发布 GPT-5 API。", + "url": "https://example.com/gpt5", + "source_label": config.name, + "section_hint": "模型发布/更新", + } + ] + + def semantic_llm_call(prompt): + return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []}) + + def rewrite_llm_call(prompt): + payload = json.loads(prompt) + return json.dumps( + { + "rewrites": [ + { + "id": entry["id"], + "title": entry["title_raw"], + "summary": entry["summary_raw"], + 
"flags": [], + } + for entry in payload["items"] + ] + }, + ensure_ascii=False, + ) + + def guide_llm_call(prompt): + payload = json.loads(prompt) + item_id = payload["items"][0]["id"] + return json.dumps( + { + "theme": "模型 API 能力继续更新。", + "threads": [ + { + "title": "模型能力更新", + "text": "GPT-5 API 发布,体现模型能力继续产品化。", + "item_ids": [item_id], + "kind": "thread", + } + ], + }, + ensure_ascii=False, + ) + + result = run_stage0_to_stage6( + configs, + "2026-06-04", + fetcher=fetcher, + semantic_llm_call=semantic_llm_call, + rewrite_llm_call=rewrite_llm_call, + guide_llm_call=guide_llm_call, + ) + + self.assertEqual(result["guide"]["theme"], "模型 API 能力继续更新。") + self.assertEqual(len(result["guide"]["threads"]), 1) + self.assertTrue(result["reports"]["stage6"]["theme_present"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage0_to_7_pipeline.py b/tests/test_stage0_to_7_pipeline.py new file mode 100644 index 0000000..b86e078 --- /dev/null +++ b/tests/test_stage0_to_7_pipeline.py @@ -0,0 +1,76 @@ +import json +import unittest + +from ai_daily_report.pipeline import run_stage0_to_stage7 + + +class Stage0To7PipelineTests(unittest.TestCase): + def test_run_stage0_to_stage7_assembles_markdown(self): + configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}] + + def fetcher(config, run_date): + return [ + { + "title_raw": "GPT-5 API 发布", + "summary_raw": "OpenAI 发布 GPT-5 API。", + "url": "https://example.com/gpt5", + "source_label": "OpenAI:Blog", + "section_hint": "模型发布/更新", + } + ] + + def semantic_llm_call(prompt): + return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []}) + + def rewrite_llm_call(prompt): + payload = json.loads(prompt) + return json.dumps( + { + "rewrites": [ + { + "id": entry["id"], + "title": entry["title_raw"], + "summary": entry["summary_raw"], + "flags": [], + } + for entry in payload["items"] + ] + }, + ensure_ascii=False, + ) + + def guide_llm_call(prompt): + payload = 
json.loads(prompt) + item_id = payload["items"][0]["id"] + return json.dumps( + { + "theme": "模型 API 能力继续更新。", + "threads": [ + { + "title": "模型能力产品化", + "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。", + "item_ids": [item_id], + "kind": "thread", + } + ], + }, + ensure_ascii=False, + ) + + result = run_stage0_to_stage7( + configs, + "2026-06-04", + fetcher=fetcher, + semantic_llm_call=semantic_llm_call, + rewrite_llm_call=rewrite_llm_call, + guide_llm_call=guide_llm_call, + ) + + self.assertIn("## 导览", result["markdown"]) + self.assertIn("## 模型与能力", result["markdown"]) + self.assertIn("## 今日脉络", result["markdown"]) + self.assertEqual(result["reports"]["stage7"]["blocking_errors"], []) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage0_to_8_pipeline.py b/tests/test_stage0_to_8_pipeline.py new file mode 100644 index 0000000..a81861c --- /dev/null +++ b/tests/test_stage0_to_8_pipeline.py @@ -0,0 +1,79 @@ +import json +import unittest + +from ai_daily_report.pipeline import run_stage0_to_stage8 + + +class Stage0To8PipelineTests(unittest.TestCase): + def test_run_stage0_to_stage8_dry_run_publishes_report(self): + configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}] + + def fetcher(config, run_date): + return [ + { + "title_raw": "GPT-5 API 发布", + "summary_raw": "OpenAI 发布 GPT-5 API。", + "url": "https://example.com/gpt5", + "source_label": "OpenAI:Blog", + "section_hint": "模型发布/更新", + } + ] + + def semantic_llm_call(prompt): + return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []}) + + def rewrite_llm_call(prompt): + payload = json.loads(prompt) + return json.dumps( + { + "rewrites": [ + { + "id": entry["id"], + "title": entry["title_raw"], + "summary": entry["summary_raw"], + "flags": [], + } + for entry in payload["items"] + ] + }, + ensure_ascii=False, + ) + + def guide_llm_call(prompt): + payload = json.loads(prompt) + item_id = payload["items"][0]["id"] + return json.dumps( + { + "theme": 
"模型 API 能力继续更新。", + "threads": [ + { + "title": "模型能力产品化", + "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。", + "item_ids": [item_id], + "kind": "thread", + } + ], + }, + ensure_ascii=False, + ) + + result = run_stage0_to_stage8( + configs, + "2026-06-04", + fetcher=fetcher, + semantic_llm_call=semantic_llm_call, + rewrite_llm_call=rewrite_llm_call, + guide_llm_call=guide_llm_call, + mode="dry-run", + base_url="https://blog.example", + client=None, + ) + + self.assertEqual(result["publish"].status, "ok") + self.assertEqual(result["publish"].blog_url, "https://blog.example/posts/ai-2026-06-04") + self.assertIn("stage8", result["reports"]) + self.assertEqual(result["reports"]["stage8"]["status"], "ok") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage1_normalize.py b/tests/test_stage1_normalize.py new file mode 100644 index 0000000..d75358a --- /dev/null +++ b/tests/test_stage1_normalize.py @@ -0,0 +1,85 @@ +import unittest + +from ai_daily_report.models import SourceResult +from ai_daily_report.normalize import canonicalize_url, normalize_items, normalize_title + + +class Stage1NormalizeTests(unittest.TestCase): + def test_canonicalize_url_removes_tracking_and_normalizes_x_host(self): + url = "HTTPS://Twitter.com/OpenAI/status/123/?utm_source=newsletter&fbclid=abc#fragment" + + self.assertEqual(canonicalize_url(url), "https://x.com/OpenAI/status/123") + + def test_normalize_items_builds_news_items_with_ids_and_norms(self): + source_result = SourceResult( + source="AI HOT", + role="primary", + ok=True, + status="ok", + items=[ + { + "title_raw": " GPT-5 发布:速度提升 2x! ", + "summary_raw": "

OpenAI 发布更新。

", + "url": "https://openai.com/blog/gpt-5?utm_campaign=test", + "source_label": "OpenAI:Blog", + "section_hint": "模型发布/更新", + } + ], + ) + + items, report = normalize_items([source_result], run_date="2026-06-04") + + self.assertEqual(len(items), 1) + self.assertTrue(items[0].id.startswith("item_")) + self.assertEqual(items[0].canonical_url, "https://openai.com/blog/gpt-5") + self.assertEqual(items[0].title_norm, normalize_title("GPT-5 发布:速度提升 2x!")) + self.assertEqual(items[0].summary_raw, "OpenAI 发布更新。") + self.assertEqual(items[0].source_role, "primary") + self.assertEqual(report["input_count"], 1) + self.assertEqual(report["output_count"], 1) + + def test_normalize_items_marks_quality_flags_without_dropping_item(self): + source_result = SourceResult( + source="RSS", + role="supplement", + ok=True, + status="ok", + items=[{"title_raw": "短", "summary_raw": "", "url": ""}], + ) + + items, report = normalize_items([source_result], run_date="2026-06-04") + + self.assertEqual(len(items), 1) + self.assertIn("missing_url", items[0].quality_flags) + self.assertIn("missing_summary", items[0].quality_flags) + self.assertIn("short_title", items[0].quality_flags) + self.assertEqual(report["quality_flag_counts"]["missing_url"], 1) + + def test_normalize_items_keeps_ids_unique_for_same_canonical_url(self): + source_result = SourceResult( + source="AI HOT", + role="primary", + ok=True, + status="ok", + items=[ + { + "title_raw": "OpenAI 发布 GPT-5", + "summary_raw": "summary a", + "url": "https://example.com/news?utm_source=a", + }, + { + "title_raw": "OpenAI 发布 GPT-5", + "summary_raw": "summary b", + "url": "https://example.com/news", + }, + ], + ) + + items, _ = normalize_items([source_result], run_date="2026-06-04") + + self.assertEqual(len({item.id for item in items}), 2) + self.assertEqual(items[0].canonical_url, items[1].canonical_url) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage2_dedupe.py b/tests/test_stage2_dedupe.py new file mode 
100644 index 0000000..0809889 --- /dev/null +++ b/tests/test_stage2_dedupe.py @@ -0,0 +1,63 @@ +import unittest + +from ai_daily_report.dedupe import hard_dedup_items +from ai_daily_report.models import NewsItem + + +def item( + item_id, + title, + title_norm, + url, + canonical_url, + source_group="AI HOT", + source_label="AI HOT", + source_priority=100, + summary="summary", +): + return NewsItem( + id=item_id, + source_group=source_group, + source_label=source_label, + source_role="primary" if source_group == "AI HOT" else "supplement", + source_priority=source_priority, + title_raw=title, + title_norm=title_norm, + summary_raw=summary, + url=url, + canonical_url=canonical_url, + ) + + +class Stage2DedupeTests(unittest.TestCase): + def test_hard_dedup_merges_same_canonical_url_and_keeps_better_item(self): + items = [ + item("a", "OpenAI 发布 GPT-5", "openai发布gpt5", "https://example.com/a?utm_source=x", "https://example.com/a", source_group="RSS", source_priority=50, summary="short"), + item("b", "OpenAI 发布 GPT-5", "openai发布gpt5", "https://example.com/a", "https://example.com/a", source_group="AI HOT", source_priority=10, summary="longer summary"), + ] + + deduped, report = hard_dedup_items(items) + + self.assertEqual([i.id for i in deduped], ["b"]) + self.assertEqual(report["input_count"], 2) + self.assertEqual(report["output_count"], 1) + self.assertEqual(report["removed_count"], 1) + self.assertEqual(report["groups"][0]["reason"], "same_canonical_url") + self.assertEqual(deduped[0].duplicate_sources[0]["source_group"], "RSS") + + def test_hard_dedup_marks_similar_titles_without_removing(self): + items = [ + item("a", "Grok API 上线 Cloudflare Gateway", "grokapi上线cloudflaregateway", "https://x.com/a", "https://x.com/a"), + item("b", "Grok 模型登陆 Cloudflare AI Gateway", "grok模型登陆cloudflareaigateway", "https://x.com/b", "https://x.com/b"), + ] + + deduped, report = hard_dedup_items(items) + + self.assertEqual(len(deduped), 2) + self.assertEqual(report["removed_count"], 
0) + self.assertEqual(len(report["possible_duplicates"]), 1) + self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"}) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage3_semantic_dedupe.py b/tests/test_stage3_semantic_dedupe.py new file mode 100644 index 0000000..ed876a5 --- /dev/null +++ b/tests/test_stage3_semantic_dedupe.py @@ -0,0 +1,129 @@ +import json +import unittest + +from ai_daily_report.models import NewsItem +from ai_daily_report.semantic_dedupe import semantic_dedup_items + + +def news_item(item_id, title, source_group="AI HOT"): + return NewsItem( + id=item_id, + source_group=source_group, + source_label=source_group, + source_role="primary" if source_group == "AI HOT" else "supplement", + source_priority=10 if source_group == "AI HOT" else 50, + title_raw=title, + title_norm=title.lower(), + summary_raw=f"{title} summary", + url=f"https://example.com/{item_id}", + canonical_url=f"https://example.com/{item_id}", + ) + + +class Stage3SemanticDedupeTests(unittest.TestCase): + def test_semantic_dedup_removes_only_high_confidence_duplicates(self): + items = [ + news_item("a", "Anthropic 提交 IPO 招股书", "AI HOT"), + news_item("b", "刚刚,Anthropic 提交了招股书", "量子位"), + news_item("c", "Grok 上线 Cloudflare Gateway", "AI HOT"), + ] + candidates = [{"item_ids": ["a", "b"], "reason": "title_similarity"}] + + def llm_call(prompt): + return json.dumps( + { + "duplicate_groups": [ + { + "keep_id": "a", + "remove_ids": ["b"], + "confidence": "high", + "reason": "same IPO filing event", + } + ], + "not_duplicates": [], + "uncertain": [], + } + ) + + deduped, report = semantic_dedup_items(items, candidates, llm_call=llm_call) + + self.assertEqual([item.id for item in deduped], ["a", "c"]) + self.assertEqual(report["removed_count"], 1) + self.assertEqual(report["duplicate_groups"][0]["reason"], "same IPO filing event") + self.assertEqual(deduped[0].duplicate_sources[0]["id"], "b") + + def 
test_semantic_dedup_skips_deletion_when_ratio_exceeds_limit(self): + items = [ + news_item("a", "A"), + news_item("b", "B"), + news_item("c", "C"), + ] + candidates = [{"item_ids": ["a", "b", "c"], "reason": "llm_candidate"}] + + def llm_call(prompt): + return json.dumps( + { + "duplicate_groups": [ + { + "keep_id": "a", + "remove_ids": ["b", "c"], + "confidence": "high", + "reason": "too broad", + } + ], + "not_duplicates": [], + "uncertain": [], + } + ) + + deduped, report = semantic_dedup_items( + items, + candidates, + llm_call=llm_call, + max_deletion_ratio=0.5, + ) + + self.assertEqual(len(deduped), 3) + self.assertEqual(report["removed_count"], 0) + self.assertTrue(report["skipped_for_deletion_ratio"]) + + def test_semantic_dedup_ignores_groups_outside_candidate_sets(self): + items = [ + news_item("a", "Suno 完成融资"), + news_item("b", "Suno 完成 D 轮融资"), + news_item("c", "Ideogram 发布 v4"), + news_item("d", "OpenClaw 发布新版"), + ] + candidates = [{"item_ids": ["a", "b"], "reason": "title_similarity"}] + + def llm_call(prompt): + return json.dumps( + { + "duplicate_groups": [ + { + "keep_id": "a", + "remove_ids": ["b"], + "confidence": "high", + "reason": "same Suno event", + }, + { + "keep_id": "c", + "remove_ids": ["d"], + "confidence": "high", + "reason": "not part of candidates", + }, + ], + "not_duplicates": [], + "uncertain": [], + } + ) + + deduped, report = semantic_dedup_items(items, candidates, llm_call=llm_call) + + self.assertEqual([item.id for item in deduped], ["a", "c", "d"]) + self.assertEqual(report["removed_count"], 1) + self.assertIn("group_outside_candidates", report["errors"][0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage4_rewrite.py b/tests/test_stage4_rewrite.py new file mode 100644 index 0000000..62ef346 --- /dev/null +++ b/tests/test_stage4_rewrite.py @@ -0,0 +1,96 @@ +import json +import unittest + +from ai_daily_report.models import NewsItem +from ai_daily_report.rewrite import rewrite_items + + +def 
news_item(item_id="a"): + return NewsItem( + id=item_id, + source_group="AI HOT", + source_label="AI HOT", + source_role="primary", + source_priority=10, + title_raw="OpenAI launches GPT-5 API", + title_norm="openailaunchesgpt5api", + summary_raw="OpenAI launched the GPT-5 API with better latency.", + url="https://example.com/a", + canonical_url="https://example.com/a", + ) + + +class Stage4RewriteTests(unittest.TestCase): + def test_rewrite_items_writes_display_fields_without_overwriting_raw(self): + items = [news_item("a")] + + def llm_call(prompt): + return json.dumps( + { + "rewrites": [ + { + "id": "a", + "title": "OpenAI 发布 GPT-5 API", + "summary": "OpenAI 发布 GPT-5 API,延迟表现更好。", + "flags": [], + } + ] + }, + ensure_ascii=False, + ) + + rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=10) + + self.assertEqual(rewritten[0].title, "OpenAI 发布 GPT-5 API") + self.assertEqual(rewritten[0].summary, "OpenAI 发布 GPT-5 API,延迟表现更好。") + self.assertEqual(rewritten[0].title_raw, "OpenAI launches GPT-5 API") + self.assertEqual(report["rewritten_count"], 1) + self.assertEqual(report["fallback_count"], 0) + + def test_rewrite_items_falls_back_when_llm_fails(self): + items = [news_item("a")] + + def llm_call(prompt): + raise TimeoutError("slow") + + rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=10) + + self.assertEqual(rewritten[0].title, "OpenAI launches GPT-5 API") + self.assertEqual(rewritten[0].summary, "OpenAI launched the GPT-5 API with better latency.") + self.assertEqual(report["rewritten_count"], 0) + self.assertEqual(report["fallback_count"], 1) + self.assertIn("TimeoutError", report["errors"][0]) + + def test_rewrite_items_retries_failed_batch_as_single_items(self): + items = [news_item("a"), news_item("b")] + calls = [] + + def llm_call(prompt): + payload = json.loads(prompt) + ids = [item["id"] for item in payload["items"]] + calls.append(ids) + if len(ids) > 1: + return "not json" + return json.dumps( + { + "rewrites": 
[ + { + "id": ids[0], + "title": f"title {ids[0]}", + "summary": f"summary {ids[0]}", + "flags": [], + } + ] + } + ) + + rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=2) + + self.assertEqual([item.title for item in rewritten], ["title a", "title b"]) + self.assertEqual(report["rewritten_count"], 2) + self.assertEqual(report["fallback_count"], 0) + self.assertEqual(calls, [["a", "b"], ["a"], ["b"]]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage5_classify.py b/tests/test_stage5_classify.py new file mode 100644 index 0000000..a158ca3 --- /dev/null +++ b/tests/test_stage5_classify.py @@ -0,0 +1,61 @@ +import unittest + +from ai_daily_report.classify import SECTION_ORDER, classify_and_order_items +from ai_daily_report.models import NewsItem + + +def news_item(item_id, title, summary="", section_hint="", source_priority=50): + return NewsItem( + id=item_id, + source_group="AI HOT", + source_label="AI HOT", + source_role="primary", + source_priority=source_priority, + title_raw=title, + title_norm=title.lower(), + summary_raw=summary or f"{title} summary", + title=title, + summary=summary or f"{title} summary", + url=f"https://example.com/{item_id}", + canonical_url=f"https://example.com/{item_id}", + section_hint=section_hint, + ) + + +class Stage5ClassifyTests(unittest.TestCase): + def test_classify_maps_legacy_section_hints_to_new_sections(self): + items = [news_item("a", "GPT-5 发布", section_hint="模型发布/更新")] + + classified, report = classify_and_order_items(items) + + self.assertEqual(classified[0].section, "模型与能力") + self.assertEqual(report["hint_classified"], 1) + self.assertIn("模型与能力", SECTION_ORDER) + + def test_classify_uses_rules_when_hint_is_missing(self): + items = [ + news_item("a", "Anthropic 提交 IPO 文件", summary="Anthropic 计划上市并提交文件。"), + news_item("b", "MCP SDK 发布新版", summary="开发者可用新版 SDK 构建工具。"), + ] + + classified, report = classify_and_order_items(items) + by_id = {item.id: item for item in 
classified} + + self.assertEqual(by_id["a"].section, "公司与资本") + self.assertEqual(by_id["b"].section, "开发与基础设施") + self.assertEqual(report["rule_classified"], 2) + + def test_classify_orders_items_by_local_rank_score_within_sections(self): + items = [ + news_item("low", "普通模型更新", section_hint="模型发布/更新", source_priority=80), + news_item("high", "GPT-5 API 发布,延迟降低 30%", section_hint="模型发布/更新", source_priority=10), + ] + + classified, report = classify_and_order_items(items) + + self.assertEqual([item.id for item in classified], ["high", "low"]) + self.assertEqual(report["section_counts"]["模型与能力"], 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage6_guide.py b/tests/test_stage6_guide.py new file mode 100644 index 0000000..4399c4b --- /dev/null +++ b/tests/test_stage6_guide.py @@ -0,0 +1,77 @@ +import json +import unittest + +from ai_daily_report.guide import generate_guide +from ai_daily_report.models import NewsItem + + +def news_item(item_id, title, section="模型与能力"): + return NewsItem( + id=item_id, + source_group="AI HOT", + source_label="AI HOT", + source_role="primary", + source_priority=10, + title_raw=title, + title_norm=title.lower(), + summary_raw=f"{title} summary", + title=title, + summary=f"{title} summary", + url=f"https://example.com/{item_id}", + canonical_url=f"https://example.com/{item_id}", + section=section, + ) + + +class Stage6GuideTests(unittest.TestCase): + def test_generate_guide_returns_theme_and_valid_threads(self): + items = [ + news_item("a", "GPT-5 API 发布"), + news_item("b", "Miso One 开源语音模型"), + ] + + def llm_call(prompt): + return json.dumps( + { + "theme": "模型能力继续向 API 和实时语音两端推进。", + "threads": [ + { + "title": "模型能力继续推进", + "text": "GPT-5 API 和 Miso One 分别代表 API 能力和语音模型更新。", + "item_ids": ["a", "b"], + "kind": "thread", + }, + { + "title": "无效脉络", + "text": "这条引用了不存在的条目。", + "item_ids": ["missing"], + "kind": "thread", + }, + ], + }, + ensure_ascii=False, + ) + + guide, report = generate_guide(items, 
llm_call=llm_call) + + self.assertEqual(guide["theme"], "模型能力继续向 API 和实时语音两端推进。") + self.assertEqual(len(guide["threads"]), 1) + self.assertEqual(guide["threads"][0]["item_ids"], ["a", "b"]) + self.assertEqual(report["dropped_thread_count"], 1) + + def test_generate_guide_falls_back_when_llm_fails(self): + items = [news_item("a", "GPT-5 API 发布")] + + def llm_call(prompt): + raise TimeoutError("slow") + + guide, report = generate_guide(items, llm_call=llm_call) + + self.assertEqual(guide["theme"], "") + self.assertEqual(guide["threads"], []) + self.assertTrue(report["fallback_used"]) + self.assertIn("TimeoutError", report["errors"][0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage7_assemble.py b/tests/test_stage7_assemble.py new file mode 100644 index 0000000..e79b7e1 --- /dev/null +++ b/tests/test_stage7_assemble.py @@ -0,0 +1,65 @@ +import unittest + +from ai_daily_report.assemble import assemble_markdown, validate_markdown +from ai_daily_report.models import NewsItem + + +def news_item(item_id, title, section): + return NewsItem( + id=item_id, + source_group="AI HOT", + source_label="OpenAI:Blog", + source_role="primary", + source_priority=10, + title_raw=title, + title_norm=title.lower(), + summary_raw=f"{title} summary", + title=title, + summary=f"{title} summary", + url=f"https://example.com/{item_id}", + canonical_url=f"https://example.com/{item_id}", + section=section, + ) + + +class Stage7AssembleTests(unittest.TestCase): + def test_assemble_markdown_renders_sections_and_daily_threads(self): + items = [ + news_item("a", "GPT-5 API 发布", "模型与能力"), + news_item("b", "Anthropic 提交 IPO 文件", "公司与资本"), + ] + guide = { + "theme": "> 模型和资本两条线都在推进。[1]", + "threads": [ + { + "title": "模型能力产品化", + "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。", + "item_ids": ["a"], + "kind": "thread", + } + ], + } + + md, report = assemble_markdown(items, guide) + + self.assertIn("## 导览", md) + self.assertIn("> 模型和资本两条线都在推进。", md) + self.assertIn("## 模型与能力", 
md) + self.assertIn("**1. GPT-5 API 发布**", md) + self.assertIn("**2. Anthropic 提交 IPO 文件**", md) + self.assertIn("## 今日脉络", md) + self.assertIn("- **模型能力产品化**", md) + self.assertNotIn("> >", md) + self.assertNotIn("[1]", md) + self.assertEqual(report["item_count"], 2) + self.assertEqual(report["blocking_errors"], []) + + def test_validate_markdown_blocks_empty_report(self): + report = validate_markdown("", []) + + self.assertIn("no_items", report["blocking_errors"]) + self.assertIn("markdown_too_short", report["blocking_errors"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stage8_publish.py b/tests/test_stage8_publish.py new file mode 100644 index 0000000..0f7e342 --- /dev/null +++ b/tests/test_stage8_publish.py @@ -0,0 +1,76 @@ +import unittest + +from ai_daily_report.publish import publish_markdown + + +class FakeBlogClient: + def __init__(self): + self.created_payload = None + self.published_slug = None + + def create_post(self, payload): + self.created_payload = payload + return {"slug": "ai-2026-06-04"} + + def publish_post(self, slug): + self.published_slug = slug + + +class Stage8PublishTests(unittest.TestCase): + def test_publish_markdown_dry_run_does_not_call_client(self): + result = publish_markdown( + title="AI日报 · 2026-06-04", + markdown="## 导览\n\n> ok", + tags=["AI日报"], + slug="ai-2026-06-04", + base_url="https://blog.example", + mode="dry-run", + markdown_report={"blocking_errors": []}, + client=None, + ) + + self.assertEqual(result.status, "ok") + self.assertEqual(result.mode, "dry-run") + self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04") + self.assertTrue(result.public_ok) + + def test_publish_markdown_blocks_when_markdown_has_errors(self): + client = FakeBlogClient() + + result = publish_markdown( + title="AI日报 · 2026-06-04", + markdown="bad", + tags=["AI日报"], + slug="ai-2026-06-04", + base_url="https://blog.example", + mode="publish", + markdown_report={"blocking_errors": 
["markdown_too_short"]}, + client=client, + ) + + self.assertEqual(result.status, "blocked") + self.assertIsNone(client.created_payload) + self.assertIn("markdown_too_short", result.error) + + def test_publish_markdown_publish_mode_calls_client(self): + client = FakeBlogClient() + + result = publish_markdown( + title="AI日报 · 2026-06-04", + markdown="## 导览\n\n> ok", + tags=["AI日报"], + slug="ai-2026-06-04", + base_url="https://blog.example", + mode="publish", + markdown_report={"blocking_errors": []}, + client=client, + ) + + self.assertEqual(result.status, "ok") + self.assertEqual(client.created_payload["title"], "AI日报 · 2026-06-04") + self.assertEqual(client.published_slug, "ai-2026-06-04") + self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_validate.py b/tests/test_validate.py new file mode 100644 index 0000000..48a42f3 --- /dev/null +++ b/tests/test_validate.py @@ -0,0 +1,14 @@ +import unittest + +from ai_daily_report.validate import validate_report_markdown + + +class ValidateTests(unittest.TestCase): + def test_validate_report_markdown_delegates_markdown_checks(self): + report = validate_report_markdown("", []) + + self.assertIn("no_items", report["blocking_errors"]) + + +if __name__ == "__main__": + unittest.main()