From 5a986962559352fd58e6f2aff6ed4f9416768e43 Mon Sep 17 00:00:00 2001
From: Mimikko-zeus
Date: Thu, 4 Jun 2026 15:21:56 +0800
Subject: [PATCH] Refactor AI daily report pipeline
---
.gitignore | 9 +
ai_daily_report/__init__.py | 2 +
ai_daily_report/assemble.py | 77 ++
ai_daily_report/classify.py | 109 ++
ai_daily_report/cli.py | 40 +
ai_daily_report/clients.py | 64 +
ai_daily_report/collect.py | 95 ++
ai_daily_report/config.py | 19 +
ai_daily_report/dedupe.py | 100 ++
ai_daily_report/env.py | 143 +++
ai_daily_report/guide.py | 113 ++
ai_daily_report/llm.py | 18 +
ai_daily_report/models.py | 53 +
ai_daily_report/normalize.py | 132 ++
ai_daily_report/pipeline.py | 219 ++++
ai_daily_report/publish.py | 90 ++
ai_daily_report/rewrite.py | 103 ++
ai_daily_report/runner.py | 156 +++
ai_daily_report/semantic_dedupe.py | 167 +++
ai_daily_report/sources/__init__.py | 2 +
ai_daily_report/sources/aihot.py | 32 +
ai_daily_report/sources/juya.py | 58 +
ai_daily_report/sources/labels.py | 78 ++
ai_daily_report/sources/registry.py | 24 +
ai_daily_report/sources/rss.py | 51 +
ai_daily_report/validate.py | 46 +
config/pipeline.json | 16 +
config/sources.json | 58 +
docs/pipeline-optimization-plan.md | 786 ++++++++++++
.../2026-06-04-local-dry-run-foundation.md | 159 +++
script/ai_daily_blog_pipeline.py | 1115 +----------------
script/blog_markdown.md | 198 ---
script/run_meta.json | 35 -
skill/scripts/.gitkeep | 1 +
skill/scripts/run_daily_report.py | 7 +
tests/fixtures/.gitkeep | 1 +
tests/test_cli.py | 47 +
tests/test_clients.py | 47 +
tests/test_config_loading.py | 27 +
tests/test_dry_run_config.py | 33 +
tests/test_env_config.py | 87 ++
tests/test_env_loading.py | 39 +
tests/test_legacy_script_delegation.py | 57 +
tests/test_llm_utils.py | 17 +
tests/test_markdown_rendering.py | 39 +
tests/test_project_structure.py | 33 +
tests/test_runner.py | 132 ++
tests/test_source_labels.py | 55 +
tests/test_stage0_collect.py | 49 +
tests/test_stage0_to_2_pipeline.py | 32 +
tests/test_stage0_to_4_pipeline.py | 66 +
tests/test_stage0_to_5_pipeline.py | 62 +
tests/test_stage0_to_6_pipeline.py | 75 ++
tests/test_stage0_to_7_pipeline.py | 76 ++
tests/test_stage0_to_8_pipeline.py | 79 ++
tests/test_stage1_normalize.py | 85 ++
tests/test_stage2_dedupe.py | 63 +
tests/test_stage3_semantic_dedupe.py | 129 ++
tests/test_stage4_rewrite.py | 96 ++
tests/test_stage5_classify.py | 61 +
tests/test_stage6_guide.py | 77 ++
tests/test_stage7_assemble.py | 65 +
tests/test_stage8_publish.py | 76 ++
tests/test_validate.py | 14 +
64 files changed, 4778 insertions(+), 1316 deletions(-)
create mode 100644 .gitignore
create mode 100644 ai_daily_report/__init__.py
create mode 100644 ai_daily_report/assemble.py
create mode 100644 ai_daily_report/classify.py
create mode 100644 ai_daily_report/cli.py
create mode 100644 ai_daily_report/clients.py
create mode 100644 ai_daily_report/collect.py
create mode 100644 ai_daily_report/config.py
create mode 100644 ai_daily_report/dedupe.py
create mode 100644 ai_daily_report/env.py
create mode 100644 ai_daily_report/guide.py
create mode 100644 ai_daily_report/llm.py
create mode 100644 ai_daily_report/models.py
create mode 100644 ai_daily_report/normalize.py
create mode 100644 ai_daily_report/pipeline.py
create mode 100644 ai_daily_report/publish.py
create mode 100644 ai_daily_report/rewrite.py
create mode 100644 ai_daily_report/runner.py
create mode 100644 ai_daily_report/semantic_dedupe.py
create mode 100644 ai_daily_report/sources/__init__.py
create mode 100644 ai_daily_report/sources/aihot.py
create mode 100644 ai_daily_report/sources/juya.py
create mode 100644 ai_daily_report/sources/labels.py
create mode 100644 ai_daily_report/sources/registry.py
create mode 100644 ai_daily_report/sources/rss.py
create mode 100644 ai_daily_report/validate.py
create mode 100644 config/pipeline.json
create mode 100644 config/sources.json
create mode 100644 docs/pipeline-optimization-plan.md
create mode 100644 docs/plans/2026-06-04-local-dry-run-foundation.md
delete mode 100644 script/blog_markdown.md
delete mode 100644 script/run_meta.json
create mode 100644 skill/scripts/.gitkeep
create mode 100644 skill/scripts/run_daily_report.py
create mode 100644 tests/fixtures/.gitkeep
create mode 100644 tests/test_cli.py
create mode 100644 tests/test_clients.py
create mode 100644 tests/test_config_loading.py
create mode 100644 tests/test_dry_run_config.py
create mode 100644 tests/test_env_config.py
create mode 100644 tests/test_env_loading.py
create mode 100644 tests/test_legacy_script_delegation.py
create mode 100644 tests/test_llm_utils.py
create mode 100644 tests/test_markdown_rendering.py
create mode 100644 tests/test_project_structure.py
create mode 100644 tests/test_runner.py
create mode 100644 tests/test_source_labels.py
create mode 100644 tests/test_stage0_collect.py
create mode 100644 tests/test_stage0_to_2_pipeline.py
create mode 100644 tests/test_stage0_to_4_pipeline.py
create mode 100644 tests/test_stage0_to_5_pipeline.py
create mode 100644 tests/test_stage0_to_6_pipeline.py
create mode 100644 tests/test_stage0_to_7_pipeline.py
create mode 100644 tests/test_stage0_to_8_pipeline.py
create mode 100644 tests/test_stage1_normalize.py
create mode 100644 tests/test_stage2_dedupe.py
create mode 100644 tests/test_stage3_semantic_dedupe.py
create mode 100644 tests/test_stage4_rewrite.py
create mode 100644 tests/test_stage5_classify.py
create mode 100644 tests/test_stage6_guide.py
create mode 100644 tests/test_stage7_assemble.py
create mode 100644 tests/test_stage8_publish.py
create mode 100644 tests/test_validate.py
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1283968
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+.env
+.env.*
+!.env.example
+__pycache__/
+*.py[cod]
+.pytest_cache/
+runs/
+runs-*/
+.idea/
diff --git a/ai_daily_report/__init__.py b/ai_daily_report/__init__.py
new file mode 100644
index 0000000..5f84311
--- /dev/null
+++ b/ai_daily_report/__init__.py
@@ -0,0 +1,2 @@
+"""Core package for the AI daily report pipeline."""
+
diff --git a/ai_daily_report/assemble.py b/ai_daily_report/assemble.py
new file mode 100644
index 0000000..b66e6ea
--- /dev/null
+++ b/ai_daily_report/assemble.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from .classify import SECTION_ORDER
+from .models import NewsItem
+from .validate import validate_markdown
+
+
+END_PUNCTUATION = "。！？；.!?;"  # sentence-final marks, full-width and ASCII forms
+
+
+def _clean_text(text: str) -> str:
+    """Strip code fences, a leading quote marker, citation tags and the "主线判断" label from *text*."""
+    value = re.sub(r"^```(?:\w+)?\s*\n?", "", (text or "").strip())  # opening code fence
+    value = re.sub(r"\n?```\s*$", "", value)  # closing code fence
+    value = re.sub(r"^\s*>\s*", "", value)  # leading blockquote marker
+    value = re.sub(r"\[\d+\]|\[N\]", "", value)  # citation markers such as [3] or [N]
+    value = re.sub(r"主线判断[：:]\s*", "", value)  # label followed by a full-width or ASCII colon
+    return re.sub(r"\s+", " ", value).strip()
+
+
+def _ensure_sentence(text: str) -> str:
+    """Return cleaned *text*, appending "。" when it is non-empty and lacks end punctuation."""
+    cleaned = _clean_text(text)
+    needs_stop = bool(cleaned) and cleaned[-1] not in END_PUNCTUATION
+    return cleaned + "。" if needs_stop else cleaned
+
+
+def _source_link(item: NewsItem) -> str:
+    """Render the item's source name, as a Markdown link when the item has a URL."""
+    label = item.source_label or item.source_group or "来源"
+    rendered = f"[{label} ↗]({item.url})" if item.url else label
+    return rendered
+
+
+def assemble_markdown(items: list[NewsItem], guide: dict[str, Any] | None = None) -> tuple[str, dict[str, Any]]:  # Render the report Markdown and validate it.
+    guide = guide or {"theme": "", "threads": []}  # a missing guide renders no guide sections
+    lines: list[str] = []  # output Markdown, one entry per line
+
+    theme = _clean_text(str(guide.get("theme") or ""))
+    if theme:
+        lines.extend(["## 导览", "", f"> {theme}", ""])
+
+    item_number = 1  # numbering runs continuously across sections
+    for section in SECTION_ORDER:  # items whose section is not in SECTION_ORDER are not rendered
+        section_items = [item for item in items if item.section == section]
+        if not section_items:
+            continue  # empty sections get no heading
+        lines.extend([f"## {section}", ""])
+        for item in section_items:
+            title = _clean_text(item.title or item.title_raw)
+            summary = _ensure_sentence(item.summary or item.summary_raw or "该条目暂无摘要。")
+            lines.extend(
+                [
+                    f"**{item_number}. {title}**",
+                    "",
+                    f"> {summary}{_source_link(item)}",
+                    "",
+                ]
+            )
+            item_number += 1
+
+    threads = guide.get("threads", []) or []
+    if threads:
+        lines.extend(["## 今日脉络", ""])  # heading is emitted even if every thread below is skipped
+        for thread in threads:
+            title = _clean_text(str(thread.get("title") or ""))
+            text = _ensure_sentence(str(thread.get("text") or ""))
+            if not title or not text:
+                continue  # threads without both a title and a text are left out
+            lines.extend([f"- **{title}**", f" {text}", ""])
+
+    markdown = "\n".join(lines).strip()
+    report = validate_markdown(markdown, items)
+    return markdown, report  # (markdown text, validation report)
diff --git a/ai_daily_report/classify.py b/ai_daily_report/classify.py
new file mode 100644
index 0000000..4beca1f
--- /dev/null
+++ b/ai_daily_report/classify.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from collections import Counter
+from typing import Any
+
+from .models import NewsItem
+
+
+SECTION_ORDER = [  # canonical section names; list order is the sort order used by classify_and_order_items
+    "模型与能力",
+    "产品与应用",
+    "开发与基础设施",
+    "公司与资本",
+    "政策与安全",
+    "论文与研究",
+    "观点与教程",
+    "人物与动态",
+]
+
+SECTION_ALIASES = {  # section-hint labels mapped to canonical section names
+    "模型发布/更新": "模型与能力",
+    "产品发布/更新": "产品与应用",
+    "产品与工具": "产品与应用",
+    "开发与工程": "开发与基础设施",
+    "行业动态": "公司与资本",
+    "行业与公司": "公司与资本",
+    "论文研究": "论文与研究",
+    "论文与研究": "论文与研究",
+    "技巧与观点": "观点与教程",
+    "观点与教程": "观点与教程",
+    "人物与花絮": "人物与动态",
+}
+
+
+RULES = [  # (section, keywords) pairs; rule_classify returns the first section with a keyword hit
+    ("政策与安全", ("监管", "政策", "安全", "风险", "滥用", "攻击", "合规", "版权")),
+    ("论文与研究", ("论文", "研究", "arxiv", "cvpr", "benchmark", "评测", "实验")),
+    ("开发与基础设施", ("sdk", "api", "mcp", "kubernetes", "框架", "开源", "github", "部署", "基础设施")),
+    ("公司与资本", ("融资", "ipo", "上市", "招股书", "合作", "估值", "收购", "资本")),
+    ("模型与能力", ("模型", "gpt", "claude", "gemini", "grok", "token", "参数", "多模态", "语音", "推理")),
+    ("产品与应用", ("agent", "应用", "产品", "平台", "上线", "工具", "智能体")),
+    ("观点与教程", ("教程", "观点", "方法论", "guide", "实践", "技巧")),
+    ("人物与动态", ("黄仁勋", "纳德拉", "访谈", "演讲", "人物")),
+]
+
+
+def normalize_section_hint(section_hint: str) -> str:
+    """Map a section hint to its canonical section name, or "" when it is not recognised."""
+    cleaned = (section_hint or "").strip()
+    canonical = cleaned if cleaned in SECTION_ORDER else SECTION_ALIASES.get(cleaned, "")
+    return canonical
+
+
+def rule_classify(item: NewsItem) -> str:
+    """Return the first RULES section that has a keyword occurring in the item's title or summary."""
+    haystack = f"{item.title or item.title_raw} {item.summary or item.summary_raw}".lower()
+    hits = (name for name, words in RULES if any(word.lower() in haystack for word in words))
+    # No keyword matched at all: fall back to the "公司与资本" section.
+    return next(hits, "公司与资本")
+
+
+def rank_score(item: NewsItem) -> int:
+    """Score an item for ordering inside its section; higher scores sort first."""
+    text = f"{item.title or item.title_raw} {item.summary or item.summary_raw}"
+    bonuses = (
+        10 if item.source_role == "primary" else 0,
+        10 if item.canonical_url else 0,
+        10 if any(ch.isdigit() for ch in text) else 0,  # title or summary contains a digit
+        # Each merged duplicate adds 5 points, capped at 20.
+        min(20, len(item.duplicate_sources) * 5) if item.duplicate_sources else 0,
+    )
+    penalty = len(item.quality_flags) * 10
+    base = max(0, 200 - item.source_priority)
+    return base + sum(bonuses) - penalty
+
+
+def classify_and_order_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, Any]]:
+    """Assign a section to every item and order the items for rendering.
+
+    Items are mutated in place; returns ``(ordered_items, report)``.
+    """
+    hint_classified = 0
+    for item in items:
+        mapped = normalize_section_hint(item.section_hint)
+        # A recognised source hint wins; otherwise the keyword rules decide.
+        item.section = mapped or rule_classify(item)
+        hint_classified += 1 if mapped else 0
+    rule_classified = len(items) - hint_classified
+
+    position_of = dict(zip(SECTION_ORDER, range(len(SECTION_ORDER))))
+
+    def _sort_key(item: NewsItem) -> tuple[int, int, str]:
+        # Section position, then descending rank score, then title as tie-break.
+        position = position_of.get(item.section or "", len(SECTION_ORDER))
+        return position, -rank_score(item), item.title or item.title_raw
+
+    ordered = sorted(items, key=_sort_key)
+    section_counts = Counter(item.section for item in ordered if item.section)
+    invalid = [item for item in ordered if item.section not in SECTION_ORDER]
+    report = {
+        "input_count": len(items),
+        "section_counts": dict(section_counts),
+        "hint_classified": hint_classified,
+        "rule_classified": rule_classified,
+        "llm_classified": 0,
+        "fallback_classified": 0,
+        "invalid_section_count": len(invalid),
+    }
+    return ordered, report
diff --git a/ai_daily_report/cli.py b/ai_daily_report/cli.py
new file mode 100644
index 0000000..539cbce
--- /dev/null
+++ b/ai_daily_report/cli.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from .runner import run_daily_report
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Create the ``ai-daily-report`` parser with its single ``run`` subcommand."""
+    parser = argparse.ArgumentParser(prog="ai-daily-report")
+    run = parser.add_subparsers(dest="command").add_parser("run")
+    run.add_argument("--date", default="today")
+    run.add_argument("--mode", choices=["dry-run", "draft", "publish"], default="dry-run")
+    for flag in ("--source-mode", "--llm-mode"):  # both default to "mock"
+        run.add_argument(flag, choices=["mock", "live"], default="mock")
+    plain_defaults = {"--out-dir": "runs", "--base-url": "https://blog.ephron.ren", "--sources-path": None}
+    for flag, default in plain_defaults.items():
+        run.add_argument(flag, default=default)
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse *argv*, run the pipeline when the ``run`` subcommand is given, and return 0."""
+    args = build_parser().parse_args(argv)
+    if args.command == "run":
+        run_daily_report(
+            sources_path=Path(args.sources_path) if args.sources_path else None,
+            base_url=args.base_url,
+            out_dir=Path(args.out_dir),
+            llm_mode=args.llm_mode,
+            source_mode=args.source_mode,
+            mode=args.mode,
+            run_date=args.date,
+        )
+    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/ai_daily_report/clients.py b/ai_daily_report/clients.py
new file mode 100644
index 0000000..2fd3359
--- /dev/null
+++ b/ai_daily_report/clients.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import json
+import urllib.request
+from typing import Any
+
+
+UA = "Mozilla/5.0 (compatible; ai-daily-report/1.0)"  # User-Agent header sent by fetch_text and BlogApiClient
+
+
+def fetch_text(url: str, timeout_seconds: int) -> str:
+    request = urllib.request.Request(url, headers={"User-Agent": UA})  # request with the shared UA header
+    with urllib.request.urlopen(request, timeout=timeout_seconds) as resp:
+        return resp.read().decode("utf-8", errors="ignore")  # undecodable bytes are dropped
+
+
+class OpenAICompatibleClient:
+    """Minimal client for an OpenAI-compatible ``/chat/completions`` endpoint."""
+    def __init__(self, *, api_key: str, base_url: str, model: str, timeout_seconds: int = 600):
+        self.api_key = api_key
+        self.base_url = base_url.rstrip("/")  # avoids "//" when the endpoint path is appended
+        self.model = model
+        self.timeout_seconds = timeout_seconds
+
+    def chat(self, prompt: str) -> str:
+        """Send *prompt* as a single user message and return the reply text ("" if it has none)."""
+        body = {
+            "model": self.model,
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": 0.2,
+            "max_tokens": 8000,
+        }
+        req = urllib.request.Request(
+            f"{self.base_url}/chat/completions",
+            data=json.dumps(body, ensure_ascii=False).encode("utf-8"),
+            headers={"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"},
+        )
+        with urllib.request.urlopen(req, timeout=self.timeout_seconds) as response:
+            data = json.loads(response.read().decode("utf-8"))
+        # "content" may be null in a valid response; treat that as empty text instead of crashing.
+        return (data["choices"][0]["message"]["content"] or "").strip()
+
+
+class BlogApiClient:
+    """Bearer-token JSON client for the blog service endpoints under ``/api/service``."""
+    def __init__(self, *, base_url: str, token: str, timeout_seconds: int = 25):
+        self.base_url = base_url.rstrip("/")
+        self.token = token
+        self.timeout_seconds = timeout_seconds
+
+    def _request(self, method: str, path: str, payload: dict[str, Any] | None = None) -> dict[str, Any]:
+        headers = {"Authorization": f"Bearer {self.token}", "User-Agent": UA}
+        data = None if payload is None else json.dumps(payload, ensure_ascii=False).encode("utf-8")
+        if data is not None:
+            headers["Content-Type"] = "application/json"
+        req = urllib.request.Request(f"{self.base_url}{path}", data=data, headers=headers, method=method)
+        with urllib.request.urlopen(req, timeout=self.timeout_seconds) as response:
+            return json.loads(response.read().decode("utf-8").strip() or "{}")  # empty body -> {}
+
+    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
+        return self._request("POST", "/api/service/posts", payload)
+
+    def publish_post(self, slug: str) -> None:
+        self._request("POST", f"/api/service/posts/{slug}/publish")  # response body is ignored
diff --git a/ai_daily_report/collect.py b/ai_daily_report/collect.py
new file mode 100644
index 0000000..b1c947e
--- /dev/null
+++ b/ai_daily_report/collect.py
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timezone
+from time import perf_counter
+from typing import Callable, Iterable, Any
+
+from .models import SourceConfig, SourceResult
+
+
+Fetcher = Callable[[SourceConfig, str], list[dict[str, Any]]]  # called as fetcher(config, run_date); returns item dicts
+
+
+def _status_from_exception(exc: Exception) -> str:
+    """Map *exc* to "timeout" or "error", also recognising a timeout wrapped in ``URLError.reason``."""
+    candidates = (exc, getattr(exc, "reason", None))  # urllib can wrap socket timeouts in URLError
+    return "timeout" if any(isinstance(c, TimeoutError) for c in candidates) else "error"
+
+
+def _collect_one(config: SourceConfig, run_date: str, fetcher: Fetcher) -> SourceResult:
+    """Fetch one source and wrap the outcome in a ``SourceResult``.
+
+    A disabled source is reported with status "disabled" without calling
+    *fetcher*. Any exception raised while fetching is captured in the result
+    (status "timeout" or "error") instead of propagating to the caller.
+    """
+    common = {
+        "source": config.name,
+        "role": config.role,
+        "fetched_at": datetime.now(timezone.utc).isoformat(),
+    }
+    if not config.enabled:
+        return SourceResult(ok=False, status="disabled", **common)
+
+    started = perf_counter()
+    try:
+        items = fetcher(config, run_date)
+        elapsed_ms = int((perf_counter() - started) * 1000)
+        # An empty item list is reported as "empty" and counts as not ok.
+        status = "ok" if items else "empty"
+        return SourceResult(
+            ok=status == "ok",
+            status=status,
+            items=items,
+            elapsed_ms=elapsed_ms,
+            **common,
+        )
+    except Exception as exc:
+        failure = {
+            "elapsed_ms": int((perf_counter() - started) * 1000),
+            "ok": False,
+            "status": _status_from_exception(exc),
+            "error": f"{type(exc).__name__}: {exc}",
+        }
+        return SourceResult(**failure, **common)
+
+
+def collect_sources(
+    configs: Iterable[SourceConfig],
+    run_date: str,
+    *,
+    fetcher: Fetcher,
+    max_workers: int | None = None,
+) -> tuple[list[SourceResult], dict[str, Any]]:
+    """Collect every source concurrently; return results in config order plus a summary report."""
+    ordered_configs = list(configs)
+    if not ordered_configs:
+        return [], {
+            "input_source_count": 0,
+            "ok_source_count": 0,
+            "failed_source_count": 0,
+            "raw_item_count": 0,
+        }
+
+    workers = max_workers or min(8, len(ordered_configs))
+    # Results are stored by position, not by name, so two configs sharing a name keep separate results.
+    results: list[SourceResult | None] = [None] * len(ordered_configs)
+
+    with ThreadPoolExecutor(max_workers=workers) as executor:
+        futures = {
+            executor.submit(_collect_one, config, run_date, fetcher): index
+            for index, config in enumerate(ordered_configs)
+        }
+        for future in as_completed(futures):
+            results[futures[future]] = future.result()
+
+    report = {
+        "input_source_count": len(results),
+        "ok_source_count": sum(1 for result in results if result.ok),
+        "failed_source_count": sum(1 for result in results if not result.ok),
+        "raw_item_count": sum(len(result.items) for result in results),
+        "source_counts": {result.source: len(result.items) for result in results},
+        "statuses": {result.source: result.status for result in results},
+    }
+    return results, report
diff --git a/ai_daily_report/config.py b/ai_daily_report/config.py
new file mode 100644
index 0000000..03b426d
--- /dev/null
+++ b/ai_daily_report/config.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from .models import SourceConfig
+from .pipeline import _source_config_from_dict
+
+
+def load_json(path: Path) -> Any:  # Parse the UTF-8 encoded JSON document stored at path.
+    return json.loads(path.read_bytes().decode("utf-8"))
+
+
+def load_source_configs(path: Path) -> list[SourceConfig]:
+    """Load the JSON list at *path* as SourceConfig objects; raise ValueError if it is not a list."""
+    if not isinstance(entries := load_json(path), list):
+        raise ValueError("sources config must be a list")
+    return list(map(_source_config_from_dict, entries))
diff --git a/ai_daily_report/dedupe.py b/ai_daily_report/dedupe.py
new file mode 100644
index 0000000..6a9e426
--- /dev/null
+++ b/ai_daily_report/dedupe.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+
+import difflib
+from typing import Any
+
+from .models import NewsItem
+
+
+def _item_score(item: NewsItem) -> int:
+    """Score how suitable *item* is as the representative kept from a duplicate group."""
+    parts = [
+        # A lower source_priority value yields a larger base score.
+        max(0, 200 - item.source_priority),
+        20 if item.canonical_url else 0,
+        # One point per character of raw summary, capped at 40.
+        min(40, len(item.summary_raw)) if item.summary_raw else 0,
+        10 if item.section_hint else 0,
+        10 if item.source_role == "primary" else 0,
+        -10 * len(item.quality_flags),
+    ]
+    return sum(parts)
+
+
+def _merge_group(group: list[NewsItem], reason: str) -> tuple[NewsItem, list[NewsItem], dict[str, Any]]:
+    """Keep the best-scoring item of *group* and fold the others into it.
+
+    Returns ``(keep, removed, report_group)``; ``keep.duplicate_sources`` is
+    extended in place with one entry per removed item.
+    """
+    keep = max(group, key=_item_score)
+    removed = [item for item in group if item is not keep]
+    for removed_item in removed:
+        entry = {name: getattr(removed_item, name) for name in ("id", "source_group", "source_label", "url")}
+        keep.duplicate_sources.append({**entry, "reason": reason})
+        # Sources already merged into the removed item in an earlier pass must not be lost.
+        keep.duplicate_sources.extend(removed_item.duplicate_sources)
+    report_group = {
+        "reason": reason,
+        "keep_id": keep.id,
+        "removed_ids": [item.id for item in removed],
+        "confidence": "high",
+    }
+    return keep, removed, report_group
+
+
+def _group_by_key(items: list[NewsItem], key_name: str) -> dict[str, list[NewsItem]]:
+    """Bucket items by their truthy *key_name* attribute, keeping only buckets of two or more."""
+    buckets: dict[str, list[NewsItem]] = {}
+    for entry in items:
+        if value := getattr(entry, key_name):
+            buckets.setdefault(value, []).append(entry)
+    return {value: bucket for value, bucket in buckets.items() if len(bucket) >= 2}
+
+
+def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
+    """List item pairs whose normalized titles have a similarity ratio of at least 0.65."""
+    flagged: list[dict[str, Any]] = []
+    for position, first in enumerate(items):
+        if not first.title_norm:
+            continue
+        for second in items[position + 1 :]:
+            if not second.title_norm:
+                continue
+            score = difflib.SequenceMatcher(None, first.title_norm, second.title_norm).ratio()
+            if score < 0.65:
+                continue
+            # The pair is only reported; nothing is removed here.
+            pair = {"item_ids": [first.id, second.id], "reason": "title_similarity"}
+            pair.update(similarity=round(score, 3), confidence="medium")
+            flagged.append(pair)
+    return flagged
+
+
+def hard_dedup_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, Any]]:  # Merge exact URL/title duplicates.
+    remaining = list(items)
+    removed_object_ids: set[int] = set()  # id() values of dropped objects (object identity, not NewsItem.id)
+    groups_report: list[dict[str, Any]] = []
+
+    for key_name, reason in (  # passes run in this order: canonical URL first, then normalized title
+        ("canonical_url", "same_canonical_url"),
+        ("title_norm", "same_title_norm"),
+    ):
+        grouped = _group_by_key([item for item in remaining if id(item) not in removed_object_ids], key_name)
+        for group in grouped.values():
+            active_group = [item for item in group if id(item) not in removed_object_ids]
+            if len(active_group) < 2:
+                continue
+            keep, removed, report_group = _merge_group(active_group, reason)  # updates keep.duplicate_sources
+            removed_object_ids.update(id(item) for item in removed)
+            groups_report.append(report_group)
+
+    deduped = [item for item in remaining if id(item) not in removed_object_ids]  # input order is preserved
+    report = {
+        "input_count": len(items),
+        "output_count": len(deduped),
+        "removed_count": len(removed_object_ids),
+        "groups": groups_report,
+        "possible_duplicates": _possible_duplicates(deduped),  # near-matches among the survivors, report only
+    }
+    return deduped, report
diff --git a/ai_daily_report/env.py b/ai_daily_report/env.py
new file mode 100644
index 0000000..a5697f0
--- /dev/null
+++ b/ai_daily_report/env.py
@@ -0,0 +1,143 @@
+from __future__ import annotations
+
+import os
+import json
+from pathlib import Path
+
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]  # directory that contains the package directory of this file
+
+
+def read_env_file(env_path: Path) -> dict[str, str]:
+    """Parse KEY=VALUE lines from *env_path*; a missing file yields an empty dict."""
+    if not env_path.exists():
+        return {}
+    parsed: dict[str, str] = {}
+    for raw in env_path.read_text(encoding="utf-8", errors="ignore").splitlines():
+        entry = raw.strip()
+        # Skip blank lines, comments and lines without an assignment.
+        if entry and not entry.startswith("#") and "=" in entry:
+            name, _, raw_value = entry.partition("=")
+            parsed[name.strip()] = raw_value.strip().strip('"').strip("'")
+    return parsed
+
+
+def load_env() -> dict[str, str]:
+    """Merge the project .env, then ~/.hermes/.env, then non-empty process environment variables."""
+    merged = read_env_file(PROJECT_ROOT / ".env")
+    merged.update(read_env_file(Path.home() / ".hermes" / ".env"))  # overrides the project file
+    merged.update((name, value) for name, value in os.environ.items() if value)
+    return merged
+
+
+def first_env(env: dict[str, str], *names: str) -> str:
+    """Return the first non-blank, stripped value found under *names* in *env*, else ""."""
+    stripped = ((env.get(name) or "").strip() for name in names)
+    # The generator is lazy, so the lookup stops at the first usable value.
+    found = next((value for value in stripped if value), "")
+    return found
+
+
+def _load_simple_yaml(path: Path) -> dict[str, object]:  # Minimal reader for nested "key: value" mappings.
+    if not path.exists():
+        return {}
+    root: dict[str, object] = {}
+    stack: list[tuple[int, dict[str, object]]] = [(-1, root)]  # (indent, mapping) for each open parent
+    for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
+        if not raw_line.strip() or raw_line.lstrip().startswith("#") or ":" not in raw_line:
+            continue  # blank lines, comment lines and lines without ":" are ignored
+        indent = len(raw_line) - len(raw_line.lstrip(" "))  # only leading spaces count as indentation
+        key, value = raw_line.strip().split(":", 1)
+        key = key.strip()
+        value = value.strip().strip('"').strip("'")  # values are always stored as strings
+        while stack and indent <= stack[-1][0]:
+            stack.pop()  # close mappings opened at the same or a deeper indentation
+        current = stack[-1][1]
+        if value:
+            current[key] = value
+        else:
+            child: dict[str, object] = {}  # "key:" without a value opens a nested mapping
+            current[key] = child
+            stack.append((indent, child))
+    return root
+
+
+def _env_with_hermes(env: dict[str, str], hermes_dir: Path) -> dict[str, str]:
+    """Layer *env* over the values read from ``hermes_dir/.env``; *env* wins on conflicts."""
+    hermes_values = read_env_file(hermes_dir / ".env")
+    return {**hermes_values, **env}
+
+
+def _provider_env_names(provider: str) -> tuple[str, str, str]:
+    stem = provider.replace("-", "_").upper()  # e.g. "my-llm" -> "MY_LLM"
+    return stem + "_API_KEY", stem + "_BASE_URL", stem + "_MODEL"
+
+
+def _auth_json_key(env: dict[str, str], hermes_dir: Path, provider: str) -> str:  # Provider API key via auth.json, or "".
+    auth_path = hermes_dir / "auth.json"
+    if not auth_path.exists() or not provider:
+        return ""
+    try:
+        auth = json.loads(auth_path.read_text(encoding="utf-8"))
+    except Exception:
+        return ""  # best effort: an unreadable or invalid auth.json means "no key"
+    pool = auth.get("credential_pool", {}) or {}
+    provider_keys = [provider, provider.replace("-", "_")]  # try the name as given, then with underscores
+    for key in provider_keys:
+        creds = pool.get(key, []) or []
+        if not creds:
+            continue
+        cred = creds[0]  # only the first credential listed for a provider is considered
+        source = str(cred.get("source") or "")
+        if source.startswith("env:"):
+            resolved = first_env(env, source[4:])  # "env:NAME" refers to the variable NAME in env
+            if resolved:
+                return resolved
+        token = str(cred.get("access_token") or "").strip()
+        if token:
+            return token
+    return ""
+
+
+def resolve_llm_config(env: dict[str, str], *, hermes_dir: Path | None = None) -> dict[str, str]:  # Resolve api_key/base_url/model.
+    hermes_dir = hermes_dir or Path.home() / ".hermes"
+    env = _env_with_hermes(env, hermes_dir)  # values from hermes_dir/.env only fill gaps in env
+    hermes_config = _load_simple_yaml(hermes_dir / "config.yaml")
+    model_config = hermes_config.get("model", {}) if isinstance(hermes_config.get("model"), dict) else {}
+    provider = str(model_config.get("provider") or "").strip()
+    provider_key, provider_base_url, provider_model = _provider_env_names(provider) if provider else ("", "", "")
+
+    api_key = first_env(env, "LLM_API_KEY")  # 1) explicit LLM_* variables win
+    base_url = first_env(env, "LLM_BASE_URL")
+    model = first_env(env, "LLM_MODEL")
+
+    if not api_key and provider:  # 2) settings of the provider named under "model" in config.yaml
+        api_key = first_env(env, provider_key) or _auth_json_key(env, hermes_dir, provider)
+    if not base_url and provider:
+        base_url = first_env(env, provider_base_url) or str(model_config.get("base_url") or "").strip()
+    if not model and provider:
+        model = first_env(env, provider_model) or str(model_config.get("default") or "").strip()
+
+    if not api_key:  # 3) last resort: fixed provider variables; each field is resolved independently
+        api_key = first_env(env, "SUB2API_API_KEY", "XIAOMI_API_KEY", "OPENROUTER_API_KEY")
+    if not base_url:
+        base_url = first_env(env, "SUB2API_BASE_URL", "XIAOMI_BASE_URL", "OPENROUTER_BASE_URL")
+    if not model:
+        model = first_env(env, "SUB2API_MODEL", "XIAOMI_MODEL")  # NOTE(review): no OPENROUTER_MODEL here, unlike key/base_url — confirm
+
+    missing = [  # reported under the LLM_* names regardless of which fallback was tried
+        name
+        for name, value in (
+            ("LLM_API_KEY", api_key),
+            ("LLM_BASE_URL", base_url),
+            ("LLM_MODEL", model),
+        )
+        if not value
+    ]
+    if missing:
+        raise ValueError("missing_llm_config: " + ",".join(missing))  # raised when any of the three is unresolved
+    return {"api_key": api_key, "base_url": base_url, "model": model}
+
+
+def resolve_blog_token(env: dict[str, str]) -> str:  # BLOG_SERVICE_TOKEN is preferred over EPHRON_SERVICE_TOKEN.
+    return first_env(env, "BLOG_SERVICE_TOKEN", "EPHRON_SERVICE_TOKEN")  # "" when neither is set
diff --git a/ai_daily_report/guide.py b/ai_daily_report/guide.py
new file mode 100644
index 0000000..63d8b89
--- /dev/null
+++ b/ai_daily_report/guide.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Callable
+
+from .llm import parse_json_object
+from .models import NewsItem
+
+
+GuideLlmCall = Callable[[str], str]  # called with the prompt text; its return value is parsed as a JSON object
+
+
+def _clean_text(text: str, limit: int | None = None) -> str:
+    """Drop a leading quote marker and citation tags, squeeze whitespace, cap at *limit* characters."""
+    stripped = re.sub(r"^\s*>\s*", "", text or "").strip()
+    without_refs = re.sub(r"\[\d+\]|\[N\]", "", stripped)
+    squeezed = re.sub(r"\s+", " ", without_refs).strip()
+    over_limit = bool(limit) and len(squeezed) > limit
+    return squeezed[:limit].rstrip() if over_limit else squeezed
+
+
+def _build_prompt(items: list[NewsItem]) -> str:  # Serialize the task, the items and the output schema as one JSON prompt.
+    payload = {
+        "task": (
+            "Generate a concise AI daily report guide. Return JSON only. Do not use 强信号/中信号/待验证. "
+            "Use a short theme and 2-4 daily threads. Every thread must reference existing item_ids."
+        ),
+        "items": [
+            {
+                "id": item.id,  # generate_guide keeps only threads that cite these ids
+                "title": item.title or item.title_raw,
+                "summary": item.summary or item.summary_raw,
+                "section": item.section,
+                "source": item.source_label,
+            }
+            for item in items
+        ],
+        "output_schema": {  # example shape of the expected reply
+            "theme": "one sentence under 120 Chinese characters",
+            "threads": [
+                {
+                    "title": "thread title",
+                    "text": "one or two sentences",
+                    "item_ids": ["existing item id"],
+                    "kind": "thread|uncertain",
+                }
+            ],
+        },
+    }
+    return json.dumps(payload, ensure_ascii=False)  # non-ASCII text is kept unescaped
+
+
+def generate_guide(
+    items: list[NewsItem],
+    *,
+    llm_call: GuideLlmCall,
+) -> tuple[dict[str, Any], dict[str, Any]]:
+    """Ask the LLM for a theme and threads and return ``(guide, report)``.
+
+    No LLM call is made for an empty *items* list. If the call or JSON parsing
+    fails, an empty guide is returned with ``fallback_used`` set. Threads that
+    are malformed, cite no known item id, or lack a title/text are dropped.
+    """
+    guide: dict[str, Any] = {"theme": "", "threads": []}
+    report: dict[str, Any] = {
+        "input_count": len(items),
+        "theme_present": False,
+        "thread_count": 0,
+        "dropped_thread_count": 0,
+        "fallback_used": False,
+        "errors": [],
+    }
+    if not items:
+        return guide, report
+
+    try:
+        obj = parse_json_object(llm_call(_build_prompt(items)))
+    except Exception as exc:  # best-effort stage: record the failure instead of raising
+        report["fallback_used"] = True
+        report["errors"] = [f"{type(exc).__name__}: {exc}"]
+        return guide, report
+
+    # Only ids of items that were sent to the LLM may be cited by a thread.
+    valid_ids = {item.id for item in items}
+    raw_threads = obj.get("threads")
+    for thread in (raw_threads if isinstance(raw_threads, list) else []):
+        # LLM output is untrusted: a thread may be a non-dict and item_ids a non-list.
+        if not isinstance(thread, dict):
+            report["dropped_thread_count"] += 1
+            continue
+        raw_ids = thread.get("item_ids")
+        item_ids = [
+            item_id
+            for item_id in (raw_ids if isinstance(raw_ids, list) else [])
+            if isinstance(item_id, str) and item_id in valid_ids
+        ]
+        title = _clean_text(str(thread.get("title") or ""), limit=80)
+        text = _clean_text(str(thread.get("text") or ""), limit=220)
+        if not (item_ids and title and text):
+            report["dropped_thread_count"] += 1
+            continue
+        kind = thread.get("kind")
+        if kind not in ("thread", "uncertain"):
+            kind = "thread"
+        guide["threads"].append(
+            {"title": title, "text": text, "item_ids": item_ids, "kind": kind}
+        )
+
+    guide["theme"] = _clean_text(str(obj.get("theme") or ""), limit=120)
+    report["theme_present"] = bool(guide["theme"])
+    report["thread_count"] = len(guide["threads"])
+    return guide, report
diff --git a/ai_daily_report/llm.py b/ai_daily_report/llm.py
new file mode 100644
index 0000000..33c8769
--- /dev/null
+++ b/ai_daily_report/llm.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Callable
+
+
+LlmCall = Callable[[str], str]  # type alias: a callable taking one string and returning a string
+
+
+def parse_json_object(text: str) -> dict[str, Any]:
+    """Parse the JSON object that ends *text*, ignoring one surrounding Markdown code fence."""
+    unfenced = re.sub(r"\n?```\s*$", "", re.sub(r"^```(?:json)?\s*\n?", "", text.strip()))
+    found = re.search(r"\{.*\}\s*$", unfenced, re.S)
+    if found is None:
+        raise ValueError("LLM output does not contain a JSON object")
+    return json.loads(found.group(0))
+
diff --git a/ai_daily_report/models.py b/ai_daily_report/models.py
new file mode 100644
index 0000000..756b629
--- /dev/null
+++ b/ai_daily_report/models.py
@@ -0,0 +1,53 @@
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass(frozen=True)
+class SourceConfig:  # Immutable settings describing one news source.
+    name: str  # copied into SourceResult.source by the collector
+    type: str
+    role: str = "supplement"  # copied into SourceResult.role
+    priority: int = 100
+    required: bool = False
+    enabled: bool = True  # a disabled source is reported with status "disabled" and never fetched
+    timeout_seconds: int = 25
+    retries: int = 0
+    min_items: int = 0
+    url: str = ""
+
+
+@dataclass
+class SourceResult:  # Outcome of collecting one source.
+    source: str  # SourceConfig.name of the collected source
+    role: str
+    ok: bool  # the collector sets this to True only for status "ok"
+    status: str  # the collector uses "ok", "empty", "disabled", "timeout" or "error"
+    items: list[dict[str, Any]] = field(default_factory=list)  # raw items returned by the fetcher
+    error: str | None = None  # "ExceptionType: message" when fetching raised; NOTE(review): "X | None" without a __future__ import needs Python 3.10+
+    elapsed_ms: int = 0  # fetch duration in milliseconds
+    retry_count: int = 0
+    fetched_at: str = ""  # UTC timestamp in ISO 8601 form, taken when collection of the source began
+
+
+@dataclass
+class NewsItem:  # One news entry as it moves through the pipeline stages (mutable).
+    id: str
+    source_group: str
+    source_label: str  # shown as the source name in the rendered Markdown
+    source_role: str  # "primary" earns a scoring bonus
+    source_priority: int  # lower values score higher (score base is 200 - source_priority)
+    title_raw: str
+    title_norm: str  # key for exact-title dedupe and title-similarity checks
+    summary_raw: str
+    url: str
+    canonical_url: str  # key for exact-URL dedupe
+    published_at: str | None = None
+    collected_at: str = ""
+    origin_type: str = ""
+    section_hint: str = ""  # source-provided section label, mapped through SECTION_ALIASES
+    language_hint: str = ""
+    title: str | None = None  # preferred over title_raw when set
+    summary: str | None = None  # preferred over summary_raw when set
+    section: str | None = None  # assigned by classify_and_order_items
+    quality_flags: list[str] = field(default_factory=list)  # each flag lowers the item's scores by 10
+    duplicate_sources: list[dict[str, Any]] = field(default_factory=list)  # one entry per duplicate merged into this item
diff --git a/ai_daily_report/normalize.py b/ai_daily_report/normalize.py
new file mode 100644
index 0000000..dda9dd5
--- /dev/null
+++ b/ai_daily_report/normalize.py
@@ -0,0 +1,132 @@
+from __future__ import annotations
+
+import hashlib
+import html
+import re
+import unicodedata
+from collections import Counter
+from datetime import datetime, timezone
+from typing import Any
+from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
+
+from .models import NewsItem, SourceResult
+
+
# Query parameters dropped by canonicalize_url: a parameter is removed when its
# lower-cased name starts with one of the prefixes or equals one of the keys.
TRACKING_QUERY_PREFIXES = ("utm_",)
TRACKING_QUERY_KEYS = {"fbclid", "gclid", "spm", "from", "ref"}
+
+
def clean_text(value: str) -> str:
    """Return ``value`` as plain, single-spaced text.

    HTML entities are decoded first, then anything shaped like ``<...>`` is
    replaced by a space, and finally every run of whitespace collapses to a
    single space with the ends trimmed.  A falsy ``value`` yields ``""``.
    """
    decoded = html.unescape(value) if value else ""
    without_tags = re.sub(r"<[^>]+>", " ", decoded)
    collapsed = re.sub(r"\s+", " ", without_tags)
    return collapsed.strip()
+
+
def canonicalize_url(url: str) -> str:
    """Reduce ``url`` to a canonical form suitable for comparing links.

    The scheme (``https`` when absent) and host are lower-cased, a leading
    ``www.`` is dropped, ``twitter.com`` becomes ``x.com``, tracking query
    parameters are removed, trailing slashes are stripped from any path longer
    than one character, and params and fragment are discarded.  An empty
    ``url`` yields ``""``.
    """
    if not url:
        return ""

    parts = urlparse(url.strip())

    host = (parts.netloc or "").lower()
    if host.startswith("www."):
        host = host[4:]
    if host == "twitter.com":
        host = "x.com"

    def _is_tracking(name: str) -> bool:
        lowered = name.lower()
        return lowered in TRACKING_QUERY_KEYS or lowered.startswith(tuple(TRACKING_QUERY_PREFIXES))

    kept_params = [
        (name, value)
        for name, value in parse_qsl(parts.query, keep_blank_values=True)
        if not _is_tracking(name)
    ]

    path = parts.path or ""
    if len(path) > 1:
        # A bare "/" is left alone; only longer paths lose trailing slashes.
        path = path.rstrip("/")

    scheme = (parts.scheme or "https").lower()
    return urlunparse((scheme, host, path, "", urlencode(kept_params), ""))
+
+
def normalize_title(title: str) -> str:
    """Build a comparison key from ``title``.

    The title is NFKC-normalized and lower-cased, then every character that is
    neither a word character nor in the U+4E00..U+9FFF range is removed.  A
    falsy ``title`` yields ``""``.
    """
    folded = unicodedata.normalize("NFKC", title or "").lower()
    return re.sub(r"[^\w\u4e00-\u9fff]+", "", folded)
+
+
+def _item_id(canonical_url: str, source_group: str, title_norm: str, published_at: str | None) -> str:
+ seed = canonical_url or "|".join([source_group, title_norm, published_at or ""])
+ digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()[:16]
+ return f"item_{digest}"
+
+
def _quality_flags(title: str, summary: str, url: str) -> list[str]:
    """List the quality problems of an item, in a fixed order.

    Possible flags: ``missing_url`` (empty url), ``missing_summary`` (empty
    summary) and ``short_title`` (normalized title shorter than 3 characters).
    """
    checks = (
        ("missing_url", not url),
        ("missing_summary", not summary),
        ("short_title", len(normalize_title(title)) < 3),
    )
    return [flag for flag, failed in checks if failed]
+
+
def normalize_items(
    source_results: list[SourceResult],
    *,
    run_date: str,
    source_priorities: dict[str, int] | None = None,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Turn the raw items of every source result into ``NewsItem`` objects.

    For each raw dict the title and summary are cleaned (``title_raw`` /
    ``summary_raw`` take precedence over ``title`` / ``summary``), the URL is
    canonicalized, quality flags are computed and an id is derived.  When the
    same base id occurs again, later occurrences get a ``_<n>`` suffix.

    Args:
        source_results: Collected results whose ``items`` are normalized.
        run_date: Date of the run; only echoed into the report.
        source_priorities: Priority per source name; sources not listed
            get 100.

    Returns:
        The normalized items and a report with input/output counts and the
        number of times each quality flag was raised.
    """
    priorities = source_priorities or {}
    # One timestamp for the whole batch so all items share ``collected_at``.
    collected_at = datetime.now(timezone.utc).isoformat()

    normalized: list[NewsItem] = []
    flag_totals: Counter[str] = Counter()
    id_uses: Counter[str] = Counter()
    raw_total = 0

    for result in source_results:
        for raw in result.items:
            raw_total += 1

            title = clean_text(str(raw.get("title_raw") or raw.get("title") or ""))
            summary = clean_text(str(raw.get("summary_raw") or raw.get("summary") or ""))
            url = str(raw.get("url") or "").strip()
            canonical_url = canonicalize_url(url)
            title_norm = normalize_title(title)

            flags = _quality_flags(title, summary, canonical_url)
            flag_totals.update(flags)

            source_label = clean_text(str(raw.get("source_label") or result.source))
            published_at = raw.get("published_at")

            base_id = _item_id(canonical_url, result.source, title_norm, published_at)
            id_uses[base_id] += 1
            occurrence = id_uses[base_id]
            # The first holder keeps the plain id; repeats become "<id>_2", "<id>_3", ...
            item_id = base_id if occurrence == 1 else f"{base_id}_{occurrence}"

            normalized.append(
                NewsItem(
                    id=item_id,
                    source_group=result.source,
                    source_label=source_label,
                    source_role=result.role,
                    source_priority=priorities.get(result.source, 100),
                    title_raw=title,
                    title_norm=title_norm,
                    summary_raw=summary,
                    url=url,
                    canonical_url=canonical_url,
                    published_at=published_at,
                    collected_at=collected_at,
                    origin_type=str(raw.get("origin_type") or ""),
                    section_hint=str(raw.get("section_hint") or ""),
                    language_hint=str(raw.get("language_hint") or ""),
                    quality_flags=flags,
                )
            )

    return normalized, {
        "run_date": run_date,
        "input_count": raw_total,
        "output_count": len(normalized),
        "quality_flag_counts": dict(flag_totals),
    }
diff --git a/ai_daily_report/pipeline.py b/ai_daily_report/pipeline.py
new file mode 100644
index 0000000..e2bc8a9
--- /dev/null
+++ b/ai_daily_report/pipeline.py
@@ -0,0 +1,219 @@
+from __future__ import annotations
+
+from typing import Any
+
+from .assemble import assemble_markdown
+from .classify import classify_and_order_items
+from .collect import Fetcher, collect_sources
+from .dedupe import hard_dedup_items
+from .guide import GuideLlmCall, generate_guide
+from .models import SourceConfig
+from .normalize import normalize_items
+from .publish import BlogClient, publish_markdown
+from .rewrite import RewriteLlmCall, rewrite_items
+from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
+
+
def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
    """Build a ``SourceConfig`` from a plain mapping.

    ``name`` and ``type`` must be present (``KeyError`` otherwise).  Missing
    optional keys fall back to the same defaults as the dataclass, and the
    numeric and boolean fields are coerced with ``int()`` / ``bool()``.
    """

    def _as_int(key: str, default: int) -> int:
        return int(value.get(key, default))

    def _as_bool(key: str, default: bool) -> bool:
        return bool(value.get(key, default))

    return SourceConfig(
        name=value["name"],
        type=value["type"],
        role=value.get("role", "supplement"),
        priority=_as_int("priority", 100),
        required=_as_bool("required", False),
        enabled=_as_bool("enabled", True),
        timeout_seconds=_as_int("timeout_seconds", 25),
        retries=_as_int("retries", 0),
        min_items=_as_int("min_items", 0),
        url=value.get("url", ""),
    )
+
+
def run_stage0_to_stage2(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
) -> dict[str, Any]:
    """Run collection (stage 0), normalization (stage 1) and hard dedup (stage 2).

    Args:
        source_configs: Source settings, either ``SourceConfig`` objects or
            plain dicts that are converted first.
        run_date: Date the report is produced for.
        fetcher: Callable handed to ``collect_sources`` to fetch each source.

    Returns:
        A dict with ``source_results``, the deduplicated ``items`` and the
        per-stage ``reports`` (keys ``stage0`` to ``stage2``).
    """
    configs: list[SourceConfig] = []
    for entry in source_configs:
        if not isinstance(entry, SourceConfig):
            entry = _source_config_from_dict(entry)
        configs.append(entry)

    source_results, collect_report = collect_sources(configs, run_date, fetcher=fetcher)
    normalized, normalize_report = normalize_items(
        source_results,
        run_date=run_date,
        source_priorities={config.name: config.priority for config in configs},
    )
    deduped, dedupe_report = hard_dedup_items(normalized)

    return {
        "source_results": source_results,
        "items": deduped,
        "reports": {
            "stage0": collect_report,
            "stage1": normalize_report,
            "stage2": dedupe_report,
        },
    }
+
+
def run_stage0_to_stage4(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
) -> dict[str, Any]:
    """Extend stages 0-2 with semantic dedup (stage 3) and rewriting (stage 4).

    The ``possible_duplicates`` entry of the stage 2 report (empty list when
    absent) is what the semantic dedup step receives as its candidates.

    Returns:
        A dict with ``source_results``, the rewritten ``items`` and
        ``reports`` now also containing ``stage3`` and ``stage4``.
    """
    upstream = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
    duplicate_candidates = upstream["reports"]["stage2"].get("possible_duplicates", [])

    deduped, semantic_report = semantic_dedup_items(
        upstream["items"],
        duplicate_candidates,
        llm_call=semantic_llm_call,
    )
    rewritten, rewrite_report = rewrite_items(deduped, llm_call=rewrite_llm_call)

    return {
        "source_results": upstream["source_results"],
        "items": rewritten,
        "reports": {**upstream["reports"], "stage3": semantic_report, "stage4": rewrite_report},
    }
+
+
def run_stage0_to_stage5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
) -> dict[str, Any]:
    """Extend stages 0-4 with classification and ordering (stage 5).

    Returns:
        A dict with ``source_results``, the classified ``items`` and
        ``reports`` now also containing ``stage5``.
    """
    upstream = run_stage0_to_stage4(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
    )
    classified, classify_report = classify_and_order_items(upstream["items"])

    return {
        "source_results": upstream["source_results"],
        "items": classified,
        "reports": {**upstream["reports"], "stage5": classify_report},
    }
+
+
def run_stage0_to_stage6(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
) -> dict[str, Any]:
    """Extend stages 0-5 with guide generation (stage 6).

    The items are passed through unchanged; the generated guide is added
    under the ``guide`` key.

    Returns:
        A dict with ``source_results``, ``items``, ``guide`` and ``reports``
        now also containing ``stage6``.
    """
    upstream = run_stage0_to_stage5(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
    )
    classified = upstream["items"]
    guide, guide_report = generate_guide(classified, llm_call=guide_llm_call)

    return {
        "source_results": upstream["source_results"],
        "items": classified,
        "guide": guide,
        "reports": {**upstream["reports"], "stage6": guide_report},
    }
+
+
def run_stage0_to_stage7(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
) -> dict[str, Any]:
    """Extend stages 0-6 with Markdown assembly (stage 7).

    Returns:
        A dict with ``source_results``, ``items``, ``guide``, the assembled
        ``markdown`` and ``reports`` now also containing ``stage7``.
    """
    upstream = run_stage0_to_stage6(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
    )
    items = upstream["items"]
    guide = upstream["guide"]
    markdown, assemble_report = assemble_markdown(items, guide)

    return {
        "source_results": upstream["source_results"],
        "items": items,
        "guide": guide,
        "markdown": markdown,
        "reports": {**upstream["reports"], "stage7": assemble_report},
    }
+
+
def run_stage0_to_stage8(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    mode: str,
    base_url: str,
    client: BlogClient | None,
) -> dict[str, Any]:
    """Run the whole pipeline and hand the Markdown to the publish step (stage 8).

    The post slug is ``ai-<run_date>``.  The stage 7 report is forwarded to
    ``publish_markdown`` as its ``markdown_report``.

    Returns:
        Everything stage 7 returns plus ``publish`` (the ``PublishResult``)
        and a ``stage8`` report entry summarizing that result.
    """
    upstream = run_stage0_to_stage7(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
    )
    markdown = upstream["markdown"]

    outcome = publish_markdown(
        title=f"AI日报 · {run_date}",
        markdown=markdown,
        tags=["AI日报", "AI资讯", "人工智能"],
        slug=f"ai-{run_date}",
        base_url=base_url,
        mode=mode,
        markdown_report=upstream["reports"]["stage7"],
        client=client,
    )
    publish_report = {
        "mode": outcome.mode,
        "status": outcome.status,
        "slug": outcome.slug,
        "blog_url": outcome.blog_url,
        "public_ok": outcome.public_ok,
        "error": outcome.error,
    }

    return {
        "source_results": upstream["source_results"],
        "items": upstream["items"],
        "guide": upstream["guide"],
        "markdown": markdown,
        "publish": outcome,
        "reports": {**upstream["reports"], "stage8": publish_report},
    }
diff --git a/ai_daily_report/publish.py b/ai_daily_report/publish.py
new file mode 100644
index 0000000..7cf3ccd
--- /dev/null
+++ b/ai_daily_report/publish.py
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Protocol
+
+
@dataclass
class PublishResult:
    """Outcome of the publish step."""

    # Mode the step ran in and the status string describing the outcome.
    mode: str
    status: str

    # Post slug and the URL built from it.
    slug: str
    blog_url: str

    # Publication flag and an error description when something went wrong.
    public_ok: bool = False
    error: str | None = None
+
+
class BlogClient(Protocol):
    """Structural interface the publish step expects from a blog API client."""

    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Create a post from ``payload`` and return the API response."""

    def publish_post(self, slug: str) -> None:
        """Publish the post identified by ``slug``."""
+
+
def dry_run_publish(slug: str, base_url: str) -> PublishResult:
    """Return the result of a dry run; no client is contacted.

    The result has mode ``dry-run``, status ``ok``, ``public_ok=True`` and a
    ``blog_url`` of ``<base_url>/posts/<slug>``.
    """
    site_root = base_url.rstrip("/")
    return PublishResult(
        mode="dry-run",
        status="ok",
        slug=slug,
        blog_url=f"{site_root}/posts/{slug}",
        public_ok=True,
    )
+
+
def publish_markdown(
    *,
    title: str,
    markdown: str,
    tags: list[str],
    slug: str,
    base_url: str,
    mode: str,
    markdown_report: dict[str, Any],
    client: BlogClient | None,
) -> PublishResult:
    """Create (and optionally publish) the blog post for the assembled Markdown.

    Decision order:
      1. Any ``blocking_errors`` in ``markdown_report`` -> status ``blocked``.
      2. ``mode == "dry-run"`` -> delegate to ``dry_run_publish``.
      3. No ``client`` -> status ``failed`` with ``missing_blog_client``.
      4. Otherwise create the post and, when ``mode == "publish"``, publish it.

    Client exceptions are never propagated; they are reported as a ``failed``
    result.  If the post had already been created when the failure happened,
    the result carries the slug the API actually assigned, so the caller can
    find the existing post instead of the slug that was merely requested.

    Args:
        title: Post title.
        markdown: Post body.
        tags: Tags attached to the post.
        slug: Requested slug; the API response may override it.
        base_url: Site root used to build ``<base_url>/posts/<slug>``.
        mode: ``dry-run``, ``draft`` or ``publish``.
        markdown_report: Report of the Markdown assembly stage.
        client: Blog API client, or ``None`` when unavailable.

    Returns:
        A ``PublishResult`` describing what happened.
    """
    posts_root = f"{base_url.rstrip('/')}/posts"
    blog_url = f"{posts_root}/{slug}"

    blocking_errors = markdown_report.get("blocking_errors", []) or []
    if blocking_errors:
        return PublishResult(
            mode=mode,
            status="blocked",
            slug=slug,
            blog_url=blog_url,
            public_ok=False,
            # str() keeps structured error entries from crashing the join.
            error=";".join(str(error) for error in blocking_errors),
        )
    if mode == "dry-run":
        return dry_run_publish(slug, base_url)
    if client is None:
        return PublishResult(
            mode=mode,
            status="failed",
            slug=slug,
            blog_url=blog_url,
            public_ok=False,
            error="missing_blog_client",
        )

    payload = {"title": title, "content": markdown, "tags": tags, "slug": slug}
    # Tracks the slug of the post once it exists, so a later failure still
    # reports the right post.
    created_slug = slug
    try:
        create_resp = client.create_post(payload)
        # An empty or non-dict response must not turn a successful create into
        # a reported failure; fall back to the requested slug instead.
        if isinstance(create_resp, dict):
            created_slug = create_resp.get("slug") or slug
        if mode == "publish":
            client.publish_post(created_slug)
    except Exception as exc:
        return PublishResult(
            mode=mode,
            status="failed",
            slug=created_slug,
            blog_url=f"{posts_root}/{created_slug}",
            public_ok=False,
            error=f"{type(exc).__name__}: {exc}",
        )

    return PublishResult(
        mode=mode,
        status="ok",
        slug=created_slug,
        blog_url=f"{posts_root}/{created_slug}",
        public_ok=mode == "publish",
    )
diff --git a/ai_daily_report/rewrite.py b/ai_daily_report/rewrite.py
new file mode 100644
index 0000000..6bc9063
--- /dev/null
+++ b/ai_daily_report/rewrite.py
@@ -0,0 +1,103 @@
+from __future__ import annotations
+
+import json
+from typing import Any, Callable
+
+from .llm import parse_json_object
+from .models import NewsItem
+
+
# Callable used by the rewrite stage: receives the prompt string and returns
# the model's reply as a string.
RewriteLlmCall = Callable[[str], str]
+
+
+def _chunks(items: list[NewsItem], size: int) -> list[list[NewsItem]]:
+ return [items[index : index + size] for index in range(0, len(items), size)]
+
+
+def _build_prompt(batch: list[NewsItem]) -> str:
+ payload = {
+ "task": (
+ "Rewrite AI news titles and summaries into concise Chinese. Preserve brand/model/API names "
+ "such as GPT-5, Codex, Gemini, Claude, API, MCP. Do not add facts."
+ ),
+ "items": [
+ {
+ "id": item.id,
+ "title_raw": item.title_raw,
+ "summary_raw": item.summary_raw,
+ "source": item.source_label,
+ "language_hint": item.language_hint,
+ }
+ for item in batch
+ ],
+ "output_schema": {
+ "rewrites": [
+ {
+ "id": "item id",
+ "title": "display title",
+ "summary": "display summary",
+ "flags": [],
+ }
+ ]
+ },
+ }
+ return json.dumps(payload, ensure_ascii=False)
+
+
+def _fallback(item: NewsItem) -> None:
+ item.title = item.title_raw
+ item.summary = item.summary_raw or "该条目暂无摘要。"
+
+
def _apply_rewrite_batch(batch: list[NewsItem], llm_call: RewriteLlmCall) -> int:
    """Ask the model to rewrite ``batch`` and apply the reply to the items.

    A reply entry is applied only when its id belongs to the batch and both
    its title and summary are non-empty after stripping.  Items are updated
    in place as entries are processed, so some may already have been changed
    when the function raises.

    Returns:
        Number of distinct items that received a rewrite.

    Raises:
        ValueError: If ``rewrites`` is not a list, or if any item of the
            batch did not receive a rewrite.
    """
    reply = parse_json_object(llm_call(_build_prompt(batch)))
    rewrites = reply.get("rewrites", [])
    if not isinstance(rewrites, list):
        raise ValueError("rewrites is not a list")

    targets = {item.id: item for item in batch}
    applied: set[str] = set()
    for entry in rewrites:
        item_id = entry.get("id")
        new_title = str(entry.get("title") or "").strip()
        new_summary = str(entry.get("summary") or "").strip()
        if item_id not in targets or not new_title or not new_summary:
            continue
        target = targets[item_id]
        target.title = new_title
        target.summary = new_summary
        applied.add(item_id)

    # Report the first item, in batch order, that the reply did not cover.
    first_missing = next((item for item in batch if item.id not in applied), None)
    if first_missing is not None:
        raise ValueError(f"missing_rewrite_for_item: {first_missing.id}")
    return len(applied)
+
+
def rewrite_items(
    items: list[NewsItem],
    *,
    llm_call: RewriteLlmCall,
    batch_size: int = 10,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Rewrite titles and summaries of ``items`` in batches, in place.

    Each batch is first sent as a whole.  If that fails for any reason, every
    item of the batch is retried on its own, and items that still fail get
    their display fields from the raw text via ``_fallback``.  No exception
    from the model call is propagated; failures are collected in the report.

    Args:
        items: Items to rewrite; they are mutated and returned.
        llm_call: Callable that sends a prompt and returns the reply text.
        batch_size: Items per request; values below 1 are treated as 1.

    Returns:
        The same ``items`` list and a report with input, rewritten, fallback
        and batch counts plus the collected error strings.
    """
    batches = _chunks(items, max(1, batch_size))
    rewritten = 0
    fallbacks = 0
    errors: list[str] = []

    for batch in batches:
        try:
            rewritten += _apply_rewrite_batch(batch, llm_call)
            continue
        except Exception as exc:
            errors.append(f"batch:{type(exc).__name__}: {exc}")

        # The batch as a whole failed: retry one item per request.
        for item in batch:
            try:
                rewritten += _apply_rewrite_batch([item], llm_call)
            except Exception as item_exc:
                errors.append(f"item:{item.id}:{type(item_exc).__name__}: {item_exc}")
                _fallback(item)
                fallbacks += 1

    return items, {
        "input_count": len(items),
        "rewritten_count": rewritten,
        "fallback_count": fallbacks,
        "batch_count": len(batches),
        "errors": errors,
    }
diff --git a/ai_daily_report/runner.py b/ai_daily_report/runner.py
new file mode 100644
index 0000000..295316c
--- /dev/null
+++ b/ai_daily_report/runner.py
@@ -0,0 +1,156 @@
+from __future__ import annotations
+
+import json
+from dataclasses import asdict, is_dataclass
+from pathlib import Path
+from typing import Any
+
+from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text
+from .config import load_source_configs
+from .env import load_env, resolve_blog_token, resolve_llm_config
+from .models import SourceConfig
+from .pipeline import run_stage0_to_stage8
+from .sources.registry import get_source_fetcher
+
+
+def _json_default(value: Any):
+ if is_dataclass(value):
+ return asdict(value)
+ raise TypeError(f"Object is not JSON serializable: {type(value).__name__}")
+
+
def _mock_source_configs() -> list[SourceConfig]:
    """Return the single source configuration used when ``source_mode`` is ``mock``."""
    mock_source = SourceConfig(name="Mock AI HOT", type="mock", role="primary", priority=10)
    return [mock_source]
+
+
+def _mock_fetcher(config: SourceConfig, run_date: str) -> list[dict[str, Any]]:
+ return [
+ {
+ "title_raw": "GPT-5 API 发布",
+ "summary_raw": "OpenAI 发布 GPT-5 API,用于本地 mock 测试。",
+ "url": "https://example.com/gpt5",
+ "source_label": "OpenAI:Blog",
+ "section_hint": "模型发布/更新",
+ "origin_type": "mock",
+ "language_hint": "zh",
+ }
+ ]
+
+
+def _mock_semantic_llm(prompt: str) -> str:
+ return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []}, ensure_ascii=False)
+
+
+def _mock_rewrite_llm(prompt: str) -> str:
+ payload = json.loads(prompt)
+ return json.dumps(
+ {
+ "rewrites": [
+ {
+ "id": item["id"],
+ "title": item["title_raw"],
+ "summary": item["summary_raw"],
+ "flags": [],
+ }
+ for item in payload["items"]
+ ]
+ },
+ ensure_ascii=False,
+ )
+
+
+def _mock_guide_llm(prompt: str) -> str:
+ payload = json.loads(prompt)
+ item_ids = [item["id"] for item in payload["items"][:3]]
+ return json.dumps(
+ {
+ "theme": "本地 mock 模式已生成 AI 日报,用于验证流水线。",
+ "threads": [
+ {
+ "title": "本地链路验证",
+ "text": "采集、改写、分类、导览、Markdown 和发布报告都已通过 mock 数据串联。",
+ "item_ids": item_ids,
+ "kind": "thread",
+ }
+ ],
+ },
+ ensure_ascii=False,
+ )
+
+
def run_daily_report(
    *,
    run_date: str,
    mode: str,
    source_mode: str,
    llm_mode: str,
    out_dir: Path,
    base_url: str,
    sources_path: Path | None = None,
    fetch_text=None,
    env: dict[str, str] | None = None,
    llm_client_factory=OpenAICompatibleClient,
    blog_client_factory=BlogApiClient,
) -> dict[str, Any]:
    """Wire up sources, LLM calls and the blog client, run all stages, write outputs.

    Two files are written to ``<out_dir>/<run_date>/``: ``blog_markdown.md``
    and ``run_report.json``.

    Args:
        run_date: Date the report is produced for.
        mode: Publish mode; a blog client is created only for ``draft`` and
            ``publish``.
        source_mode: ``mock`` for built-in sample data or ``live`` to load the
            source configuration from ``sources_path``.
        llm_mode: ``mock`` for canned replies or ``live`` for a real client.
        out_dir: Directory under which the per-date run directory is created.
        base_url: Blog base URL.
        sources_path: Source configuration file; defaults to
            ``config/sources.json`` in live mode.
        fetch_text: Text fetcher handed to the source adapters; defaults to
            the client module's ``fetch_text``.
        env: Environment mapping; loaded with ``load_env`` when ``None``.
        llm_client_factory: Factory called with the resolved LLM settings.
        blog_client_factory: Factory called with ``base_url`` and ``token``.

    Returns:
        A dict with ``run_dir``, ``markdown``, ``reports`` and ``publish``.

    Raises:
        ValueError: For an unknown ``source_mode`` or ``llm_mode``, or when a
            blog token is required but not configured.
    """
    fetch_text = fetch_text or default_fetch_text
    if env is None:
        env = load_env()

    # --- sources -----------------------------------------------------------
    if source_mode == "mock":
        source_configs = _mock_source_configs()
        fetcher = _mock_fetcher
    elif source_mode == "live":
        if sources_path is None:
            sources_path = Path("config") / "sources.json"
        source_configs = load_source_configs(sources_path)

        def _live_fetcher(config: SourceConfig, current_date: str) -> list[dict[str, Any]]:
            # Dispatch to the adapter registered for this source type.
            adapter = get_source_fetcher(config.type)
            return adapter(config, current_date, fetch_text)

        fetcher = _live_fetcher
    else:
        raise ValueError("source_mode must be 'mock' or 'live'")

    # --- LLM calls ---------------------------------------------------------
    if llm_mode == "mock":
        semantic_llm_call = _mock_semantic_llm
        rewrite_llm_call = _mock_rewrite_llm
        guide_llm_call = _mock_guide_llm
    elif llm_mode == "live":
        llm_client = llm_client_factory(**resolve_llm_config(env))
        semantic_llm_call = llm_client.chat
        rewrite_llm_call = llm_client.chat
        guide_llm_call = llm_client.chat
    else:
        raise ValueError("llm_mode must be 'mock' or 'live'")

    # --- blog client -------------------------------------------------------
    blog_client = None
    if mode in ("draft", "publish"):
        token = resolve_blog_token(env)
        if not token:
            raise ValueError("missing_blog_token: set BLOG_SERVICE_TOKEN or EPHRON_SERVICE_TOKEN")
        blog_client = blog_client_factory(base_url=base_url, token=token)

    outcome = run_stage0_to_stage8(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
        mode=mode,
        base_url=base_url,
        client=blog_client,
    )
    markdown = outcome["markdown"]
    reports = outcome["reports"]

    # --- outputs -----------------------------------------------------------
    run_dir = out_dir / run_date
    run_dir.mkdir(parents=True, exist_ok=True)
    (run_dir / "blog_markdown.md").write_text(markdown, encoding="utf-8")
    report_text = json.dumps(reports, ensure_ascii=False, indent=2, default=_json_default)
    (run_dir / "run_report.json").write_text(report_text, encoding="utf-8")

    return {
        "run_dir": str(run_dir),
        "markdown": markdown,
        "reports": reports,
        "publish": outcome["publish"],
    }
diff --git a/ai_daily_report/semantic_dedupe.py b/ai_daily_report/semantic_dedupe.py
new file mode 100644
index 0000000..815d298
--- /dev/null
+++ b/ai_daily_report/semantic_dedupe.py
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+import json
+from typing import Any, Callable
+
+from .llm import parse_json_object
+from .models import NewsItem
+
+
# Callable used by the semantic dedup stage: receives the prompt string and
# returns the model's reply as a string.
SemanticLlmCall = Callable[[str], str]
+
+
+def _build_prompt(items: list[NewsItem], candidates: list[dict[str, Any]]) -> str:
+ item_payload = [
+ {
+ "id": item.id,
+ "title": item.title or item.title_raw,
+ "summary": item.summary or item.summary_raw,
+ "source": item.source_label,
+ "section_hint": item.section_hint,
+ }
+ for item in items
+ ]
+ prompt = {
+ "task": "Identify only high-confidence semantic duplicates. Do not curate or remove by importance.",
+ "items": item_payload,
+ "candidates": candidates,
+ "output_schema": {
+ "duplicate_groups": [
+ {
+ "keep_id": "item id",
+ "remove_ids": ["item id"],
+ "confidence": "high|medium|low",
+ "reason": "same concrete event reason",
+ }
+ ],
+ "not_duplicates": [],
+ "uncertain": [],
+ },
+ }
+ return json.dumps(prompt, ensure_ascii=False)
+
+
+def _score(item: NewsItem) -> int:
+ score = max(0, 200 - item.source_priority)
+ if item.source_role == "primary":
+ score += 10
+ if item.summary_raw:
+ score += min(40, len(item.summary_raw))
+ if item.canonical_url:
+ score += 20
+ score -= len(item.quality_flags) * 10
+ return score
+
+
def _choose_keep(group_items: list[NewsItem], suggested_keep_id: str) -> NewsItem:
    """Pick the item of a duplicate group that survives.

    The suggested item wins as long as its score is at most 10 points below
    the best score in the group; otherwise, or when the suggested id is not
    in the group, the best-scoring item is returned.
    """
    best = max(group_items, key=_score)
    suggested = next((item for item in group_items if item.id == suggested_keep_id), None)
    if suggested is not None and _score(suggested) >= _score(best) - 10:
        return suggested
    return best
+
+
def _semantic_report(
    items: list[NewsItem],
    candidates: list[dict[str, Any]],
    *,
    removed_count: int = 0,
    duplicate_groups: list[dict[str, Any]] | None = None,
    uncertain: list[Any] | None = None,
    errors: list[str] | None = None,
    skipped_for_deletion_ratio: bool = False,
) -> dict[str, Any]:
    """Build the stage report; every exit path of the stage shares this shape."""
    return {
        "input_count": len(items),
        "candidate_group_count": len(candidates),
        "removed_count": removed_count,
        "duplicate_groups": duplicate_groups or [],
        "uncertain": uncertain or [],
        "errors": errors or [],
        "skipped_for_deletion_ratio": skipped_for_deletion_ratio,
    }


def semantic_dedup_items(
    items: list[NewsItem],
    candidates: list[dict[str, Any]],
    *,
    llm_call: SemanticLlmCall,
    max_deletion_ratio: float = 0.5,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Remove semantic duplicates confirmed by the model with high confidence.

    The stage is best-effort: a failing model call, an unparsable reply or a
    malformed group never raises; the problem is recorded in ``errors`` and
    the affected group (or the whole stage) is skipped.

    A group proposed by the model is accepted only when
      * it is a JSON object whose ``confidence`` is ``"high"``,
      * every id in it is a string naming a known item,
      * all of its ids lie inside one of the ``candidates`` id sets, and
      * it does not contradict a group accepted earlier (see below).

    Conflict handling keeps at least one survivor per duplicate cluster: an
    item already scheduled for removal cannot appear in a later group, and an
    item already chosen as a survivor stays the survivor of any later group
    it appears in.

    If the accepted groups would remove more than ``max_deletion_ratio`` of
    the items, nothing is removed and ``skipped_for_deletion_ratio`` is set.

    Args:
        items: Items to deduplicate.  Survivors get an entry appended to
            ``duplicate_sources`` for each item merged into them.
        candidates: Candidate groups; each should carry an ``item_ids`` list.
        llm_call: Callable that sends the prompt and returns the reply text.
        max_deletion_ratio: Largest share of items the stage may remove.

    Returns:
        The remaining items and the stage report.
    """
    if not items or not candidates:
        return items, _semantic_report(items, candidates)

    try:
        obj = parse_json_object(llm_call(_build_prompt(items, candidates)))
    except Exception as exc:
        return items, _semantic_report(items, candidates, errors=[f"{type(exc).__name__}: {exc}"])

    errors: list[str] = []
    uncertain = obj.get("uncertain", []) or []
    raw_groups = obj.get("duplicate_groups", []) or []
    if not isinstance(raw_groups, list):
        errors.append(f"duplicate_groups_not_a_list: {raw_groups!r}")
        raw_groups = []

    by_id = {item.id: item for item in items}
    candidate_sets = {
        frozenset(item_id for item_id in candidate.get("item_ids", []) if isinstance(item_id, str))
        for candidate in candidates
    }
    kept_ids: set[str] = set()
    candidate_removals: set[str] = set()
    valid_groups: list[dict[str, Any]] = []

    for group in raw_groups:
        if not isinstance(group, dict):
            errors.append(f"invalid_group: {group!r}")
            continue
        if group.get("confidence") != "high":
            continue

        remove_ids = group.get("remove_ids") or []
        if not isinstance(remove_ids, list):
            errors.append(f"invalid_ids_in_group: {group}")
            continue
        raw_ids = [group.get("keep_id"), *remove_ids]
        if any(not isinstance(item_id, str) or item_id not in by_id for item_id in raw_ids):
            errors.append(f"invalid_ids_in_group: {group}")
            continue
        # Drop repeated ids (order preserved) so no item is recorded twice.
        ids = list(dict.fromkeys(raw_ids))

        group_set = frozenset(ids)
        if not any(group_set.issubset(candidate_set) for candidate_set in candidate_sets):
            errors.append(f"group_outside_candidates: {group}")
            continue

        earlier_keeps = [item_id for item_id in ids if item_id in kept_ids]
        if candidate_removals.intersection(ids) or len(earlier_keeps) > 1:
            # Accepting this group would remove an earlier survivor or reuse
            # an item that is already being removed.
            errors.append(f"group_conflicts_with_earlier_group: {group}")
            continue

        group_items = [by_id[item_id] for item_id in ids]
        if earlier_keeps:
            keep = by_id[earlier_keeps[0]]
        else:
            keep = _choose_keep(group_items, str(group.get("keep_id")))
        remove_items = [item for item in group_items if item is not keep]

        kept_ids.add(keep.id)
        candidate_removals.update(item.id for item in remove_items)
        valid_groups.append(
            {
                "keep_id": keep.id,
                "remove_ids": [item.id for item in remove_items],
                "confidence": "high",
                "reason": str(group.get("reason") or "semantic_duplicate"),
            }
        )

    if len(candidate_removals) / len(items) > max_deletion_ratio:
        # Safety valve: a reply that wants to delete this much is not trusted.
        return items, _semantic_report(
            items,
            candidates,
            duplicate_groups=valid_groups,
            uncertain=uncertain,
            errors=errors,
            skipped_for_deletion_ratio=True,
        )

    removed_ids: set[str] = set()
    for group in valid_groups:
        keep = by_id[group["keep_id"]]
        for remove_id in group["remove_ids"]:
            removed = by_id[remove_id]
            keep.duplicate_sources.append(
                {
                    "id": removed.id,
                    "source_group": removed.source_group,
                    "source_label": removed.source_label,
                    "url": removed.url,
                    "reason": group["reason"],
                }
            )
            removed_ids.add(remove_id)

    deduped = [item for item in items if item.id not in removed_ids]
    return deduped, _semantic_report(
        items,
        candidates,
        removed_count=len(removed_ids),
        duplicate_groups=valid_groups,
        uncertain=uncertain,
        errors=errors,
    )
diff --git a/ai_daily_report/sources/__init__.py b/ai_daily_report/sources/__init__.py
new file mode 100644
index 0000000..54ac9e1
--- /dev/null
+++ b/ai_daily_report/sources/__init__.py
@@ -0,0 +1,2 @@
+"""Source adapters for the AI daily report pipeline."""
+
diff --git a/ai_daily_report/sources/aihot.py b/ai_daily_report/sources/aihot.py
new file mode 100644
index 0000000..9c13d55
--- /dev/null
+++ b/ai_daily_report/sources/aihot.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import json
+from typing import Any, Callable
+
+from ai_daily_report.models import SourceConfig
+
+
# Text fetcher injected into the adapter: called with a URL and a timeout in
# seconds, returns the response body as a string.
FetchText = Callable[[str, int], str]
+
+
def fetch_aihot(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Fetch the AI HOT daily digest for ``run_date`` and flatten it to raw items.

    The JSON document is requested with ``config.timeout_seconds`` as the
    timeout.  Every entry of every section becomes one raw item; the
    document-level ``generatedAt`` value is used as ``published_at`` for all
    of them and the section ``label`` becomes the ``section_hint``.  Missing
    or ``null`` ``sections`` / ``items`` are treated as empty.
    """
    endpoint = f"https://aihot.virxact.com/api/public/daily/{run_date}"
    digest = json.loads(fetch_text(endpoint, config.timeout_seconds))
    generated_at = digest.get("generatedAt")

    return [
        {
            "source_group": config.name,
            "source_label": entry.get("sourceName") or config.name,
            "title_raw": entry.get("title") or "",
            "summary_raw": entry.get("summary") or "",
            "url": entry.get("sourceUrl") or "",
            "published_at": generated_at,
            "origin_type": "aihot_json",
            "section_hint": section.get("label") or "",
            "language_hint": "zh",
        }
        for section in digest.get("sections", []) or []
        for entry in section.get("items", []) or []
    ]
+
diff --git a/ai_daily_report/sources/juya.py b/ai_daily_report/sources/juya.py
new file mode 100644
index 0000000..533fbbf
--- /dev/null
+++ b/ai_daily_report/sources/juya.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import re
+import xml.etree.ElementTree as ET
+from typing import Any, Callable
+
+from ai_daily_report.models import SourceConfig
+from ai_daily_report.normalize import clean_text
+from ai_daily_report.sources.labels import source_label_from_url
+
+
+FetchText = Callable[[str, int], str]
+
+
+def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]:
+ root = ET.fromstring(xml_text)
+ channel = root.find("channel")
+ raw_items = channel.findall("item") if channel is not None else []
+ article_html = ""
+ for raw in raw_items:
+ if (raw.findtext("title") or "").strip() != run_date:
+ continue
+ content_el = raw.find("{http://purl.org/rss/1.0/modules/content/}encoded")
+ article_html = content_el.text if content_el is not None and content_el.text else ""
+ break
+ if not article_html:
+ return []
+
+ block_pattern = re.compile(
+ r'(?P.*?)(?=
\s*
|||
', '\n', body_text, flags=re.I)
- body_text = re.sub(r']*>', '', body_text, flags=re.I)
- body_text = re.sub(r']+>.*?', ' ', body_text, flags=re.S | re.I)
- body_text = re.sub(r'
]*>', ' ', body_text, flags=re.I)
- body_text = re.sub(r'<[^>]+>', ' ', body_text)
- lines = [clean_text(x) for x in body_text.split('\n') if clean_text(x)]
- summary_lines = []
- for line in lines:
- if line.startswith('相关链接'):
- break
- if line == title:
- continue
- summary_lines.append(line)
- summary = ' '.join(summary_lines[:4]).strip()
- if not title:
- continue
- results.append({
- 'source_group': '橘鸦AI早报',
- 'source_label': source_name_from_url(url, '橘鸦AI早报') if url and 'imjuya.github.io/juya-ai-daily' not in url else '橘鸦AI早报',
- 'title_raw': title,
- 'summary_raw': summary,
- 'url': url,
- 'published_at': pub.isoformat() if pub else None,
- 'origin_type': 'juya_issue',
- 'section_hint': '',
- 'language_hint': 'zh',
- })
- return results
-
-
-# ─── LLM infrastructure (unchanged) ─────────────────────────────────────────
-
-def load_env():
- env = {}
- env_path = Path.home() / '.hermes' / '.env'
- if env_path.exists():
- text = env_path.read_text(errors='ignore')
- for line in text.splitlines():
- if '=' in line and not line.strip().startswith('#'):
- k, v = line.split('=', 1)
- env[k.strip()] = v.strip()
- env.update({k: v for k, v in os.environ.items() if v})
+def load_env() -> dict[str, str]:
+    """Layer env sources, later wins: PROJECT_ENV_PATH file, ~/.hermes/.env, then non-empty os.environ entries."""
+    env: dict[str, str] = dict(read_env_file(PROJECT_ENV_PATH))
+    env.update(read_env_file(Path.home() / ".hermes" / ".env"))
+    env.update({name: val for name, val in os.environ.items() if val})
     return env
-def resolve_llm_config(env: dict):
- """Read Hermes config to get the active provider's API key, base_url, and model.
+def is_dry_run(env: dict[str, str]) -> bool:  # True when AI_DAILY_DRY_RUN is "1", "true" or "yes" after trim + lowercasing
+ return (env.get("AI_DAILY_DRY_RUN") or "").strip().lower() in {"1", "true", "yes"}  # missing or empty value -> False
- Priority:
- 1) Explicit environment overrides for this pipeline (SUB2API / LLM_* / XIAOMI_* / XIAOMI_MIMO_*)
- 2) Hermes model config (config.yaml)
- 3) auth.json credential pool
- 4) Legacy env fallbacks
- """
- import yaml
- hermes_dir = Path.home() / '.hermes'
+def requires_blog_token(env: dict[str, str]) -> bool:  # True exactly when is_dry_run(env) is False
+ return not is_dry_run(env)
- def first_env(*names: str) -> str:
- for name in names:
- val = (env.get(name) or '').strip()
- if val:
- return val
- return ''
- # Allow this script to be pinned to the current Hermes model config.
- cfg_path = hermes_dir / 'config.yaml'
- cfg = {}
- if cfg_path.exists():
- with open(cfg_path) as f:
- cfg = yaml.safe_load(f) or {}
+def main() -> None:
+ from ai_daily_report.runner import run_daily_report
- model_cfg = cfg.get('model', {}) or {}
- provider = (model_cfg.get('provider') or '').strip()
- base_url = (model_cfg.get('base_url') or '').rstrip('/')
- model_name = (model_cfg.get('default') or '').strip()
-
- # 1) Explicit overrides for this pipeline take precedence, but keep endpoint/key/model
- # from the same provider family. Mixing SUB2API_API_KEY with XIAOMI_BASE_URL causes
- # 401 after switching Hermes to a Sub2API model.
- explicit_api_key = first_env('LLM_API_KEY')
- explicit_base_url = first_env('LLM_BASE_URL')
- explicit_model = first_env('LLM_MODEL')
-
- if not explicit_api_key:
- if provider == 'sub2api' or first_env('SUB2API_API_KEY', 'SUB2API_BASE_URL', 'SUB2API_MODEL'):
- explicit_api_key = first_env('SUB2API_API_KEY')
- explicit_base_url = first_env('SUB2API_BASE_URL') or base_url
- explicit_model = first_env('SUB2API_MODEL') or model_name
- elif first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL', 'XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL'):
- explicit_api_key = first_env('XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY')
- explicit_base_url = first_env('XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL')
- explicit_model = first_env('XIAOMI_MODEL', 'XIAOMI_MIMO_MODEL')
-
- if explicit_base_url:
- base_url = explicit_base_url.rstrip('/')
- if explicit_model:
- model_name = explicit_model
-
- provider_def = (cfg.get('providers', {}) or {}).get(provider, {}) or {}
- if not base_url and provider_def.get('base_url'):
- base_url = str(provider_def.get('base_url')).rstrip('/')
- if not explicit_api_key and provider_def.get('key_env'):
- explicit_api_key = first_env(str(provider_def.get('key_env')))
-
- # Fast fallback chain: if the active provider has no credentials, use a known-good
- # provider/model from auth.json so the daily cron keeps publishing.
- fallback_provider = first_env('LLM_FALLBACK_PROVIDER', 'XIAOMI_FALLBACK_PROVIDER') or 'openrouter'
-
- api_key = explicit_api_key
- auth_path = hermes_dir / 'auth.json'
- if not api_key and auth_path.exists():
- with open(auth_path) as f:
- auth = json.load(f)
- pool = auth.get('credential_pool', {}) or {}
- provider_keys = []
- if provider:
- provider_keys.extend([provider, provider.replace('-', '_')])
- # Known aliases for this environment.
- provider_keys.extend(['sub2api', 'xiaomi', 'xiaomi_mimo', 'sensenova'])
- for pkey in provider_keys:
- creds = pool.get(pkey, [])
- if creds:
- cred = creds[0]
- source = cred.get('source', '')
- if source.startswith('env:'):
- env_var = source[4:]
- api_key = env.get(env_var, '') or api_key
- if not api_key:
- api_key = cred.get('access_token', '') or api_key
- if not base_url:
- base_url = (cred.get('base_url') or '').rstrip('/')
- if not model_name:
- model_name = cred.get('model', '') or model_name
- break
-
- # 3) Legacy env fallbacks.
- if not api_key:
- api_key = first_env('LLM_API_KEY', 'XIAOMI_API_KEY', 'XIAOMI_MIMO_API_KEY', 'OPENROUTER_API_KEY')
- if not base_url:
- base_url = first_env('LLM_BASE_URL', 'XIAOMI_BASE_URL', 'XIAOMI_MIMO_BASE_URL', 'OPENROUTER_BASE_URL').rstrip('/')
- if not model_name:
- model_name = first_env('LLM_MODEL') or 'mimo-v2.5-pro'
-
- if not api_key and fallback_provider and auth_path.exists():
- with open(auth_path) as f:
- auth = json.load(f)
- pool = auth.get('credential_pool', {}) or {}
- for pkey in [fallback_provider, fallback_provider.replace('-', '_')]:
- creds = pool.get(pkey, [])
- if creds:
- cred = creds[0]
- source = cred.get('source', '')
- if source.startswith('env:'):
- env_var = source[4:]
- api_key = env.get(env_var, '') or api_key
- if not api_key:
- api_key = cred.get('access_token', '') or api_key
- if not base_url:
- base_url = (cred.get('base_url') or '').rstrip('/')
- if not model_name:
- model_name = cred.get('model', '') or model_name
- provider = fallback_provider
- break
-
- if not api_key:
- raise RuntimeError(
- f'No API key found for provider "{provider}" or fallback "{fallback_provider}". '
- 'Set SUB2API_API_KEY / XIAOMI_API_KEY / LLM_API_KEY or fix ~/.hermes/auth.json'
- )
- if not base_url:
- raise RuntimeError(
- f'No base_url found for provider "{provider}" or fallback "{fallback_provider}". '
- 'Set SUB2API_BASE_URL / XIAOMI_BASE_URL / LLM_BASE_URL or fix ~/.hermes/auth.json'
- )
-
- return api_key, base_url, model_name
-
-
-def _try_llm_request(base_url: str, api_key: str, model: str, prompt_text: str, auth_mode: str, api_key_header: str = 'Authorization'):
- payload = json.dumps({
- 'model': model,
- 'messages': [{'role': 'user', 'content': prompt_text}],
- 'temperature': 0.2,
- 'max_tokens': 8000,
- }, ensure_ascii=False).encode('utf-8')
- headers = {'Content-Type': 'application/json'}
- if api_key_header == 'Authorization':
- headers[api_key_header] = f'Bearer {api_key}' if auth_mode == 'bearer' else api_key
- else:
- headers[api_key_header] = api_key
- req = urllib.request.Request(f'{base_url}/chat/completions', data=payload, headers=headers)
- with urllib.request.urlopen(req, timeout=600) as r:
- resp = json.loads(r.read().decode('utf-8'))
- return resp['choices'][0]['message']['content'].strip()
-
-
-def llm_call(prompt_text: str, env: dict) -> str:
- api_key, base_url, model = resolve_llm_config(env)
-
- # Use a single, explicit path so cron behavior is easy to debug.
- # The earlier auth-matrix/fallback logic was making failures harder to reason about.
- payload = json.dumps({
- 'model': model,
- 'messages': [{'role': 'user', 'content': prompt_text}],
- 'temperature': 0.2,
- 'max_tokens': 8000,
- }, ensure_ascii=False).encode('utf-8')
-
- req = urllib.request.Request(
- f'{base_url}/chat/completions',
- data=payload,
- headers={'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'},
- )
- print(f'llm_call request: base_url={base_url}; model={model}', file=sys.stderr)
- try:
- with urllib.request.urlopen(req, timeout=600) as r:
- resp = json.loads(r.read().decode('utf-8'))
- return resp['choices'][0]['message']['content'].strip()
- except urllib.error.HTTPError as e:
- body = ''
- try:
- body = e.read().decode('utf-8', 'ignore')
- except Exception:
- pass
- print(f'llm_call failed: HTTP {e.code} {e.reason}; base_url={base_url}; model={model}; body={body[:500]}', file=sys.stderr)
- raise
-
-
-def _parse_json_from_llm(text: str):
- """Strip markdown code blocks and extract a JSON object from LLM output."""
- text = re.sub(r'^```(?:json)?\s*\n?', '', text)
- text = re.sub(r'\n?```\s*$', '', text)
- text = text.strip()
- m = re.search(r'\{.*\}\s*$', text, re.S)
- if not m:
- raise ValueError('LLM 输出中未找到 JSON 对象')
- raw_json = m.group(0)
- raw_json = re.sub(r',\s*([}\]])', r'\1', raw_json)
- return json.loads(raw_json)
-
-
-def _normalize_title(title: str) -> str:
- """Normalize a title for dedup comparison: strip non-alphanumeric, lowercase."""
- return re.sub(r'[^\w\u4e00-\u9fff]+', '', (title or '').lower())
-
-
-# ─── Stage 0: Script dedup (no LLM) ────────────────────────────────────────
-
-def stage0_script_dedup(raw_items: list) -> list:
- """Deduplicate using difflib.SequenceMatcher on normalized titles.
- Similarity > 0.7 means same event; keep the one with longer summary."""
- if not raw_items:
- return []
-
- # Build list of (normalized_title, item)
- normed = []
- for item in raw_items:
- nt = _normalize_title(item.get('title_raw', ''))
- if nt and len(nt) >= 3:
- normed.append((nt, item))
-
- keep = [] # list of (nt, item) to keep
- for nt, item in normed:
- merged = False
- for i, (knt, kitem) in enumerate(keep):
- ratio = difflib.SequenceMatcher(None, nt, knt).ratio()
- if ratio > 0.7:
- # Same event — keep the one with longer summary
- if len(item.get('summary_raw', '')) > len(kitem.get('summary_raw', '')):
- keep[i] = (nt, item)
- merged = True
- break
- if not merged:
- keep.append((nt, item))
-
- return [item for _, item in keep]
-
-
-# ─── Stage 1: LLM semantic dedup ───────────────────────────────────────────
-
-def stage1_llm_dedup(items: list, env: dict):
- """Use LLM to identify semantic duplicates. Returns (filtered_items, error)."""
- if not items:
- return items, None
-
- indexed = []
- for i, item in enumerate(items):
- indexed.append({
- 'index': i,
- 'title': item.get('title_raw', '')[:80],
- 'summary': item.get('summary_raw', '')[:120],
- })
-
- prompt = (
- '以下是AI领域的新闻条目。有些条目虽然措辞不同,但描述的是同一个事件。'
- '请识别重复项,输出要保留的条目索引列表。只有描述完全相同的具体事件才视为重复。\n\n'
- f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
- '请严格按以下JSON格式输出,不要包含任何其他内容:\n'
- '{"keep_indices": [0, 1, 3, 5]}'
- )
-
- try:
- raw = llm_call(prompt, env)
- obj = _parse_json_from_llm(raw)
- indices = obj.get('keep_indices', [])
- if not isinstance(indices, list):
- raise ValueError('keep_indices is not a list')
- # Filter valid indices
- valid = sorted(set(i for i in indices if isinstance(i, int) and 0 <= i < len(items)))
- if not valid:
- raise ValueError('No valid indices in keep_indices')
- return [items[i] for i in valid], None
- except Exception as e:
- err = f'stage1_llm_dedup failed: {type(e).__name__}: {e}'
- print(err)
- return items, err # Fallback: return all items unchanged
-
-
-# ─── Stage 2a: LLM summary rewrite (parallel) ──────────────────────────────
-
-def stage2a_rewrite_summaries(items: list, env: dict):
- """Rewrite summaries in concise Chinese. Returns (updated_items, error)."""
- if not items:
- return items, None
-
- indexed = []
- for i, item in enumerate(items):
- indexed.append({
- 'index': i,
- 'title': item.get('title_raw', '')[:80],
- 'summary': item.get('summary_raw', '')[:200],
- })
-
- prompt = (
- '请将以下新闻条目的标题和摘要改写为简洁中文。'
- '标题:英文品牌名/模型名保留原样(如GPT-5、Codex),其余翻译为中文。'
- '摘要:每条最多120字,保留核心事实。\n\n'
- f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
- '请严格按以下JSON格式输出:\n'
- '{"summaries": [{"index": 0, "title": "中文标题", "summary": "改写后的摘要"}, ...]}'
- )
-
- try:
- raw = llm_call(prompt, env)
- obj = _parse_json_from_llm(raw)
- summaries = obj.get('summaries', [])
- if not isinstance(summaries, list):
- raise ValueError('summaries is not a list')
-
- result = [dict(item) for item in items] # shallow copy
- for entry in summaries:
- idx = entry.get('index')
- s = entry.get('summary', '')
- t = entry.get('title', '')
- if isinstance(idx, int) and 0 <= idx < len(result):
- if t:
- result[idx] = dict(result[idx], title_raw=t)
- if s:
- result[idx] = dict(result[idx], summary_raw=s)
-
- return result, None
- except Exception as e:
- err = f'stage2a_rewrite_summaries failed: {type(e).__name__}: {e}'
- print(err)
- return items, err # Fallback: return items unchanged
-
-
-# ─── Stage 2b: LLM classify (parallel) ──────────────────────────────────────
-
-def stage2b_classify(items: list, env: dict):
- """Classify each item into a section. Returns (updated_items, error)."""
- if not items:
- return items, None
-
- indexed = []
- for i, item in enumerate(items):
- indexed.append({
- 'index': i,
- 'title': item.get('title_raw', '')[:80],
- 'summary': item.get('summary_raw', '')[:120],
- })
-
- sections_str = '、'.join(SECTION_ORDER)
- prompt = (
- f'请将以下AI新闻条目分类到对应板块。\n'
- f'可选板块:{sections_str}\n\n'
- f'{json.dumps(indexed, ensure_ascii=False)}\n\n'
- '请严格按以下JSON格式输出:\n'
- '{"sections": [{"index": 0, "section": "模型发布/更新"}, ...]}'
- )
-
- try:
- raw = llm_call(prompt, env)
- obj = _parse_json_from_llm(raw)
- sections = obj.get('sections', [])
- if not isinstance(sections, list):
- raise ValueError('sections is not a list')
-
- result = [dict(item) for item in items] # shallow copy
- for entry in sections:
- idx = entry.get('index')
- sec = entry.get('section', '')
- if isinstance(idx, int) and 0 <= idx < len(result) and sec:
- if sec in SECTION_ORDER:
- result[idx] = dict(result[idx], section_hint=sec)
-
- return result, None
- except Exception as e:
- err = f'stage2b_classify failed: {type(e).__name__}: {e}'
- print(err)
- return items, err # Fallback: return items unchanged
-
-
-# ─── Stage 2 parallel execution ─────────────────────────────────────────────
-
-def stage2_parallel(items: list, env: dict):
- """Run stage2a (summary rewrite) and stage2b (classify) in parallel.
- Returns (merged_items, errors_list)."""
- errors = []
- summaries_result = items
- classify_result = items
-
- with ThreadPoolExecutor(max_workers=2) as executor:
- future_summaries = executor.submit(stage2a_rewrite_summaries, items, env)
- future_classify = executor.submit(stage2b_classify, items, env)
-
- # Wait for summary rewrite
- try:
- summaries_result, err = future_summaries.result()
- if err:
- errors.append(err)
- except Exception as e:
- errors.append(f'stage2a exception: {type(e).__name__}: {e}')
-
- # Wait for classify
- try:
- classify_result, err = future_classify.result()
- if err:
- errors.append(err)
- except Exception as e:
- errors.append(f'stage2b exception: {type(e).__name__}: {e}')
-
- # Merge: take summaries from stage2a, sections from stage2b
- merged = []
- for i in range(len(items)):
- new_item = dict(summaries_result[i]) if i < len(summaries_result) else dict(items[i])
- # Apply section from classify result if available
- if i < len(classify_result) and classify_result[i].get('section_hint'):
- new_item['section_hint'] = classify_result[i]['section_hint']
- merged.append(new_item)
-
- return merged, errors
-
-
-# ─── Stage 3: LLM guide/observation ────────────────────────────────────────
-
-def llm_generate_guide(items, today: str, env: dict) -> str:
- """Generate editorial judgment section: main theme + signals + risk."""
- indexed = []
- for i, item in enumerate(items, 1):
- indexed.append({
- 'n': i,
- 'title': item['title'],
- 'summary': item['summary'][:100],
- 'section': item['section'],
- 'source': item.get('source', ''),
- })
- prompt = {
- 'date': today,
- 'task': (
- '你是AI行业编辑。根据以下已经分类和摘要改写好的条目,写「今日观察」。\n\n'
- '格式要求:\n'
- '【主线】blockquote格式,一句话概括今天最值得关注的趋势(不要套话,要具体)\n'
- '【强信号】2-3条,每条格式:编号. 标题(一句话)+ 一两句说明为什么重要\n'
- '【中信号】1-2条,格式同上\n'
- '【待验证】1-2条,格式同上,说明为什么存疑\n\n'
- '写作要求:\n'
- '- 不要空泛总结(如"行业焦点转向XX"),要指向具体事件\n'
- '- 不要引用编号如[1][3],读者看不到对应关系\n'
- '- 不要建议("开发者应该..."之类删掉)\n'
- '- 每条控制在2-3句话以内\n'
- '- 用大白话,不要学术腔\n'
- ),
- 'items': indexed,
- 'rule': '只输出观察文本,不要代码块、不要JSON。严格使用【主线】【强信号】【中信号】【待验证】四个标记。'
- }
- query = json.dumps(prompt, ensure_ascii=False)
- try:
- text = llm_call(query, env)
- text = re.sub(r'^```(?:\w+)?\s*\n?', '', text)
- text = re.sub(r'\n?```\s*$', '', text)
- text = text.strip().strip('"').strip("'")
- return text
- except Exception:
- return ''
-
-
-# ─── Rendering helpers (unchanged) ──────────────────────────────────────────
-
-def _parse_guide_sections(guide: str):
- """Parse guide text into structured sections by 【markers】."""
- sections = {}
- parts = re.split(r'【(主线|强信号|中信号|待验证|建议)】', guide)
- i = 1
- while i < len(parts) - 1:
- key = parts[i].strip()
- content = parts[i + 1].strip()
- sections[key] = content
- i += 2
- return sections
-
-
-def _make_ref_factory(items):
- """Create a [N] → link converter bound to the items list."""
- def make_ref(m):
- idx = int(m.group(1))
- if 1 <= idx <= len(items):
- item = items[idx - 1]
- url = item.get('url', '')
- if url:
- return f'[{idx}]'
- return f'[{idx}]'
- return m.group(0)
- return make_ref
-
-
-def _render_guide_section(lines, title, text, items, is_quote=False):
- """Render a guide section with title on its own line, content below."""
- make_ref = _make_ref_factory(items)
- lines.append(f'**{title}**')
- lines.append('')
- for gline in text.split('\n'):
- gline = gline.strip()
- if not gline:
- continue
- gline = re.sub(r'\[(\d+)\]', make_ref, gline)
- gline = re.sub(r'\[N\]', '', gline)
- gline = gline.strip()
- if not gline:
- continue
- if is_quote:
- lines.append(f'> {gline}')
- else:
- lines.append(gline)
- lines.append('')
-
-
-def format_source_link(item):
- source = item.get('source') or '来源'
- url = item.get('url') or ''
- if url:
- return f'[{source} ↗]({url})'
- return source
-
-
-def blog_markdown(items, guide=None):
- grouped = {k: [] for k in SECTION_ORDER}
- for item in items:
- grouped.setdefault(item['section'], []).append(item)
- n = 1
- lines = []
-
- guide_items = guide if isinstance(guide, list) else []
- make_ref = _make_ref_factory(items)
-
- def clean_guide_text(text):
- text = re.sub(r'\[\d+\]', '', text)
- text = re.sub(r'\[N\]', '', text).strip()
- text = re.sub(r'^主线判断[::]\s*', '', text)
- text = re.sub(r'\s+', ' ', text).strip()
- return text
-
- # === Top: 导览 (theme only) ===
- theme_items = [g for g in guide_items if g.get('type') == 'theme']
- if theme_items:
- lines.append('## 导览')
- lines.append('')
- for g in theme_items:
- text = clean_guide_text(g.get('text', ''))
- if text:
- for para in text.split('\n'):
- para = para.strip()
- if para:
- lines.append(f'> {para}')
- lines.append('')
-
- # === News sections ===
- for sec in SECTION_ORDER:
- sec_items = grouped.get(sec, [])
- if not sec_items:
- continue
- lines.append(f'## {sec}')
- lines.append('')
- for item in sec_items:
- summary = item['summary'].strip()
- if len(summary) > 120:
- summary = summary[:120].rstrip() + '…'
- source_link = format_source_link(item)
- if summary and summary[-1] not in '。!?…':
- summary += '。'
- lines.append(f'**{n}. {item["title"]}**')
- lines.append('')
- lines.append(f'> {summary}{source_link}')
- lines.append('')
- n += 1
-
- # === Bottom: 总结 (strong/medium/risk) ===
- type_labels = {'strong': '强信号', 'medium': '中信号', 'risk': '待验证'}
- summary_types = ['strong', 'medium', 'risk']
- summary_items = [g for g in guide_items if g.get('type') in summary_types]
- if summary_items:
- lines.append('## 总结')
- lines.append('')
- for t in summary_types:
- type_items = [g for g in summary_items if g.get('type') == t]
- if not type_items:
- continue
- label = type_labels.get(t, t)
- lines.append(f'**{label}**')
- lines.append('')
- for g in type_items:
- text = clean_guide_text(g.get('text', ''))
- if not text:
- continue
- title_match = re.search(r'^(.+?)[::]\s*', text)
- if title_match and len(title_match.group(1)) < 60:
- title = title_match.group(1).strip()
- content = text[title_match.end():].strip()
- else:
- sentences = re.split(r'[。!?]', text)
- title = sentences[0].strip() if sentences else text[:40]
- content = text[len(sentences[0]):].strip()
- if content and content[0] in '。!?':
- content = content[1:].strip()
- lines.append(f'- **{title}**')
- if content:
- lines.append(f' {content}')
- lines.append('')
-
- return '\n'.join(lines).strip()
-
-
-def short_summary(blog_url):
- return f'AI日报已发布 👉 {blog_url}'
-
-
-def blog_api_request(method, path, payload=None, token=None, base_url=None):
- url = base_url.rstrip('/') + path
- data = None
- headers = {'Authorization': f'Bearer {token}', 'User-Agent': UA}
- if payload is not None:
- data = json.dumps(payload, ensure_ascii=False).encode('utf-8')
- headers['Content-Type'] = 'application/json'
- req = urllib.request.Request(url, data=data, headers=headers, method=method)
- with urllib.request.urlopen(req, timeout=25) as r:
- return json.loads(r.read().decode('utf-8'))
-
-
-# ─── Main pipeline ──────────────────────────────────────────────────────────
-
-def main():
env = load_env()
- token = env.get('BLOG_SERVICE_TOKEN') or env.get('EPHRON_SERVICE_TOKEN')
- base_url = env.get('BLOG_API_BASE_URL', 'https://blog.ephron.ren')
- if not token:
- print('缺少 blog service token,已停止。')
- sys.exit(1)
-
- errors = []
- source_counts = {}
- raw_items = []
-
- # ── Collect raw items (unchanged) ────────────────────────────────────────
- try:
- aihot_items, raw_daily = parse_aihot(TODAY)
- raw_items.extend(aihot_items)
- source_counts['AI HOT'] = len(aihot_items)
- except urllib.error.HTTPError as e:
- if e.code == 404:
- print(f'今天({TODAY})的 AI HOT 完整日报还没有生成,暂不发布。')
- return
- raise
-
- for name, url in RSS_FEEDS.items():
- try:
- parsed = parse_rss(name, url)
- raw_items.extend(parsed)
- source_counts[name] = len(parsed)
- except Exception as e:
- errors.append(f'{name}: {type(e).__name__}')
- source_counts[name] = 0
-
- juya_items = []
- try:
- juya_items = parse_juya(TODAY)
- except Exception as e:
- errors.append(f'橘鸦AI早报: {type(e).__name__}')
-
- # If juya returned nothing, wait 2 minutes and retry once
- if not juya_items:
- print('橘鸦AI早报尚未就绪,等待 2 分钟后重试...')
- time.sleep(120)
- try:
- juya_items = parse_juya(TODAY)
- except Exception as e:
- errors.append(f'橘鸦AI早报(重试): {type(e).__name__}')
-
- raw_items.extend(juya_items)
- source_counts['橘鸦AI早报'] = len(juya_items)
-
- raw_path = OUT_DIR / 'raw_items.json'
- raw_path.write_text(json.dumps(raw_items, ensure_ascii=False, indent=2), encoding='utf-8')
-
- # ── Stage 0: Script dedup ────────────────────────────────────────────────
- print(f'Stage 0: Script dedup — {len(raw_items)} raw items')
- items = stage0_script_dedup(raw_items)
- stage0_count = len(items)
- print(f'Stage 0 done — {stage0_count} unique items')
-
- # ── Stage 1: LLM semantic dedup ─────────────────────────────────────────
- print(f'Stage 1: LLM semantic dedup')
- items, stage1_err = stage1_llm_dedup(items, env)
- if stage1_err:
- errors.append(stage1_err)
- print(f'Stage 1 done — {len(items)} items')
-
- # ── Stage 2: Parallel summary rewrite + classify ────────────────────────
- print(f'Stage 2: Parallel summary rewrite + classify')
- items, stage2_errs = stage2_parallel(items, env)
- errors.extend(stage2_errs)
- print(f'Stage 2 done — {len(items)} items')
-
- # ── Build final items with title/source fields ──────────────────────────
- # At this point items still have raw fields; convert to final format
- final_items = []
- seen_titles = set()
- for item in items:
- title = clean_text(item.get('title_raw', ''))
- summary = clean_text(item.get('summary_raw', ''))[:120]
- if not title:
- continue
- norm = _normalize_title(title)
- if norm in seen_titles:
- continue
- seen_titles.add(norm)
- section = item.get('section_hint', '') or '行业与公司'
- if section not in SECTION_ORDER:
- section = '行业与公司'
- final_items.append({
- 'title': title,
- 'summary': summary or '该条目暂无摘要。',
- 'section': section,
- 'url': item.get('url') or '',
- 'source': item.get('source_label') or item.get('source_group') or '来源',
- 'source_group': item.get('source_group') or '未知来源',
- 'dedupe_keys': [norm],
- })
-
- # ── Stage 3: LLM guide/observation ──────────────────────────────────────
- print(f'Stage 3: LLM guide generation')
- guide_text = llm_generate_guide(final_items, TODAY, env)
-
- # Parse guide into structured format for blog_markdown
- guide_structured = []
- if guide_text:
- parsed = _parse_guide_sections(guide_text)
- type_map = {'主线': 'theme', '强信号': 'strong', '中信号': 'medium', '待验证': 'risk'}
- for key, text in parsed.items():
- guide_type = type_map.get(key, 'theme')
- if guide_type == 'theme':
- guide_structured.append({'type': 'theme', 'text': text})
- else:
- # Split into individual items by numbered lines
- lines = [l.strip() for l in text.split('\n') if l.strip()]
- for line in lines:
- # Remove leading number like "1. "
- line = re.sub(r'^\d+[\.\、]\s*', '', line)
- if line:
- guide_structured.append({'type': guide_type, 'text': line})
-
- # ── Stage 4: Assemble and publish ───────────────────────────────────────
- print(f'Stage 4: Assemble and publish')
- md = blog_markdown(final_items, guide_structured)
- title = f'AI日报 · {TODAY}'
- tags = ['AI日报', 'AI资讯', '人工智能']
- payload = {'title': title, 'content': md, 'tags': tags}
-
- dry_run = (env.get('AI_DAILY_DRY_RUN') or '').strip().lower() in ('1', 'true', 'yes')
- if dry_run:
- slug = f'dry-run-{TODAY}'
- blog_url = f'{base_url}/posts/{slug}'
- public_ok = True
- print('AI_DAILY_DRY_RUN=1:已完成组装验证,跳过博客创建/发布。')
- else:
- create_resp = blog_api_request('POST', '/api/service/posts', payload=payload, token=token, base_url=base_url)
- slug = create_resp.get('slug')
- if not slug:
- print('Blog 草稿创建失败:未返回 slug')
- sys.exit(1)
- blog_api_request('POST', f'/api/service/posts/{slug}/publish', token=token, base_url=base_url)
- blog_url = f'{base_url}/posts/{slug}'
-
- public_ok = False
- try:
- req = urllib.request.Request(blog_url, headers={'User-Agent': UA})
- with urllib.request.urlopen(req, timeout=20) as r:
- public_ok = getattr(r, 'status', None) == 200
- except Exception:
- public_ok = False
-
- msg = short_summary(blog_url)
- if errors:
- msg += '\n\n注:部分补充源本次采集失败或LLM阶段出错,已自动降级:' + ';'.join(errors)
- if not public_ok:
- msg += '\n\n警告:blog 草稿/发布接口已返回成功,但公开链接暂未验证为 200,请人工复核。'
-
- # Build digest for JSON output
- digest = {
- 'items': final_items,
- 'featured_titles': [i['title'] for i in final_items[:6]],
- 'guide': guide_structured,
- }
-
- (OUT_DIR / 'llm_digest.json').write_text(json.dumps(digest, ensure_ascii=False, indent=2), encoding='utf-8')
- (OUT_DIR / 'blog_markdown.md').write_text(md, encoding='utf-8')
- (OUT_DIR / 'chat_summary.txt').write_text(msg, encoding='utf-8')
- (OUT_DIR / 'run_meta.json').write_text(json.dumps({
- 'date': TODAY,
- 'slug': slug,
- 'blog_url': blog_url,
- 'public_ok': public_ok,
- 'errors': errors,
- 'aihot_sections': [s.get('label') for s in raw_daily.get('sections', [])],
- 'raw_item_count': len(raw_items),
- 'stage0_count': stage0_count,
- 'final_item_count': len(final_items),
- 'has_juya': any(i.get('source_group') == '橘鸦AI早报' for i in raw_items),
- 'source_counts': source_counts,
- 'featured_titles': digest.get('featured_titles', []),
- }, ensure_ascii=False, indent=2), encoding='utf-8')
-
- print(msg)
+    dry_run = is_dry_run(env)  # below, set-but-empty values fall back to the default exactly like unset ones
+    run_daily_report(
+        run_date=env.get("AI_DAILY_RUN_DATE") or "today",
+        mode="dry-run" if dry_run else (env.get("AI_DAILY_MODE") or "publish"),
+        source_mode=env.get("AI_DAILY_SOURCE_MODE") or "live",
+        llm_mode=env.get("AI_DAILY_LLM_MODE") or "live",
+        out_dir=Path(env.get("AI_DAILY_OUT_DIR") or OUT_DIR),
+        base_url=env.get("BLOG_API_BASE_URL") or "https://blog.ephron.ren",
+        sources_path=Path(env["AI_DAILY_SOURCES_PATH"]) if env.get("AI_DAILY_SOURCES_PATH") else None,
+        env=env,
+    )
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/script/blog_markdown.md b/script/blog_markdown.md
deleted file mode 100644
index 2d77cda..0000000
--- a/script/blog_markdown.md
+++ /dev/null
@@ -1,198 +0,0 @@
-## 导览
-
-> > 微软与OpenAI正式分家、Anthropic提交招股书、DeepSeek计划融500亿——AI行业正在从“联盟军”转向“诸侯争霸”。
-
-## 模型发布/更新
-
-**1. Grok Imagine 1.5 预览版发布**
-
-> Grok Imagine 1.5 预览版即日起在 API 中上线,SpaceXAI 持续发力。[X:@cb_doge ↗](https://x.com/cb_doge/status/2062242490745594085)
-
-**2. MiniMax M3 1M token 解码加速 15.6 倍**
-
-> MiniMax M3 在 1M token 下解码加速 15.6 倍,FireworksAI_HQ 提供推理支持。[X:@MiniMax_AI ↗](https://x.com/MiniMax_AI/status/2062316914618388758)
-
-**3. Miso One 开源语音模型:8B 参数、110ms 延迟、一次语音克隆**
-
-> Miso One 发布 8B 参数开源语音模型,支持一次语音克隆(短样本),推理延迟 110ms,权重已开源,可自托管,API 即将推出,演示已上线。[X:@kimmonismus ↗](https://x.com/kimmonismus/status/2062210845308780639)
-
-**4. Ideogram v4.0 发布:2K 分辨率和 JSON 提示支持**
-
-> Ideogram v4.0 发布,原生 2K 分辨率,文字渲染出色,支持 JSON 提示词,可在 Krea 中体验。[X:@krea_ai ↗](https://x.com/krea_ai/status/2062227837130887567)
-
-## 产品与工具
-
-**5. Meta 面向 WhatsApp Business 的 AI 智能体现已全球上线**
-
-> Meta 为 WhatsApp Business 推出的 AI 智能体面向全球商家开放,按模型 token 使用量收费。[TechCrunch ↗](https://techcrunch.com/2026/06/03/metas-ai-agent-for-whatsapp-business-is-now-available-globally)
-
-**6. NousResearch 发布 Hermes Agent 桌面应用公测版**
-
-> NousResearch 推出 Hermes Agent 桌面应用公测版。[X:@SiliconFlowAI ↗](https://x.com/SiliconFlowAI/status/2062042813852995899)
-
-**7. xAI Grok 语音模型上线 Vapi 平台**
-
-> xAI 的 Grok STT 和 TTS 语音模型登陆企业语音 AI 平台 Vapi,可用于构建自定义语音智能体。[X:@xai ↗](https://x.com/xai/status/2062209374039499178)
-
-**8. Grok 模型登陆 Cloudflare AI Gateway**
-
-> Grok 模型现已可在 Cloudflare AI Gateway 上试用。[X:@xai ↗](https://x.com/xai/status/2062294202625696081)
-
-**9. OpenShell v0.0.55 发布:新增 Vertex AI 推理支持**
-
-> OpenShell v0.0.55 发布,新增 Google Vertex AI 推理支持,改进策略可见性、Podman 检测和 GPU 沙箱行为。[X:@NVIDIAAI ↗](https://x.com/NVIDIAAI/status/2062210034109677665)
-
-**10. Replit 上线 SEO Agent 助应用被发现**
-
-> Replit 推出 SEO Agent,扫描应用并提供修复建议,帮助应用在网页和 AI 搜索中被发现。[X:@Replit ↗](https://x.com/Replit/status/2062211976995188871)
-
-**11. OpenClaw 2026.6.1 发布:新增 Windows 节点与技能工坊**
-
-> OpenClaw 2026.6.1 发布,新增原生 Windows 节点主机、技能工坊和工作板编排,支持 MiniMax M3。[X:@openclaw ↗](https://x.com/openclaw/status/2062288421406785710)
-
-**12. Reachy Mini 添加 MCP 工具**
-
-> Reachy Mini 推出公开 MCP canary Space,支持远程工具调用。[Hugging Face:Blog ↗](https://huggingface.co/blog/adding-mcp-tools-to-reachy-mini)
-
-**13. 刚刚,Meta Skill 来了**
-
-> GitHub 热门仓库 OpenSquilla 发布,代表 Meta Skill 新动向。[量子位 ↗](https://www.qbitai.com/2026/06/428335.html)
-
-## 开发与工程
-
-**14. Qwen Cloud 全球 AI 黑客马拉松启动**
-
-> 首届 Qwen Cloud 全球 AI 黑客马拉松启动,5 大赛道,总奖金超 7 万美元(赛道冠军 1 万美元),Devpost 报名。[X:@alibaba_cloud ↗](https://x.com/alibaba_cloud/status/2062113338994172169)
-
-**15. 洪水韧性新篇章:Google 开源水文建模框架**
-
-> Google Research 开源基于 PyTorch 的水文建模框架,采用 Flood Hub 相同架构,允许各国气象部门在本地训练 AI 洪水预报模型。[Google Research:Blog ↗](https://research.google/blog/the-next-chapter-in-flood-resilience-open-sourcing-googles-hydrology-framework)
-
-**16. 文章:导致 Spark 在 Kubernetes 上 OOM 失败的两个错误配置**
-
-> 迁移 Spark 到 AKS 后,两个配置交互导致 OOM:spark.kubernetes.local.dirs.tmpfs 使 shuffle spill 改用 RAM 而非磁盘。[InfoQ AI ↗](https://www.infoq.com/articles/spark-oom-kubernetes-misconfigurations/?utm_campaign=infoq_content&utm_source=infoq&utm_medium=feed&utm_term=AI%2C+ML+%26+Data+Engineering)
-
-## 行业与公司
-
-**17. 微软与 OpenAI 分道扬镳——如今双方准备正面交锋**
-
-> 微软与 OpenAI 合作关系破裂,进入直接竞争。微软 AI 主管 Mustafa Suleyman 称微软需独立证明能力。[The Verge ↗](https://www.theverge.com/ai-artificial-intelligence/942242/microsoft-build-ai-agents-openai-competition)
-
-**18. 欧盟公布全面技术主权计划,推动芯片与 AI 自主发展**
-
-> 欧盟推出技术主权计划,扩大本土半导体、AI 和云计算供应链,减少对美亚依赖。[Bloomberg ↗](https://www.bloomberg.com/news/articles/2026-06-03/europe-unveils-sweeping-tech-sovereignty-plan-to-boost-chips-ai)
-
-**19. Sensor Tower:OpenAI 旗下 ChatGPT 月活已破 10 亿,史上最快**
-
-> Sensor Tower 估计 ChatGPT 月活于 2025 年 5 月突破 10 亿,增速史上最快;Claude 月活 5600 万,同比增 640%。[IT之家 ↗](https://www.ithome.com/0/959/083.htm)
-
-**20. 消息称 DeepSeek 首轮融资拟筹集 500 亿元,腾讯、宁德时代等参投**
-
-> DeepSeek 首轮拟融资 500 亿元,投后估值 3500-4000 亿元。创始人梁文峰出资 200 亿,腾讯拟投 100 亿,宁德时代 50 亿。[IT之家 ↗](https://www.ithome.com/0/959/249.htm)
-
-**21. Suno 完成 4 亿美元 D 轮融资**
-
-> Suno 完成 4 亿美元 D 轮融资,估值 54 亿美元,致力于让更多人体验音乐制作。[X:@suno ↗](https://x.com/suno/status/2062183524887675243)
-
-**22. 宏利香港与阿里云达成 AI 战略合作**
-
-> 宏利香港与阿里云建立战略合作,共建负责任 AI 创新框架,加速 AI 部署。[X:@alibaba_cloud ↗](https://x.com/alibaba_cloud/status/2062006591377829922)
-
-**23. 优步每月 1,500 美元的 AI 使用上限为 AI 工具定价提供参考**
-
-> 优步将 AI 工具月使用上限设为 1500 美元,为行业 AI 定价提供参考信号。[Simon Willison ↗](https://simonwillison.net/2026/Jun/3/uber-caps-usage)
-
-**24. 世界模型榜首易主!跨维智能登顶 WorldArena**
-
-> 跨维智能在 WorldArena 上登顶,成为世界模型新榜首。[量子位 ↗](https://www.qbitai.com/2026/06/428435.html)
-
-**25. 刚刚,Anthropic 提交了招股书!**
-
-> Anthropic 已提交招股书,预计最快 Q4 上市。[量子位 ↗](https://www.qbitai.com/2026/06/428407.html)
-
-## 论文与研究
-
-**26. 斯坦福大学法学院研究:人工智能的表现优于法学教授**
-
-> 斯坦福大学法学院研究显示,AI 表现优于法学教授,该结果在 Hacker News 获 104 个 Points。[law.stanford.edu ↗](https://law.stanford.edu/press/ai-outperforms-law-professors-in-stanford-law-study)
-
-**27. NVIDIA Research 在 CVPR 2026 发表三篇论文:规模化训练实现抓取、自动驾驶与智能体泛化**
-
-> NVIDIA Research 在 CVPR 2026 发表三篇论文:零样本抓取模型 GraspGen-X、自动驾驶 LCDrive、具身智能体 NitroGen,均基于大规模训练。[blogs.nvidia.com:Blog ↗](https://blogs.nvidia.com/blog/cvpr-research-grasping-driving-agent-training)
-
-**28. Anthropic 分析 832 个 AI 恶意账户:中高风险攻击者半年从 33% 跃至 56%**
-
-> Anthropic 分析 832 个被封恶意账户,67.3% 使用 AI 编写恶意软件,中高风险占比半年内从 33% 升至 56%,传统威胁评估失效。[Anthropic ↗](https://www.anthropic.com/news/AI-enabled-cyber-threats-mitre-attack)
-
-**29. 微软研究:装瓶厂 AI 从聊天到决策**
-
-> 微软在中西部装瓶厂试点三个月显示,AI 超越聊天进入决策领域,需应对真实风险和可靠性要求。[X:@MSFTResearch ↗](https://x.com/MSFTResearch/status/2062204914223169635)
-
-**30. 世界模型的功能分类**
-
-> World Labs 与李飞飞发文梳理“世界模型”概念,基于 POMDP 框架分类,指出当前所谓世界模型本质是同一循环的不同投影(如渲染器)。[X:@drfeifei ↗](https://x.com/drfeifei/status/2062247238143996275)
-
-**31. 从看懂世界到做对动作,卧安机器人 OneModel 1.7 用一条「隐式通路」打通了具身智能的关键断层**
-
-> 卧安机器人 OneModel 1.7 通过隐式通路在潜在空间完成信息传导,打通具身智能关键断层。[量子位 ↗](https://www.qbitai.com/2026/06/428703.html)
-
-## 人物与花絮
-
-**32. 黄仁勋与纳德拉共议智能体 AI 时代**
-
-> 黄仁勋与纳德拉在台北 MSBuild 同台,展示 NVIDIA 与微软从 Windows 到 AI 工厂的协作。[X:@nvidia ↗](https://x.com/nvidia/status/2062228974273716457)
-
-**33. Satya Nadella 谈微软 Build 大会主旨演讲**
-
-> Satya Nadella 在 Microsoft Build 主旨演讲,强调共同构建前沿智能生态系统。[X:@satyanadella ↗](https://x.com/satyanadella/status/2062022060176801826)
-
-**34. Karpathy 的 llm-wiki 项目获超五千星**
-
-> @karpathy 的 llm-wiki 项目几周内获 5000+ 星,理念是让 LLM 构建并维护可持续进化的维基知识库。[X:@SiliconFlowAI ↗](https://x.com/SiliconFlowAI/status/2062054848762450324)
-
-## 观点与教程
-
-**35. 智能体工程实战窍门全录**
-
-> @mvanhorn 分享智能体工程方法论:人主导方向、智能体执行,核心为 plan.md 约束行为,总结 22 条实战技巧及完整工具栈。[X:@shao__meng ↗](https://x.com/shao__meng/status/2061974983094755575)
-
-**36. Anthropic 用 Claude 赋能自助数据分析**
-
-> Anthropic 用 Claude 自动化 95% 业务分析查询,准确率约 95%,通过智能体分析栈解决概念-实体歧义等三大错误来源。[Claude:Blog ↗](https://claude.com/blog/how-anthropic-enables-self-service-data-analytics-with-claude)
-
-**37. 超越聊天机器人的直接偏好优化**
-
-> Dharma-AI 在 Hugging Face 博客发文,探讨直接偏好优化(DPO)在聊天机器人之外的广泛应用。[Hugging Face:Blog ↗](https://huggingface.co/blog/Dharma-AI/direct-preference-optimization-beyond-chatbots)
-
-**38. 演讲:选择你的 AI 副驾驶:最大化开发效率**
-
-> Sepehr Khosravi 探讨开发效率工具演变,评估 Cursor 和 Claude Code 等优势,为高级工程师提供可行技巧。[InfoQ AI ↗](https://www.infoq.com/presentations/choosing-ai-copilot/?utm_campaign=infoq_content&utm_source=infoq&utm_medium=feed&utm_term=AI%2C+ML+%26+Data+Engineering)
-
-## 总结
-
-**强信号**
-
-- **微软与OpenAl分道扬镳,双方开始正面竞争**
- 合作终结后,微软AI主管Mustafa Suleyman称公司必须独立证明能力,这意味着微软将不再依赖OpenAI的模型,而是全力押注自研,OpenAI也失去最大云盟友。
-
-- **Anthropic提交招股书,预计最快Q4上市**
- 这标志着安全派AI公司正式进入资本市场,与OpenAI争夺投资者注意,Claude的月活同比增长640%也为其估值提供了底气。
-
-- **ChatGPT月活突破10亿,成为史上增长最快的应用**
- Sensor Tower数据显示ChatGPT在2025年5月达到这一里程碑,Claude月活5600万,两家头部消费级AI应用的用户粘性正在拉开差距。
-
-**中信号**
-
-- **Miso One发布8B开源语音模型,支持一次语音克隆且延迟仅110ms**
- 权重已开放、可自托管,意味着实时语音克隆的门槛从专有API降到了个人部署,可能加速语音交互在开发者中的普及。
-
-- **欧盟公布全面技术主权计划,推动芯片与AI自主发展**
- 计划扩大本土半导体、AI和云计算供应链,目标减少对美亚依赖——这将对全球AI公司的合规、市场准入和数据主权产生实质影响。
-
-**待验证**
-
-- **DeepSeek首轮融资拟筹500亿元,腾讯、宁德时代参投**
- 投后估值高达3500-4000亿元,但融资消息来源为IT之家,未见官方确认。如此大体量的AI融资在国内市场是否顺利落地,存在不确定性。
-
-- **跨维智能登顶WorldArena世界模型榜首**
- WorldArena的评测权威性尚未被广泛验证,且“世界模型”概念本身缺乏统一标准,需要看后续是否有独立第三方复现其能力。
\ No newline at end of file
diff --git a/script/run_meta.json b/script/run_meta.json
deleted file mode 100644
index eba646f..0000000
--- a/script/run_meta.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
- "date": "2026-06-04",
- "slug": "ai-2026-06-04",
- "blog_url": "https://blog.ephron.ren/posts/ai-2026-06-04",
- "public_ok": true,
- "errors": [
- "橘鸦AI早报(重试): TimeoutError"
- ],
- "aihot_sections": [
- "模型发布/更新",
- "产品发布/更新",
- "行业动态",
- "论文研究",
- "技巧与观点"
- ],
- "raw_item_count": 39,
- "stage0_count": 39,
- "final_item_count": 38,
- "has_juya": false,
- "source_counts": {
- "AI HOT": 32,
- "InfoQ AI": 2,
- "MIT科技评论AI": 0,
- "量子位": 5,
- "橘鸦AI早报": 0
- },
- "featured_titles": [
- "Grok Imagine 1.5 预览版发布",
- "MiniMax M3 1M token 解码加速 15.6 倍",
- "Miso One 开源语音模型:8B 参数、110ms 延迟、一次语音克隆",
- "Ideogram v4.0 发布:2K 分辨率和 JSON 提示支持",
- "Meta 面向 WhatsApp Business 的 AI 智能体现已全球上线",
- "NousResearch 发布 Hermes Agent 桌面应用公测版"
- ]
-}
\ No newline at end of file
diff --git a/skill/scripts/.gitkeep b/skill/scripts/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/skill/scripts/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/skill/scripts/run_daily_report.py b/skill/scripts/run_daily_report.py
new file mode 100644
index 0000000..033bcda
--- /dev/null
+++ b/skill/scripts/run_daily_report.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+from ai_daily_report.cli import main
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
+
diff --git a/tests/fixtures/.gitkeep b/tests/fixtures/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/tests/fixtures/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..3372679
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,47 @@
+import unittest
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from ai_daily_report.cli import build_parser, main
+
+
+class CliTests(unittest.TestCase):
+    def test_run_command_parses_date_and_mode(self):
+        parser = build_parser()
+
+        args = parser.parse_args(["run", "--date", "2026-06-04", "--mode", "dry-run", "--source-mode", "live", "--llm-mode", "live", "--sources-path", "config/sources.json"])
+
+        self.assertEqual(args.command, "run")
+        self.assertEqual(args.date, "2026-06-04")
+        self.assertEqual(args.mode, "dry-run")
+        self.assertEqual(args.source_mode, "live")
+        self.assertEqual(args.llm_mode, "live")
+        self.assertEqual(args.sources_path, "config/sources.json")
+
+    def test_main_returns_zero_for_parseable_command(self):
+        self.assertEqual(main(["run", "--date", "2026-06-04", "--mode", "dry-run"]), 0)
+
+    def test_main_mock_run_writes_outputs(self):
+        with TemporaryDirectory() as temp_dir:
+            exit_code = main(
+                [
+                    "run",
+                    "--date",
+                    "2026-06-04",
+                    "--mode",
+                    "dry-run",
+                    "--source-mode",
+                    "mock",
+                    "--llm-mode",
+                    "mock",
+                    "--out-dir",
+                    temp_dir,
+                ]
+            )
+            # Checked inside the `with` so the temporary directory still exists.
+            self.assertEqual(exit_code, 0)
+            self.assertTrue((Path(temp_dir) / "2026-06-04" / "blog_markdown.md").exists())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_clients.py b/tests/test_clients.py
new file mode 100644
index 0000000..ccf9e9d
--- /dev/null
+++ b/tests/test_clients.py
@@ -0,0 +1,47 @@
+import json
+import unittest
+from unittest.mock import patch
+
+from ai_daily_report.clients import BlogApiClient, OpenAICompatibleClient, fetch_text
+
+
+class FakeResponse:
+    status = 200
+    # Returned by the patched urllib.request.urlopen in the tests of this module.
+    def __init__(self, body):
+        self.body = body
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        return False
+
+    def read(self):
+        return self.body
+
+
+class ClientTests(unittest.TestCase):
+    def test_fetch_text_decodes_response(self):
+        with patch("urllib.request.urlopen", return_value=FakeResponse("ok".encode("utf-8"))):
+            self.assertEqual(fetch_text("https://example.com", 1), "ok")
+
+    def test_openai_compatible_client_returns_message_content(self):
+        body = json.dumps({"choices": [{"message": {"content": "hello"}}]}).encode("utf-8")
+        with patch("urllib.request.urlopen", return_value=FakeResponse(body)):
+            client = OpenAICompatibleClient(api_key="key", base_url="https://llm.example/v1", model="model")
+            self.assertEqual(client.chat("prompt"), "hello")
+
+    def test_blog_api_client_create_and_publish(self):
+        responses = [
+            FakeResponse(json.dumps({"slug": "ai-2026-06-04"}).encode("utf-8")),
+            FakeResponse(json.dumps({"ok": True}).encode("utf-8")),
+        ]
+        with patch("urllib.request.urlopen", side_effect=responses):
+            client = BlogApiClient(base_url="https://blog.example", token="token")
+            self.assertEqual(client.create_post({"title": "t"})["slug"], "ai-2026-06-04")
+            client.publish_post("ai-2026-06-04")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_config_loading.py b/tests/test_config_loading.py
new file mode 100644
index 0000000..cf80a19
--- /dev/null
+++ b/tests/test_config_loading.py
@@ -0,0 +1,27 @@
+import unittest
+from pathlib import Path
+
+from ai_daily_report.config import load_source_configs
+from ai_daily_report.sources.registry import get_source_fetcher
+
+
+ROOT = Path(__file__).resolve().parents[1]
+
+
+class ConfigLoadingTests(unittest.TestCase):
+    def test_load_source_configs_from_json(self):
+        configs = load_source_configs(ROOT / "config" / "sources.json")
+
+        self.assertGreaterEqual(len(configs), 5)
+        self.assertEqual(configs[0].name, "AI HOT")
+        self.assertEqual(configs[0].type, "aihot")
+
+    def test_all_configured_source_types_are_registered(self):
+        configs = load_source_configs(ROOT / "config" / "sources.json")
+        # Every configured source type must resolve to a callable fetcher.
+        for config in configs:
+            self.assertTrue(callable(get_source_fetcher(config.type)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_dry_run_config.py b/tests/test_dry_run_config.py
new file mode 100644
index 0000000..bc32cd6
--- /dev/null
+++ b/tests/test_dry_run_config.py
@@ -0,0 +1,33 @@
+import importlib.util
+import unittest
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py"
+
+
+def load_pipeline_module():
+    spec = importlib.util.spec_from_file_location("ai_daily_blog_pipeline", SCRIPT)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+class DryRunConfigTests(unittest.TestCase):
+    def test_dry_run_does_not_require_blog_token(self):
+        module = load_pipeline_module()
+
+        self.assertTrue(module.is_dry_run({"AI_DAILY_DRY_RUN": "1"}))
+        self.assertFalse(module.requires_blog_token({"AI_DAILY_DRY_RUN": "1"}))
+
+    def test_publish_mode_requires_blog_token(self):
+        module = load_pipeline_module()
+        # An empty environment is treated as a publishing (non-dry-run) configuration.
+        self.assertFalse(module.is_dry_run({}))
+        self.assertTrue(module.requires_blog_token({}))
+
+
+if __name__ == "__main__":
+    unittest.main()
+
diff --git a/tests/test_env_config.py b/tests/test_env_config.py
new file mode 100644
index 0000000..cc452f6
--- /dev/null
+++ b/tests/test_env_config.py
@@ -0,0 +1,87 @@
+import unittest
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from ai_daily_report.env import resolve_blog_token, resolve_llm_config
+
+
+class EnvConfigTests(unittest.TestCase):
+    def test_resolve_llm_config_prefers_generic_values(self):
+        config = resolve_llm_config(
+            {
+                "LLM_API_KEY": "generic-key",
+                "LLM_BASE_URL": "https://generic.example/v1",
+                "LLM_MODEL": "generic-model",
+                "SUB2API_API_KEY": "sub-key",
+                "SUB2API_BASE_URL": "https://sub.example/v1",
+                "SUB2API_MODEL": "sub-model",
+            }
+        )
+        # The generic LLM_* values are expected to win over the SUB2API_* ones.
+        self.assertEqual(
+            config,
+            {
+                "api_key": "generic-key",
+                "base_url": "https://generic.example/v1",
+                "model": "generic-model",
+            },
+        )
+
+    def test_resolve_llm_config_reports_missing_fields(self):
+        with self.assertRaisesRegex(ValueError, "missing_llm_config: LLM_BASE_URL,LLM_MODEL"):
+            resolve_llm_config({"LLM_API_KEY": "key"})
+
+    def test_resolve_llm_config_follows_hermes_provider_config(self):
+        with TemporaryDirectory() as temp_dir:
+            hermes_dir = Path(temp_dir)
+            (hermes_dir / "config.yaml").write_text(
+                """
+model:
+  provider: sub2api
+  default: findmini/gpt-5.5
+  base_url: http://sub2api.example/v1
+""".strip(),
+                encoding="utf-8",
+            )
+            (hermes_dir / ".env").write_text("SUB2API_API_KEY=hermes-key\n", encoding="utf-8")
+
+            config = resolve_llm_config({}, hermes_dir=hermes_dir)
+
+        self.assertEqual(
+            config,
+            {
+                "api_key": "hermes-key",
+                "base_url": "http://sub2api.example/v1",
+                "model": "findmini/gpt-5.5",
+            },
+        )
+
+    def test_resolve_llm_config_uses_hermes_auth_json_env_source(self):
+        with TemporaryDirectory() as temp_dir:
+            hermes_dir = Path(temp_dir)
+            (hermes_dir / "config.yaml").write_text(
+                """
+model:
+  provider: sub2api
+  default: findmini/gpt-5.5
+  base_url: http://sub2api.example/v1
+""".strip(),
+                encoding="utf-8",
+            )
+            (hermes_dir / "auth.json").write_text(
+                '{"credential_pool": {"sub2api": [{"source": "env:SUB2API_API_KEY"}]}}',
+                encoding="utf-8",
+            )
+
+            config = resolve_llm_config({"SUB2API_API_KEY": "auth-env-key"}, hermes_dir=hermes_dir)
+
+        self.assertEqual(config["api_key"], "auth-env-key")
+        self.assertEqual(config["base_url"], "http://sub2api.example/v1")
+        self.assertEqual(config["model"], "findmini/gpt-5.5")
+
+    def test_resolve_blog_token_uses_supported_names(self):
+        self.assertEqual(resolve_blog_token({"EPHRON_SERVICE_TOKEN": "token"}), "token")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_env_loading.py b/tests/test_env_loading.py
new file mode 100644
index 0000000..38d28f8
--- /dev/null
+++ b/tests/test_env_loading.py
@@ -0,0 +1,39 @@
+import importlib.util
+import os
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+
+
+ROOT = Path(__file__).resolve().parents[1]
+SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py"
+
+
+def load_pipeline_module():
+    spec = importlib.util.spec_from_file_location("ai_daily_blog_pipeline", SCRIPT)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+class EnvLoadingTests(unittest.TestCase):
+    def test_project_env_is_loaded_and_process_env_wins(self):
+        module = load_pipeline_module()
+        env_text = "LLM_MODEL=file-model\nLLM_BASE_URL=https://file.example/v1\n"
+        with patch.object(module.Path, "home", return_value=ROOT / "missing-home"):
+            with patch.dict(os.environ, {"LLM_MODEL": "process-model"}, clear=False):
+                # An inherited LLM_BASE_URL would mask the file value; patch.dict restores it on exit.
+                os.environ.pop("LLM_BASE_URL", None)
+                with patch.object(module, "PROJECT_ENV_PATH", ROOT / ".env.test"):
+                    (ROOT / ".env.test").write_text(env_text, encoding="utf-8")
+                    try:
+                        env = module.load_env()
+                    finally:
+                        (ROOT / ".env.test").unlink(missing_ok=True)
+        self.assertEqual(env["LLM_BASE_URL"], "https://file.example/v1")
+        self.assertEqual(env["LLM_MODEL"], "process-model")
+
+
+if __name__ == "__main__":
+    unittest.main()
+
diff --git a/tests/test_legacy_script_delegation.py b/tests/test_legacy_script_delegation.py
new file mode 100644
index 0000000..7c24e61
--- /dev/null
+++ b/tests/test_legacy_script_delegation.py
@@ -0,0 +1,57 @@
+import importlib.util
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+
+
+ROOT = Path(__file__).resolve().parents[1]
+SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py"
+
+
+def load_pipeline_module():
+    spec = importlib.util.spec_from_file_location("ai_daily_blog_pipeline", SCRIPT)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+class LegacyScriptDelegationTests(unittest.TestCase):
+    def test_main_delegates_to_new_pipeline_by_default(self):
+        module = load_pipeline_module()
+        calls = []
+
+        def fake_run_daily_report(**kwargs):
+            calls.append(kwargs)
+            return {"reports": {"stage8": {"status": "ok"}}}
+        # NOTE(review): only intercepts if main() looks run_daily_report up on ai_daily_report.runner at call time - confirm.
+        with patch.object(module, "load_env", return_value={"AI_DAILY_DRY_RUN": "1"}):
+            with patch("ai_daily_report.runner.run_daily_report", side_effect=fake_run_daily_report):
+                module.main()
+
+        self.assertEqual(len(calls), 1)
+        self.assertEqual(calls[0]["mode"], "dry-run")
+        self.assertEqual(calls[0]["source_mode"], "live")
+        self.assertEqual(calls[0]["llm_mode"], "live")
+
+    def test_main_allows_mock_modes_for_local_test(self):
+        module = load_pipeline_module()
+        calls = []
+
+        def fake_run_daily_report(**kwargs):
+            calls.append(kwargs)
+            return {"reports": {"stage8": {"status": "ok"}}}
+
+        with patch.object(
+            module,
+            "load_env",
+            return_value={"AI_DAILY_DRY_RUN": "1", "AI_DAILY_SOURCE_MODE": "mock", "AI_DAILY_LLM_MODE": "mock"},
+        ):
+            with patch("ai_daily_report.runner.run_daily_report", side_effect=fake_run_daily_report):
+                module.main()
+
+        self.assertEqual(calls[0]["source_mode"], "mock")
+        self.assertEqual(calls[0]["llm_mode"], "mock")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_llm_utils.py b/tests/test_llm_utils.py
new file mode 100644
index 0000000..fa73cd3
--- /dev/null
+++ b/tests/test_llm_utils.py
@@ -0,0 +1,17 @@
+import unittest
+
+from ai_daily_report.llm import parse_json_object
+
+
+class LlmUtilsTests(unittest.TestCase):
+    def test_parse_json_object_strips_markdown_fence(self):
+        self.assertEqual(parse_json_object('```json\n{"ok": true}\n```'), {"ok": True})
+
+    def test_parse_json_object_raises_without_json(self):
+        with self.assertRaises(ValueError):
+            parse_json_object("not json")
+
+
+if __name__ == "__main__":
+    unittest.main()
+
diff --git a/tests/test_markdown_rendering.py b/tests/test_markdown_rendering.py
new file mode 100644
index 0000000..205f379
--- /dev/null
+++ b/tests/test_markdown_rendering.py
@@ -0,0 +1,39 @@
+import unittest
+
+from ai_daily_report.assemble import assemble_markdown
+from ai_daily_report.models import NewsItem
+
+
+class MarkdownRenderingTests(unittest.TestCase):
+    def test_blog_markdown_strips_double_blockquote_and_reference_markers(self):
+        items = [
+            NewsItem(
+                id="a",
+                source_group="AI HOT",
+                source_label="OpenAI:Blog",
+                source_role="primary",
+                source_priority=10,
+                title_raw="测试模型发布",
+                title_norm="测试模型发布",
+                summary_raw="测试摘要",
+                title="测试模型发布",
+                summary="测试摘要",
+                url="https://openai.com/blog/test",
+                canonical_url="https://openai.com/blog/test",
+                section="模型与能力",
+            )
+        ]
+        guide = {"theme": "> 主线判断:测试主线[1]", "threads": []}
+        # The theme carries a leading "> ", a "主线判断" label and a "[1]" marker that must not reach the output.
+        md, _ = assemble_markdown(items, guide)
+
+        self.assertIn("## 导览", md)
+        self.assertIn("## 模型与能力", md)
+        self.assertIn("[OpenAI:Blog ↗](https://openai.com/blog/test)", md)
+        self.assertNotIn("> >", md)
+        self.assertNotIn("[1]", md)
+        self.assertNotIn("主线判断", md)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_project_structure.py b/tests/test_project_structure.py
new file mode 100644
index 0000000..47a71c7
--- /dev/null
+++ b/tests/test_project_structure.py
@@ -0,0 +1,33 @@
+import unittest
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+
+
+class ProjectStructureTests(unittest.TestCase):
+    def test_pipeline_plan_structure_exists(self):
+        expected_paths = [
+            "ai_daily_report/sources/__init__.py",
+            "ai_daily_report/sources/aihot.py",
+            "ai_daily_report/sources/rss.py",
+            "ai_daily_report/sources/juya.py",
+            "ai_daily_report/sources/registry.py",
+            "ai_daily_report/llm.py",
+            "ai_daily_report/validate.py",
+            "ai_daily_report/publish.py",
+            "ai_daily_report/cli.py",
+            "config/sources.json",
+            "config/pipeline.json",
+            "tests/fixtures/.gitkeep",
+            "skill/scripts/.gitkeep",
+            "skill/scripts/run_daily_report.py",
+        ]
+        # Collect every missing path so a failure lists all of them at once.
+        missing = [path for path in expected_paths if not (ROOT / path).exists()]
+
+        self.assertEqual(missing, [])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_runner.py b/tests/test_runner.py
new file mode 100644
index 0000000..5086f91
--- /dev/null
+++ b/tests/test_runner.py
@@ -0,0 +1,132 @@
+import unittest
+import json
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from ai_daily_report.runner import run_daily_report
+
+
+class RunnerTests(unittest.TestCase):
+    def test_run_daily_report_mock_mode_writes_markdown_and_reports(self):
+        with TemporaryDirectory() as temp_dir:
+            result = run_daily_report(
+                run_date="2026-06-04",
+                mode="dry-run",
+                source_mode="mock",
+                llm_mode="mock",
+                out_dir=Path(temp_dir),
+                base_url="https://blog.example",
+            )
+
+            run_dir = Path(result["run_dir"])
+            self.assertTrue((run_dir / "blog_markdown.md").exists())
+            self.assertTrue((run_dir / "run_report.json").exists())
+            self.assertEqual(result["reports"]["stage8"]["status"], "ok")
+
+    def test_run_daily_report_live_sources_can_use_config_and_fetch_text(self):
+        with TemporaryDirectory() as temp_dir:
+            out_dir = Path(temp_dir) / "out"
+            source_config = Path(temp_dir) / "sources.json"
+            source_config.write_text(
+                json.dumps(
+                    [
+                        {
+                            "name": "InfoQ AI",
+                            "type": "rss",
+                            "url": "https://feed.example/rss",
+                            "role": "supplement",
+                            "priority": 40,
+                            "enabled": True,
+                        }
+                    ]
+                ),
+                encoding="utf-8",
+            )
+
+            def fetch_text(url, timeout):
+                return """<rss><channel><item><title>GPT-5 API 发布</title><link>https://example.com/gpt5</link><description>OpenAI 发布 GPT-5 API。</description></item></channel></rss>
+"""
+            # NOTE(review): RSS tags above were rebuilt from tag-stripped text - confirm against the upstream commit.
+            result = run_daily_report(
+                run_date="2026-06-04",
+                mode="dry-run",
+                source_mode="live",
+                llm_mode="mock",
+                out_dir=out_dir,
+                base_url="https://blog.example",
+                sources_path=source_config,
+                fetch_text=fetch_text,
+            )
+
+            self.assertEqual(result["reports"]["stage0"]["raw_item_count"], 1)
+            self.assertTrue((out_dir / "2026-06-04" / "blog_markdown.md").exists())
+
+    def test_run_daily_report_live_llm_uses_env_config_in_dry_run(self):
+        class FakeLlmClient:
+            def __init__(self):
+                self.prompts = []
+
+            def chat(self, prompt):
+                self.prompts.append(prompt)
+                if "duplicate_groups" in prompt:
+                    return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+                if "rewrites" in prompt:
+                    payload = json.loads(prompt)
+                    return json.dumps(
+                        {
+                            "rewrites": [
+                                {
+                                    "id": item["id"],
+                                    "title": item["title_raw"],
+                                    "summary": item["summary_raw"],
+                                    "flags": [],
+                                }
+                                for item in payload["items"]
+                            ]
+                        }
+                    )
+                return json.dumps(
+                    {
+                        "theme": "模型能力继续进入产品入口。",
+                        "threads": [
+                            {
+                                "title": "模型 API 更新",
+                                "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。",
+                                "item_ids": [json.loads(prompt)["items"][0]["id"]],
+                                "kind": "thread",
+                            }
+                        ],
+                    }
+                )
+
+        fake_client = FakeLlmClient()
+        captured_config = {}
+
+        def llm_client_factory(**config):
+            captured_config.update(config)
+            return fake_client
+
+        with TemporaryDirectory() as temp_dir:
+            result = run_daily_report(
+                run_date="2026-06-04",
+                mode="dry-run",
+                source_mode="mock",
+                llm_mode="live",
+                out_dir=Path(temp_dir),
+                base_url="https://blog.example",
+                env={
+                    "LLM_API_KEY": "test-key",
+                    "LLM_BASE_URL": "https://llm.example/v1",
+                    "LLM_MODEL": "test-model",
+                },
+                llm_client_factory=llm_client_factory,
+            )
+
+        self.assertEqual(captured_config["api_key"], "test-key")
+        self.assertEqual(captured_config["base_url"], "https://llm.example/v1")
+        self.assertEqual(captured_config["model"], "test-model")
+        self.assertGreaterEqual(len(fake_client.prompts), 2)
+        self.assertEqual(result["reports"]["stage8"]["status"], "ok")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_source_labels.py b/tests/test_source_labels.py
new file mode 100644
index 0000000..9652691
--- /dev/null
+++ b/tests/test_source_labels.py
@@ -0,0 +1,55 @@
+import unittest
+
+from ai_daily_report.models import SourceConfig
+from ai_daily_report.sources.juya import parse_juya_rss
+from ai_daily_report.sources.labels import source_label_from_url
+
+
+class SourceLabelTests(unittest.TestCase):
+    def test_source_label_from_x_url_includes_handle(self):
+        self.assertEqual(
+            source_label_from_url("https://x.com/MiniMax_AI/status/123", fallback="橘鸦AI早报"),
+            "X:MiniMax (@MiniMax_AI)",
+        )
+
+    def test_source_label_from_blog_url_marks_blog(self):
+        self.assertEqual(
+            source_label_from_url("https://openai.com/blog/example", fallback="橘鸦AI早报"),
+            "OpenAI:Blog",
+        )
+
+    def test_source_label_from_known_non_blog_domains(self):
+        self.assertEqual(
+            source_label_from_url("https://mp.weixin.qq.com/s/example", fallback="橘鸦AI早报"),
+            "微信公众号",
+        )
+        self.assertEqual(
+            source_label_from_url("https://platform.minimaxi.com/docs/token-plan/migration", fallback="橘鸦AI早报"),
+            "MiniMax:Docs",
+        )
+
+    def test_parse_juya_rss_uses_item_url_as_source_label(self):
+        config = SourceConfig(name="橘鸦AI早报", type="juya_rss", url="https://juya.example/rss")
+        xml = """
+
+
+ -
+ 2026-06-04
+ MiniMax M3 加速
#1
+ MiniMax M3 加速。
+ 来源
+
+ ]]>
+
+
+"""
+        # NOTE(review): the XML fixture above lost its tags and source URL in extraction - restore from the upstream commit.
+        items = parse_juya_rss(config, xml, "2026-06-04")
+
+        self.assertEqual(items[0]["source_label"], "X:MiniMax (@MiniMax_AI)")
+        self.assertNotEqual(items[0]["source_label"], "橘鸦AI早报")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_stage0_collect.py b/tests/test_stage0_collect.py
new file mode 100644
index 0000000..7d31c20
--- /dev/null
+++ b/tests/test_stage0_collect.py
@@ -0,0 +1,49 @@
+import unittest
+
+from ai_daily_report.collect import collect_sources
+from ai_daily_report.models import SourceConfig
+
+
+class Stage0CollectTests(unittest.TestCase):
+    def test_collect_sources_returns_structured_results_for_each_source(self):
+        configs = [
+            SourceConfig(name="Primary", type="fake", role="primary", priority=10),
+            SourceConfig(name="Supplement", type="fake", role="supplement", priority=20),
+        ]
+
+        def fetcher(config, run_date):
+            return [{"title_raw": f"{config.name} item", "url": f"https://example.com/{config.name}"}]
+
+        results, report = collect_sources(configs, "2026-06-04", fetcher=fetcher)
+
+        self.assertEqual([r.source for r in results], ["Primary", "Supplement"])
+        self.assertTrue(all(r.ok for r in results))
+        self.assertEqual(sum(len(r.items) for r in results), 2)
+        self.assertEqual(report["input_source_count"], 2)
+        self.assertEqual(report["ok_source_count"], 2)
+        self.assertEqual(report["raw_item_count"], 2)
+
+    def test_collect_sources_records_failed_source_without_blocking_others(self):
+        configs = [
+            SourceConfig(name="Broken", type="fake", role="supplement", priority=20),
+            SourceConfig(name="Healthy", type="fake", role="supplement", priority=30),
+        ]
+
+        def fetcher(config, run_date):
+            if config.name == "Broken":
+                raise TimeoutError("timed out")
+            return [{"title_raw": "healthy item", "url": "https://example.com/healthy"}]
+
+        results, report = collect_sources(configs, "2026-06-04", fetcher=fetcher)
+        # Index results by source name so the assertions do not depend on result order.
+        by_source = {r.source: r for r in results}
+        self.assertFalse(by_source["Broken"].ok)
+        self.assertEqual(by_source["Broken"].status, "timeout")
+        self.assertIn("TimeoutError", by_source["Broken"].error)
+        self.assertTrue(by_source["Healthy"].ok)
+        self.assertEqual(report["failed_source_count"], 1)
+        self.assertEqual(report["raw_item_count"], 1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_stage0_to_2_pipeline.py b/tests/test_stage0_to_2_pipeline.py
new file mode 100644
index 0000000..03469e3
--- /dev/null
+++ b/tests/test_stage0_to_2_pipeline.py
@@ -0,0 +1,32 @@
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage2
+
+
+class Stage0To2PipelineTests(unittest.TestCase):
+    def test_run_stage0_to_stage2_returns_deduped_items_and_reports(self):
+        configs = [
+            {"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10},
+            {"name": "RSS", "type": "fake", "role": "supplement", "priority": 50},
+        ]
+        # Both sources return the same title and URL, so one of the two items should be removed.
+        def fetcher(config, run_date):
+            return [
+                {
+                    "title_raw": "OpenAI 发布 GPT-5",
+                    "summary_raw": f"{config.name} summary",
+                    "url": "https://openai.com/blog/gpt-5?utm_source=test",
+                    "source_label": config.name,
+                }
+            ]
+
+        result = run_stage0_to_stage2(configs, "2026-06-04", fetcher=fetcher)
+
+        self.assertEqual(len(result["items"]), 1)
+        self.assertEqual(result["reports"]["stage0"]["raw_item_count"], 2)
+        self.assertEqual(result["reports"]["stage1"]["output_count"], 2)
+        self.assertEqual(result["reports"]["stage2"]["removed_count"], 1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_stage0_to_4_pipeline.py b/tests/test_stage0_to_4_pipeline.py
new file mode 100644
index 0000000..334c09a
--- /dev/null
+++ b/tests/test_stage0_to_4_pipeline.py
@@ -0,0 +1,66 @@
+import json
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage4
+
+
+class Stage0To4PipelineTests(unittest.TestCase):
+ def test_run_stage0_to_stage4_semantic_dedupes_and_rewrites(self):
+ configs = [
+ {"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10},
+ {"name": "RSS", "type": "fake", "role": "supplement", "priority": 50},
+ ]
+
+ def fetcher(config, run_date):
+ return [
+ {
+ "title_raw": f"{config.name} Anthropic IPO",
+ "summary_raw": f"{config.name} reports Anthropic IPO filing.",
+ "url": f"https://example.com/{config.name}",
+ "source_label": config.name,
+ }
+ ]
+
+ def semantic_llm_call(prompt):
+ return json.dumps(
+ {
+ "duplicate_groups": [],
+ "not_duplicates": [],
+ "uncertain": [],
+ }
+ )
+
+ def rewrite_llm_call(prompt):
+ payload = json.loads(prompt)
+ return json.dumps(
+ {
+ "rewrites": [
+ {
+ "id": entry["id"],
+ "title": "Anthropic 提交 IPO 文件",
+ "summary": "Anthropic 被报道提交 IPO 文件。",
+ "flags": [],
+ }
+ for entry in payload["items"]
+ ]
+ },
+ ensure_ascii=False,
+ )
+
+ result = run_stage0_to_stage4(
+ configs,
+ "2026-06-04",
+ fetcher=fetcher,
+ semantic_llm_call=semantic_llm_call,
+ rewrite_llm_call=rewrite_llm_call,
+ )
+
+ self.assertEqual(len(result["items"]), 2)
+ self.assertEqual(result["items"][0].title, "Anthropic 提交 IPO 文件")
+ self.assertIn("stage3", result["reports"])
+ self.assertIn("stage4", result["reports"])
+ self.assertEqual(result["reports"]["stage4"]["rewritten_count"], 2)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage0_to_5_pipeline.py b/tests/test_stage0_to_5_pipeline.py
new file mode 100644
index 0000000..2df7038
--- /dev/null
+++ b/tests/test_stage0_to_5_pipeline.py
@@ -0,0 +1,62 @@
+import json
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage5
+
+
+class Stage0To5PipelineTests(unittest.TestCase):
+ def test_run_stage0_to_stage5_classifies_and_orders_items(self):
+ configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
+
+ def fetcher(config, run_date):
+ return [
+ {
+ "title_raw": "Anthropic 提交 IPO 文件",
+ "summary_raw": "Anthropic 被报道提交 IPO 文件。",
+ "url": "https://example.com/ipo",
+ "source_label": config.name,
+ },
+ {
+ "title_raw": "GPT-5 API 发布,延迟降低 30%",
+ "summary_raw": "OpenAI 发布 GPT-5 API。",
+ "url": "https://example.com/gpt5",
+ "source_label": config.name,
+ "section_hint": "模型发布/更新",
+ },
+ ]
+
+ def semantic_llm_call(prompt):
+ return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+
+ def rewrite_llm_call(prompt):
+ payload = json.loads(prompt)
+ return json.dumps(
+ {
+ "rewrites": [
+ {
+ "id": entry["id"],
+ "title": entry["title_raw"],
+ "summary": entry["summary_raw"],
+ "flags": [],
+ }
+ for entry in payload["items"]
+ ]
+ },
+ ensure_ascii=False,
+ )
+
+ result = run_stage0_to_stage5(
+ configs,
+ "2026-06-04",
+ fetcher=fetcher,
+ semantic_llm_call=semantic_llm_call,
+ rewrite_llm_call=rewrite_llm_call,
+ )
+
+ self.assertEqual([item.section for item in result["items"]], ["模型与能力", "公司与资本"])
+ self.assertEqual(result["reports"]["stage5"]["section_counts"]["模型与能力"], 1)
+ self.assertEqual(result["reports"]["stage5"]["section_counts"]["公司与资本"], 1)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage0_to_6_pipeline.py b/tests/test_stage0_to_6_pipeline.py
new file mode 100644
index 0000000..4be2807
--- /dev/null
+++ b/tests/test_stage0_to_6_pipeline.py
@@ -0,0 +1,75 @@
+import json
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage6
+
+
+class Stage0To6PipelineTests(unittest.TestCase):
+ def test_run_stage0_to_stage6_generates_guide(self):
+ configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
+
+ def fetcher(config, run_date):
+ return [
+ {
+ "title_raw": "GPT-5 API 发布",
+ "summary_raw": "OpenAI 发布 GPT-5 API。",
+ "url": "https://example.com/gpt5",
+ "source_label": config.name,
+ "section_hint": "模型发布/更新",
+ }
+ ]
+
+ def semantic_llm_call(prompt):
+ return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+
+ def rewrite_llm_call(prompt):
+ payload = json.loads(prompt)
+ return json.dumps(
+ {
+ "rewrites": [
+ {
+ "id": entry["id"],
+ "title": entry["title_raw"],
+ "summary": entry["summary_raw"],
+ "flags": [],
+ }
+ for entry in payload["items"]
+ ]
+ },
+ ensure_ascii=False,
+ )
+
+ def guide_llm_call(prompt):
+ payload = json.loads(prompt)
+ item_id = payload["items"][0]["id"]
+ return json.dumps(
+ {
+ "theme": "模型 API 能力继续更新。",
+ "threads": [
+ {
+ "title": "模型能力更新",
+ "text": "GPT-5 API 发布,体现模型能力继续产品化。",
+ "item_ids": [item_id],
+ "kind": "thread",
+ }
+ ],
+ },
+ ensure_ascii=False,
+ )
+
+ result = run_stage0_to_stage6(
+ configs,
+ "2026-06-04",
+ fetcher=fetcher,
+ semantic_llm_call=semantic_llm_call,
+ rewrite_llm_call=rewrite_llm_call,
+ guide_llm_call=guide_llm_call,
+ )
+
+ self.assertEqual(result["guide"]["theme"], "模型 API 能力继续更新。")
+ self.assertEqual(len(result["guide"]["threads"]), 1)
+ self.assertTrue(result["reports"]["stage6"]["theme_present"])
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage0_to_7_pipeline.py b/tests/test_stage0_to_7_pipeline.py
new file mode 100644
index 0000000..b86e078
--- /dev/null
+++ b/tests/test_stage0_to_7_pipeline.py
@@ -0,0 +1,76 @@
+import json
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage7
+
+
+class Stage0To7PipelineTests(unittest.TestCase):
+ def test_run_stage0_to_stage7_assembles_markdown(self):
+ configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
+
+ def fetcher(config, run_date):
+ return [
+ {
+ "title_raw": "GPT-5 API 发布",
+ "summary_raw": "OpenAI 发布 GPT-5 API。",
+ "url": "https://example.com/gpt5",
+ "source_label": "OpenAI:Blog",
+ "section_hint": "模型发布/更新",
+ }
+ ]
+
+ def semantic_llm_call(prompt):
+ return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+
+ def rewrite_llm_call(prompt):
+ payload = json.loads(prompt)
+ return json.dumps(
+ {
+ "rewrites": [
+ {
+ "id": entry["id"],
+ "title": entry["title_raw"],
+ "summary": entry["summary_raw"],
+ "flags": [],
+ }
+ for entry in payload["items"]
+ ]
+ },
+ ensure_ascii=False,
+ )
+
+ def guide_llm_call(prompt):
+ payload = json.loads(prompt)
+ item_id = payload["items"][0]["id"]
+ return json.dumps(
+ {
+ "theme": "模型 API 能力继续更新。",
+ "threads": [
+ {
+ "title": "模型能力产品化",
+ "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。",
+ "item_ids": [item_id],
+ "kind": "thread",
+ }
+ ],
+ },
+ ensure_ascii=False,
+ )
+
+ result = run_stage0_to_stage7(
+ configs,
+ "2026-06-04",
+ fetcher=fetcher,
+ semantic_llm_call=semantic_llm_call,
+ rewrite_llm_call=rewrite_llm_call,
+ guide_llm_call=guide_llm_call,
+ )
+
+ self.assertIn("## 导览", result["markdown"])
+ self.assertIn("## 模型与能力", result["markdown"])
+ self.assertIn("## 今日脉络", result["markdown"])
+ self.assertEqual(result["reports"]["stage7"]["blocking_errors"], [])
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage0_to_8_pipeline.py b/tests/test_stage0_to_8_pipeline.py
new file mode 100644
index 0000000..a81861c
--- /dev/null
+++ b/tests/test_stage0_to_8_pipeline.py
@@ -0,0 +1,79 @@
+import json
+import unittest
+
+from ai_daily_report.pipeline import run_stage0_to_stage8
+
+
+class Stage0To8PipelineTests(unittest.TestCase):
+ def test_run_stage0_to_stage8_dry_run_publishes_report(self):
+ configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]
+
+ def fetcher(config, run_date):
+ return [
+ {
+ "title_raw": "GPT-5 API 发布",
+ "summary_raw": "OpenAI 发布 GPT-5 API。",
+ "url": "https://example.com/gpt5",
+ "source_label": "OpenAI:Blog",
+ "section_hint": "模型发布/更新",
+ }
+ ]
+
+ def semantic_llm_call(prompt):
+ return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
+
+ def rewrite_llm_call(prompt):
+ payload = json.loads(prompt)
+ return json.dumps(
+ {
+ "rewrites": [
+ {
+ "id": entry["id"],
+ "title": entry["title_raw"],
+ "summary": entry["summary_raw"],
+ "flags": [],
+ }
+ for entry in payload["items"]
+ ]
+ },
+ ensure_ascii=False,
+ )
+
+ def guide_llm_call(prompt):
+ payload = json.loads(prompt)
+ item_id = payload["items"][0]["id"]
+ return json.dumps(
+ {
+ "theme": "模型 API 能力继续更新。",
+ "threads": [
+ {
+ "title": "模型能力产品化",
+ "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。",
+ "item_ids": [item_id],
+ "kind": "thread",
+ }
+ ],
+ },
+ ensure_ascii=False,
+ )
+
+ result = run_stage0_to_stage8(
+ configs,
+ "2026-06-04",
+ fetcher=fetcher,
+ semantic_llm_call=semantic_llm_call,
+ rewrite_llm_call=rewrite_llm_call,
+ guide_llm_call=guide_llm_call,
+ mode="dry-run",
+ base_url="https://blog.example",
+ client=None,
+ )
+
+ self.assertEqual(result["publish"].status, "ok")
+ self.assertEqual(result["publish"].blog_url, "https://blog.example/posts/ai-2026-06-04")
+ self.assertIn("stage8", result["reports"])
+ self.assertEqual(result["reports"]["stage8"]["status"], "ok")
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage1_normalize.py b/tests/test_stage1_normalize.py
new file mode 100644
index 0000000..d75358a
--- /dev/null
+++ b/tests/test_stage1_normalize.py
@@ -0,0 +1,85 @@
+import unittest
+
+from ai_daily_report.models import SourceResult
+from ai_daily_report.normalize import canonicalize_url, normalize_items, normalize_title
+
+
+class Stage1NormalizeTests(unittest.TestCase):
+ def test_canonicalize_url_removes_tracking_and_normalizes_x_host(self):
+ url = "HTTPS://Twitter.com/OpenAI/status/123/?utm_source=newsletter&fbclid=abc#fragment"
+
+ self.assertEqual(canonicalize_url(url), "https://x.com/OpenAI/status/123")
+
+ def test_normalize_items_builds_news_items_with_ids_and_norms(self):
+ source_result = SourceResult(
+ source="AI HOT",
+ role="primary",
+ ok=True,
+ status="ok",
+ items=[
+ {
+ "title_raw": " GPT-5 发布:速度提升 2x! ",
+ "summary_raw": " OpenAI 发布更新。\n",
+ "url": "https://openai.com/blog/gpt-5?utm_campaign=test",
+ "source_label": "OpenAI:Blog",
+ "section_hint": "模型发布/更新",
+ }
+ ],
+ )
+
+ items, report = normalize_items([source_result], run_date="2026-06-04")
+
+ self.assertEqual(len(items), 1)
+ self.assertTrue(items[0].id.startswith("item_"))
+ self.assertEqual(items[0].canonical_url, "https://openai.com/blog/gpt-5")
+ self.assertEqual(items[0].title_norm, normalize_title("GPT-5 发布:速度提升 2x!"))
+ self.assertEqual(items[0].summary_raw, "OpenAI 发布更新。")
+ self.assertEqual(items[0].source_role, "primary")
+ self.assertEqual(report["input_count"], 1)
+ self.assertEqual(report["output_count"], 1)
+
+ def test_normalize_items_marks_quality_flags_without_dropping_item(self):
+ source_result = SourceResult(
+ source="RSS",
+ role="supplement",
+ ok=True,
+ status="ok",
+ items=[{"title_raw": "短", "summary_raw": "", "url": ""}],
+ )
+
+ items, report = normalize_items([source_result], run_date="2026-06-04")
+
+ self.assertEqual(len(items), 1)
+ self.assertIn("missing_url", items[0].quality_flags)
+ self.assertIn("missing_summary", items[0].quality_flags)
+ self.assertIn("short_title", items[0].quality_flags)
+ self.assertEqual(report["quality_flag_counts"]["missing_url"], 1)
+
+ def test_normalize_items_keeps_ids_unique_for_same_canonical_url(self):
+ source_result = SourceResult(
+ source="AI HOT",
+ role="primary",
+ ok=True,
+ status="ok",
+ items=[
+ {
+ "title_raw": "OpenAI 发布 GPT-5",
+ "summary_raw": "summary a",
+ "url": "https://example.com/news?utm_source=a",
+ },
+ {
+ "title_raw": "OpenAI 发布 GPT-5",
+ "summary_raw": "summary b",
+ "url": "https://example.com/news",
+ },
+ ],
+ )
+
+ items, _ = normalize_items([source_result], run_date="2026-06-04")
+
+ self.assertEqual(len({item.id for item in items}), 2)
+ self.assertEqual(items[0].canonical_url, items[1].canonical_url)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage2_dedupe.py b/tests/test_stage2_dedupe.py
new file mode 100644
index 0000000..0809889
--- /dev/null
+++ b/tests/test_stage2_dedupe.py
@@ -0,0 +1,63 @@
+import unittest
+
+from ai_daily_report.dedupe import hard_dedup_items
+from ai_daily_report.models import NewsItem
+
+
+def item(
+ item_id,
+ title,
+ title_norm,
+ url,
+ canonical_url,
+ source_group="AI HOT",
+ source_label="AI HOT",
+ source_priority=100,
+ summary="summary",
+):
+ return NewsItem(
+ id=item_id,
+ source_group=source_group,
+ source_label=source_label,
+ source_role="primary" if source_group == "AI HOT" else "supplement",
+ source_priority=source_priority,
+ title_raw=title,
+ title_norm=title_norm,
+ summary_raw=summary,
+ url=url,
+ canonical_url=canonical_url,
+ )
+
+
+class Stage2DedupeTests(unittest.TestCase):
+ def test_hard_dedup_merges_same_canonical_url_and_keeps_better_item(self):
+ items = [
+ item("a", "OpenAI 发布 GPT-5", "openai发布gpt5", "https://example.com/a?utm_source=x", "https://example.com/a", source_group="RSS", source_priority=50, summary="short"),
+ item("b", "OpenAI 发布 GPT-5", "openai发布gpt5", "https://example.com/a", "https://example.com/a", source_group="AI HOT", source_priority=10, summary="longer summary"),
+ ]
+
+ deduped, report = hard_dedup_items(items)
+
+ self.assertEqual([i.id for i in deduped], ["b"])
+ self.assertEqual(report["input_count"], 2)
+ self.assertEqual(report["output_count"], 1)
+ self.assertEqual(report["removed_count"], 1)
+ self.assertEqual(report["groups"][0]["reason"], "same_canonical_url")
+ self.assertEqual(deduped[0].duplicate_sources[0]["source_group"], "RSS")
+
+ def test_hard_dedup_marks_similar_titles_without_removing(self):
+ items = [
+ item("a", "Grok API 上线 Cloudflare Gateway", "grokapi上线cloudflaregateway", "https://x.com/a", "https://x.com/a"),
+ item("b", "Grok 模型登陆 Cloudflare AI Gateway", "grok模型登陆cloudflareaigateway", "https://x.com/b", "https://x.com/b"),
+ ]
+
+ deduped, report = hard_dedup_items(items)
+
+ self.assertEqual(len(deduped), 2)
+ self.assertEqual(report["removed_count"], 0)
+ self.assertEqual(len(report["possible_duplicates"]), 1)
+ self.assertEqual(set(report["possible_duplicates"][0]["item_ids"]), {"a", "b"})
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage3_semantic_dedupe.py b/tests/test_stage3_semantic_dedupe.py
new file mode 100644
index 0000000..ed876a5
--- /dev/null
+++ b/tests/test_stage3_semantic_dedupe.py
@@ -0,0 +1,129 @@
+import json
+import unittest
+
+from ai_daily_report.models import NewsItem
+from ai_daily_report.semantic_dedupe import semantic_dedup_items
+
+
+def news_item(item_id, title, source_group="AI HOT"):
+ return NewsItem(
+ id=item_id,
+ source_group=source_group,
+ source_label=source_group,
+ source_role="primary" if source_group == "AI HOT" else "supplement",
+ source_priority=10 if source_group == "AI HOT" else 50,
+ title_raw=title,
+ title_norm=title.lower(),
+ summary_raw=f"{title} summary",
+ url=f"https://example.com/{item_id}",
+ canonical_url=f"https://example.com/{item_id}",
+ )
+
+
+class Stage3SemanticDedupeTests(unittest.TestCase):
+ def test_semantic_dedup_removes_only_high_confidence_duplicates(self):
+ items = [
+ news_item("a", "Anthropic 提交 IPO 招股书", "AI HOT"),
+ news_item("b", "刚刚,Anthropic 提交了招股书", "量子位"),
+ news_item("c", "Grok 上线 Cloudflare Gateway", "AI HOT"),
+ ]
+ candidates = [{"item_ids": ["a", "b"], "reason": "title_similarity"}]
+
+ def llm_call(prompt):
+ return json.dumps(
+ {
+ "duplicate_groups": [
+ {
+ "keep_id": "a",
+ "remove_ids": ["b"],
+ "confidence": "high",
+ "reason": "same IPO filing event",
+ }
+ ],
+ "not_duplicates": [],
+ "uncertain": [],
+ }
+ )
+
+ deduped, report = semantic_dedup_items(items, candidates, llm_call=llm_call)
+
+ self.assertEqual([item.id for item in deduped], ["a", "c"])
+ self.assertEqual(report["removed_count"], 1)
+ self.assertEqual(report["duplicate_groups"][0]["reason"], "same IPO filing event")
+ self.assertEqual(deduped[0].duplicate_sources[0]["id"], "b")
+
+ def test_semantic_dedup_skips_deletion_when_ratio_exceeds_limit(self):
+ items = [
+ news_item("a", "A"),
+ news_item("b", "B"),
+ news_item("c", "C"),
+ ]
+ candidates = [{"item_ids": ["a", "b", "c"], "reason": "llm_candidate"}]
+
+ def llm_call(prompt):
+ return json.dumps(
+ {
+ "duplicate_groups": [
+ {
+ "keep_id": "a",
+ "remove_ids": ["b", "c"],
+ "confidence": "high",
+ "reason": "too broad",
+ }
+ ],
+ "not_duplicates": [],
+ "uncertain": [],
+ }
+ )
+
+ deduped, report = semantic_dedup_items(
+ items,
+ candidates,
+ llm_call=llm_call,
+ max_deletion_ratio=0.5,
+ )
+
+ self.assertEqual(len(deduped), 3)
+ self.assertEqual(report["removed_count"], 0)
+ self.assertTrue(report["skipped_for_deletion_ratio"])
+
+ def test_semantic_dedup_ignores_groups_outside_candidate_sets(self):
+ items = [
+ news_item("a", "Suno 完成融资"),
+ news_item("b", "Suno 完成 D 轮融资"),
+ news_item("c", "Ideogram 发布 v4"),
+ news_item("d", "OpenClaw 发布新版"),
+ ]
+ candidates = [{"item_ids": ["a", "b"], "reason": "title_similarity"}]
+
+ def llm_call(prompt):
+ return json.dumps(
+ {
+ "duplicate_groups": [
+ {
+ "keep_id": "a",
+ "remove_ids": ["b"],
+ "confidence": "high",
+ "reason": "same Suno event",
+ },
+ {
+ "keep_id": "c",
+ "remove_ids": ["d"],
+ "confidence": "high",
+ "reason": "not part of candidates",
+ },
+ ],
+ "not_duplicates": [],
+ "uncertain": [],
+ }
+ )
+
+ deduped, report = semantic_dedup_items(items, candidates, llm_call=llm_call)
+
+ self.assertEqual([item.id for item in deduped], ["a", "c", "d"])
+ self.assertEqual(report["removed_count"], 1)
+ self.assertIn("group_outside_candidates", report["errors"][0])
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage4_rewrite.py b/tests/test_stage4_rewrite.py
new file mode 100644
index 0000000..62ef346
--- /dev/null
+++ b/tests/test_stage4_rewrite.py
@@ -0,0 +1,96 @@
+import json
+import unittest
+
+from ai_daily_report.models import NewsItem
+from ai_daily_report.rewrite import rewrite_items
+
+
+def news_item(item_id="a"):
+ return NewsItem(
+ id=item_id,
+ source_group="AI HOT",
+ source_label="AI HOT",
+ source_role="primary",
+ source_priority=10,
+ title_raw="OpenAI launches GPT-5 API",
+ title_norm="openailaunchesgpt5api",
+ summary_raw="OpenAI launched the GPT-5 API with better latency.",
+ url="https://example.com/a",
+ canonical_url="https://example.com/a",
+ )
+
+
+class Stage4RewriteTests(unittest.TestCase):
+ def test_rewrite_items_writes_display_fields_without_overwriting_raw(self):
+ items = [news_item("a")]
+
+ def llm_call(prompt):
+ return json.dumps(
+ {
+ "rewrites": [
+ {
+ "id": "a",
+ "title": "OpenAI 发布 GPT-5 API",
+ "summary": "OpenAI 发布 GPT-5 API,延迟表现更好。",
+ "flags": [],
+ }
+ ]
+ },
+ ensure_ascii=False,
+ )
+
+ rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=10)
+
+ self.assertEqual(rewritten[0].title, "OpenAI 发布 GPT-5 API")
+ self.assertEqual(rewritten[0].summary, "OpenAI 发布 GPT-5 API,延迟表现更好。")
+ self.assertEqual(rewritten[0].title_raw, "OpenAI launches GPT-5 API")
+ self.assertEqual(report["rewritten_count"], 1)
+ self.assertEqual(report["fallback_count"], 0)
+
+ def test_rewrite_items_falls_back_when_llm_fails(self):
+ items = [news_item("a")]
+
+ def llm_call(prompt):
+ raise TimeoutError("slow")
+
+ rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=10)
+
+ self.assertEqual(rewritten[0].title, "OpenAI launches GPT-5 API")
+ self.assertEqual(rewritten[0].summary, "OpenAI launched the GPT-5 API with better latency.")
+ self.assertEqual(report["rewritten_count"], 0)
+ self.assertEqual(report["fallback_count"], 1)
+ self.assertIn("TimeoutError", report["errors"][0])
+
+ def test_rewrite_items_retries_failed_batch_as_single_items(self):
+ items = [news_item("a"), news_item("b")]
+ calls = []
+
+ def llm_call(prompt):
+ payload = json.loads(prompt)
+ ids = [item["id"] for item in payload["items"]]
+ calls.append(ids)
+ if len(ids) > 1:
+ return "not json"
+ return json.dumps(
+ {
+ "rewrites": [
+ {
+ "id": ids[0],
+ "title": f"title {ids[0]}",
+ "summary": f"summary {ids[0]}",
+ "flags": [],
+ }
+ ]
+ }
+ )
+
+ rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=2)
+
+ self.assertEqual([item.title for item in rewritten], ["title a", "title b"])
+ self.assertEqual(report["rewritten_count"], 2)
+ self.assertEqual(report["fallback_count"], 0)
+ self.assertEqual(calls, [["a", "b"], ["a"], ["b"]])
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage5_classify.py b/tests/test_stage5_classify.py
new file mode 100644
index 0000000..a158ca3
--- /dev/null
+++ b/tests/test_stage5_classify.py
@@ -0,0 +1,61 @@
+import unittest
+
+from ai_daily_report.classify import SECTION_ORDER, classify_and_order_items
+from ai_daily_report.models import NewsItem
+
+
+def news_item(item_id, title, summary="", section_hint="", source_priority=50):
+ return NewsItem(
+ id=item_id,
+ source_group="AI HOT",
+ source_label="AI HOT",
+ source_role="primary",
+ source_priority=source_priority,
+ title_raw=title,
+ title_norm=title.lower(),
+ summary_raw=summary or f"{title} summary",
+ title=title,
+ summary=summary or f"{title} summary",
+ url=f"https://example.com/{item_id}",
+ canonical_url=f"https://example.com/{item_id}",
+ section_hint=section_hint,
+ )
+
+
+class Stage5ClassifyTests(unittest.TestCase):
+ def test_classify_maps_legacy_section_hints_to_new_sections(self):
+ items = [news_item("a", "GPT-5 发布", section_hint="模型发布/更新")]
+
+ classified, report = classify_and_order_items(items)
+
+ self.assertEqual(classified[0].section, "模型与能力")
+ self.assertEqual(report["hint_classified"], 1)
+ self.assertIn("模型与能力", SECTION_ORDER)
+
+ def test_classify_uses_rules_when_hint_is_missing(self):
+ items = [
+ news_item("a", "Anthropic 提交 IPO 文件", summary="Anthropic 计划上市并提交文件。"),
+ news_item("b", "MCP SDK 发布新版", summary="开发者可用新版 SDK 构建工具。"),
+ ]
+
+ classified, report = classify_and_order_items(items)
+ by_id = {item.id: item for item in classified}
+
+ self.assertEqual(by_id["a"].section, "公司与资本")
+ self.assertEqual(by_id["b"].section, "开发与基础设施")
+ self.assertEqual(report["rule_classified"], 2)
+
+ def test_classify_orders_items_by_local_rank_score_within_sections(self):
+ items = [
+ news_item("low", "普通模型更新", section_hint="模型发布/更新", source_priority=80),
+ news_item("high", "GPT-5 API 发布,延迟降低 30%", section_hint="模型发布/更新", source_priority=10),
+ ]
+
+ classified, report = classify_and_order_items(items)
+
+ self.assertEqual([item.id for item in classified], ["high", "low"])
+ self.assertEqual(report["section_counts"]["模型与能力"], 2)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage6_guide.py b/tests/test_stage6_guide.py
new file mode 100644
index 0000000..4399c4b
--- /dev/null
+++ b/tests/test_stage6_guide.py
@@ -0,0 +1,77 @@
+import json
+import unittest
+
+from ai_daily_report.guide import generate_guide
+from ai_daily_report.models import NewsItem
+
+
+def news_item(item_id, title, section="模型与能力"):
+ return NewsItem(
+ id=item_id,
+ source_group="AI HOT",
+ source_label="AI HOT",
+ source_role="primary",
+ source_priority=10,
+ title_raw=title,
+ title_norm=title.lower(),
+ summary_raw=f"{title} summary",
+ title=title,
+ summary=f"{title} summary",
+ url=f"https://example.com/{item_id}",
+ canonical_url=f"https://example.com/{item_id}",
+ section=section,
+ )
+
+
+class Stage6GuideTests(unittest.TestCase):
+ def test_generate_guide_returns_theme_and_valid_threads(self):
+ items = [
+ news_item("a", "GPT-5 API 发布"),
+ news_item("b", "Miso One 开源语音模型"),
+ ]
+
+ def llm_call(prompt):
+ return json.dumps(
+ {
+ "theme": "模型能力继续向 API 和实时语音两端推进。",
+ "threads": [
+ {
+ "title": "模型能力继续推进",
+ "text": "GPT-5 API 和 Miso One 分别代表 API 能力和语音模型更新。",
+ "item_ids": ["a", "b"],
+ "kind": "thread",
+ },
+ {
+ "title": "无效脉络",
+ "text": "这条引用了不存在的条目。",
+ "item_ids": ["missing"],
+ "kind": "thread",
+ },
+ ],
+ },
+ ensure_ascii=False,
+ )
+
+ guide, report = generate_guide(items, llm_call=llm_call)
+
+ self.assertEqual(guide["theme"], "模型能力继续向 API 和实时语音两端推进。")
+ self.assertEqual(len(guide["threads"]), 1)
+ self.assertEqual(guide["threads"][0]["item_ids"], ["a", "b"])
+ self.assertEqual(report["dropped_thread_count"], 1)
+
+ def test_generate_guide_falls_back_when_llm_fails(self):
+ items = [news_item("a", "GPT-5 API 发布")]
+
+ def llm_call(prompt):
+ raise TimeoutError("slow")
+
+ guide, report = generate_guide(items, llm_call=llm_call)
+
+ self.assertEqual(guide["theme"], "")
+ self.assertEqual(guide["threads"], [])
+ self.assertTrue(report["fallback_used"])
+ self.assertIn("TimeoutError", report["errors"][0])
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage7_assemble.py b/tests/test_stage7_assemble.py
new file mode 100644
index 0000000..e79b7e1
--- /dev/null
+++ b/tests/test_stage7_assemble.py
@@ -0,0 +1,65 @@
+import unittest
+
+from ai_daily_report.assemble import assemble_markdown, validate_markdown
+from ai_daily_report.models import NewsItem
+
+
+def news_item(item_id, title, section):
+ return NewsItem(
+ id=item_id,
+ source_group="AI HOT",
+ source_label="OpenAI:Blog",
+ source_role="primary",
+ source_priority=10,
+ title_raw=title,
+ title_norm=title.lower(),
+ summary_raw=f"{title} summary",
+ title=title,
+ summary=f"{title} summary",
+ url=f"https://example.com/{item_id}",
+ canonical_url=f"https://example.com/{item_id}",
+ section=section,
+ )
+
+
+class Stage7AssembleTests(unittest.TestCase):
+ def test_assemble_markdown_renders_sections_and_daily_threads(self):
+ items = [
+ news_item("a", "GPT-5 API 发布", "模型与能力"),
+ news_item("b", "Anthropic 提交 IPO 文件", "公司与资本"),
+ ]
+ guide = {
+ "theme": "> 模型和资本两条线都在推进。[1]",
+ "threads": [
+ {
+ "title": "模型能力产品化",
+ "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。",
+ "item_ids": ["a"],
+ "kind": "thread",
+ }
+ ],
+ }
+
+ md, report = assemble_markdown(items, guide)
+
+ self.assertIn("## 导览", md)
+ self.assertIn("> 模型和资本两条线都在推进。", md)
+ self.assertIn("## 模型与能力", md)
+ self.assertIn("**1. GPT-5 API 发布**", md)
+ self.assertIn("**2. Anthropic 提交 IPO 文件**", md)
+ self.assertIn("## 今日脉络", md)
+ self.assertIn("- **模型能力产品化**", md)
+ self.assertNotIn("> >", md)
+ self.assertNotIn("[1]", md)
+ self.assertEqual(report["item_count"], 2)
+ self.assertEqual(report["blocking_errors"], [])
+
+ def test_validate_markdown_blocks_empty_report(self):
+ report = validate_markdown("", [])
+
+ self.assertIn("no_items", report["blocking_errors"])
+ self.assertIn("markdown_too_short", report["blocking_errors"])
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_stage8_publish.py b/tests/test_stage8_publish.py
new file mode 100644
index 0000000..0f7e342
--- /dev/null
+++ b/tests/test_stage8_publish.py
@@ -0,0 +1,76 @@
+import unittest
+
+from ai_daily_report.publish import publish_markdown
+
+
+class FakeBlogClient:
+ def __init__(self):
+ self.created_payload = None
+ self.published_slug = None
+
+ def create_post(self, payload):
+ self.created_payload = payload
+ return {"slug": "ai-2026-06-04"}
+
+ def publish_post(self, slug):
+ self.published_slug = slug
+
+
+class Stage8PublishTests(unittest.TestCase):
+ def test_publish_markdown_dry_run_does_not_call_client(self):
+ result = publish_markdown(
+ title="AI日报 · 2026-06-04",
+ markdown="## 导览\n\n> ok",
+ tags=["AI日报"],
+ slug="ai-2026-06-04",
+ base_url="https://blog.example",
+ mode="dry-run",
+ markdown_report={"blocking_errors": []},
+ client=None,
+ )
+
+ self.assertEqual(result.status, "ok")
+ self.assertEqual(result.mode, "dry-run")
+ self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04")
+ self.assertTrue(result.public_ok)
+
+ def test_publish_markdown_blocks_when_markdown_has_errors(self):
+ client = FakeBlogClient()
+
+ result = publish_markdown(
+ title="AI日报 · 2026-06-04",
+ markdown="bad",
+ tags=["AI日报"],
+ slug="ai-2026-06-04",
+ base_url="https://blog.example",
+ mode="publish",
+ markdown_report={"blocking_errors": ["markdown_too_short"]},
+ client=client,
+ )
+
+ self.assertEqual(result.status, "blocked")
+ self.assertIsNone(client.created_payload)
+ self.assertIn("markdown_too_short", result.error)
+
+ def test_publish_markdown_publish_mode_calls_client(self):
+ client = FakeBlogClient()
+
+ result = publish_markdown(
+ title="AI日报 · 2026-06-04",
+ markdown="## 导览\n\n> ok",
+ tags=["AI日报"],
+ slug="ai-2026-06-04",
+ base_url="https://blog.example",
+ mode="publish",
+ markdown_report={"blocking_errors": []},
+ client=client,
+ )
+
+ self.assertEqual(result.status, "ok")
+ self.assertEqual(client.created_payload["title"], "AI日报 · 2026-06-04")
+ self.assertEqual(client.published_slug, "ai-2026-06-04")
+ self.assertEqual(result.blog_url, "https://blog.example/posts/ai-2026-06-04")
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_validate.py b/tests/test_validate.py
new file mode 100644
index 0000000..48a42f3
--- /dev/null
+++ b/tests/test_validate.py
@@ -0,0 +1,14 @@
+import unittest
+
+from ai_daily_report.validate import validate_report_markdown
+
+
+class ValidateTests(unittest.TestCase):
+ def test_validate_report_markdown_delegates_markdown_checks(self):
+ report = validate_report_markdown("", [])
+
+ self.assertIn("no_items", report["blocking_errors"])
+
+
+if __name__ == "__main__":
+ unittest.main()