Refactor AI daily report pipeline

This commit is contained in:
Mimikko-zeus
2026-06-04 15:21:56 +08:00
parent 94e18ce22d
commit 5a98696255
64 changed files with 4778 additions and 1316 deletions

9
.gitignore vendored Normal file
View File

@@ -0,0 +1,9 @@
.env
.env.*
!.env.example
__pycache__/
*.py[cod]
.pytest_cache/
runs/
runs-*/
.idea/

View File

@@ -0,0 +1,2 @@
"""Core package for the AI daily report pipeline."""

View File

@@ -0,0 +1,77 @@
from __future__ import annotations
import re
from typing import Any
from .classify import SECTION_ORDER
from .models import NewsItem
from .validate import validate_markdown
# Characters that _ensure_sentence accepts as an existing sentence terminator.
END_PUNCTUATION = "。!?;.!?;"
def _clean_text(text: str) -> str:
    """Strip markdown/LLM noise from *text* and collapse it onto one line.

    Removes, in order: a leading code fence (with optional language tag),
    a trailing code fence, a leading blockquote marker, ``[12]`` / ``[N]``
    citation markers, the "主线判断:" label, and finally squeezes every run
    of whitespace into a single space. ``None`` is treated as empty.
    """
    substitutions = (
        (r"^```(?:\w+)?\s*\n?", ""),
        (r"\n?```\s*$", ""),
        (r"^\s*>\s*", ""),
        (r"\[\d+\]|\[N\]", ""),
        (r"主线判断[:]\s*", ""),
        (r"\s+", " "),
    )
    cleaned = (text or "").strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def _ensure_sentence(text: str) -> str:
    """Return the cleaned *text*, guaranteed to end with sentence punctuation.

    The text is first normalised with ``_clean_text``. If the result is
    non-empty and its last character is not one of ``END_PUNCTUATION``, a
    Chinese full stop is appended. Empty input stays empty.
    """
    value = _clean_text(text)
    if value and value[-1] not in END_PUNCTUATION:
        # The original appended an empty string here, which made this
        # function a no-op; append the terminator the guard is checking for.
        value += "。"
    return value
def _source_link(item: NewsItem) -> str:
    """Render the attribution for *item*.

    The label is the item's ``source_label``, falling back to
    ``source_group`` and then to the literal "来源". When the item has a URL
    the label is wrapped in a markdown link; otherwise the bare label is
    returned.
    """
    label = item.source_label or item.source_group or "来源"
    return f"[{label} ↗]({item.url})" if item.url else label
def assemble_markdown(items: list[NewsItem], guide: dict[str, Any] | None = None) -> tuple[str, dict[str, Any]]:
    """Render the report markdown and validate it.

    Layout, top to bottom:
      1. an optional "导览" block quoting the guide theme,
      2. one heading per section in ``SECTION_ORDER`` that has items, with
         items numbered continuously across sections,
      3. an optional "今日脉络" list built from the guide threads (threads
         whose cleaned title or text is empty are skipped).

    Returns ``(markdown, report)`` where ``report`` is whatever
    ``validate_markdown(markdown, items)`` returns.
    """
    guide = guide or {"theme": "", "threads": []}
    out: list[str] = []

    theme = _clean_text(str(guide.get("theme") or ""))
    if theme:
        out += ["## 导览", "", f"> {theme}", ""]

    counter = 0
    for section in SECTION_ORDER:
        members = [entry for entry in items if entry.section == section]
        if members:
            out += [f"## {section}", ""]
        for entry in members:
            counter += 1
            heading = _clean_text(entry.title or entry.title_raw)
            body = _ensure_sentence(entry.summary or entry.summary_raw or "该条目暂无摘要。")
            out += [f"**{counter}. {heading}**", "", f"> {body}{_source_link(entry)}", ""]

    threads = guide.get("threads", []) or []
    if threads:
        out += ["## 今日脉络", ""]
        for thread in threads:
            heading = _clean_text(str(thread.get("title") or ""))
            body = _ensure_sentence(str(thread.get("text") or ""))
            if heading and body:
                out += [f"- **{heading}**", f" {body}", ""]

    markdown = "\n".join(out).strip()
    return markdown, validate_markdown(markdown, items)

109
ai_daily_report/classify.py Normal file
View File

@@ -0,0 +1,109 @@
from __future__ import annotations
from collections import Counter
from typing import Any
from .models import NewsItem
# Canonical report sections. The list order is the order in which sections
# are sorted by classify_and_order_items.
SECTION_ORDER = [
"模型与能力",
"产品与应用",
"开发与基础设施",
"公司与资本",
"政策与安全",
"论文与研究",
"观点与教程",
"人物与动态",
]
# Alternative section labels mapped to their canonical SECTION_ORDER name;
# consulted by normalize_section_hint when a hint is not already canonical.
SECTION_ALIASES = {
"模型发布/更新": "模型与能力",
"产品发布/更新": "产品与应用",
"产品与工具": "产品与应用",
"开发与工程": "开发与基础设施",
"行业动态": "公司与资本",
"行业与公司": "公司与资本",
"论文研究": "论文与研究",
"论文与研究": "论文与研究",
"技巧与观点": "观点与教程",
"观点与教程": "观点与教程",
"人物与花絮": "人物与动态",
}
# Ordered (section, keywords) rules used by rule_classify: rules are tried top
# to bottom and the first one with a keyword contained in the lowercased
# title + summary wins, so earlier rules take precedence.
RULES = [
("政策与安全", ("监管", "政策", "安全", "风险", "滥用", "攻击", "合规", "版权")),
("论文与研究", ("论文", "研究", "arxiv", "cvpr", "benchmark", "评测", "实验")),
("开发与基础设施", ("sdk", "api", "mcp", "kubernetes", "框架", "开源", "github", "部署", "基础设施")),
("公司与资本", ("融资", "ipo", "上市", "招股书", "合作", "估值", "收购", "资本")),
("模型与能力", ("模型", "gpt", "claude", "gemini", "grok", "token", "参数", "多模态", "语音", "推理")),
("产品与应用", ("agent", "应用", "产品", "平台", "上线", "工具", "智能体")),
("观点与教程", ("教程", "观点", "方法论", "guide", "实践", "技巧")),
("人物与动态", ("黄仁勋", "纳德拉", "访谈", "演讲", "人物")),
]
def normalize_section_hint(section_hint: str) -> str:
    """Map a source-provided section hint to a canonical section name.

    Returns the stripped hint when it already is a ``SECTION_ORDER`` entry,
    its ``SECTION_ALIASES`` translation when one exists, and ``""`` otherwise
    (including for empty/``None`` hints).
    """
    candidate = (section_hint or "").strip()
    return candidate if candidate in SECTION_ORDER else SECTION_ALIASES.get(candidate, "")
def rule_classify(item: NewsItem) -> str:
    """Pick a section for *item* by keyword matching against ``RULES``.

    The rewritten title/summary are preferred over the raw ones. Matching is
    case-insensitive substring search; the first rule with any hit wins and
    "公司与资本" is the default when nothing matches.
    """
    title = item.title or item.title_raw
    summary = item.summary or item.summary_raw
    haystack = f"{title} {summary}".lower()
    for section, keywords in RULES:
        for keyword in keywords:
            if keyword.lower() in haystack:
                return section
    return "公司与资本"
def rank_score(item: NewsItem) -> int:
    """Compute the within-section ranking score of *item* (higher sorts first).

    Components:
      * ``200 - source_priority`` floored at 0,
      * +10 for a "primary" source role,
      * +10 when a canonical URL is present,
      * +10 when the title/summary text contains any digit,
      * +5 per merged duplicate source, capped at +20,
      * -10 per quality flag.
    """
    text = f"{item.title or item.title_raw} {item.summary or item.summary_raw}"
    bonuses = (
        max(0, 200 - item.source_priority),
        10 if item.source_role == "primary" else 0,
        10 if item.canonical_url else 0,
        10 if any(ch.isdigit() for ch in text) else 0,
        min(20, len(item.duplicate_sources) * 5) if item.duplicate_sources else 0,
    )
    penalty = len(item.quality_flags) * 10
    return sum(bonuses) - penalty
def classify_and_order_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, Any]]:
    """Assign a section to every item (in place) and return them sorted.

    Each item's ``section`` is set from its normalised ``section_hint`` when
    that maps to a known section, otherwise from ``rule_classify``. The
    returned list is sorted by section position in ``SECTION_ORDER``, then by
    descending ``rank_score``, then by title.

    Returns ``(ordered_items, report)``; the report carries per-section counts
    and how many items were classified by hint versus by rule.
    """
    by_hint = 0
    by_rule = 0
    for item in items:
        hinted = normalize_section_hint(item.section_hint)
        if hinted:
            item.section = hinted
            by_hint += 1
        else:
            item.section = rule_classify(item)
            by_rule += 1

    position = {name: idx for idx, name in enumerate(SECTION_ORDER)}
    unknown_position = len(SECTION_ORDER)  # unknown sections sort last

    def sort_key(entry: NewsItem) -> tuple[int, int, str]:
        return (
            position.get(entry.section or "", unknown_position),
            -rank_score(entry),
            entry.title or entry.title_raw,
        )

    ordered = sorted(items, key=sort_key)
    per_section = Counter(entry.section for entry in ordered if entry.section)
    invalid = len([entry for entry in ordered if entry.section not in SECTION_ORDER])
    report = {
        "input_count": len(items),
        "section_counts": dict(per_section),
        "hint_classified": by_hint,
        "rule_classified": by_rule,
        "llm_classified": 0,
        "fallback_classified": 0,
        "invalid_section_count": invalid,
    }
    return ordered, report

40
ai_daily_report/cli.py Normal file
View File

@@ -0,0 +1,40 @@
from __future__ import annotations
import argparse
from pathlib import Path
from .runner import run_daily_report
def build_parser() -> argparse.ArgumentParser:
    """Create the ``ai-daily-report`` argument parser.

    The parser exposes a single ``run`` subcommand (stored in
    ``args.command``) whose options select the run date, publish mode,
    source/LLM modes, output directory, blog base URL and an optional
    sources file.
    """
    parser = argparse.ArgumentParser(prog="ai-daily-report")
    run_parser = parser.add_subparsers(dest="command").add_parser("run")
    options = (
        ("--date", {"default": "today"}),
        ("--mode", {"choices": ["dry-run", "draft", "publish"], "default": "dry-run"}),
        ("--source-mode", {"choices": ["mock", "live"], "default": "mock"}),
        ("--llm-mode", {"choices": ["mock", "live"], "default": "mock"}),
        ("--out-dir", {"default": "runs"}),
        ("--base-url", {"default": "https://blog.ephron.ren"}),
        ("--sources-path", {"default": None}),
    )
    for flag, settings in options:
        run_parser.add_argument(flag, **settings)
    return parser
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv* and dispatch the ``run`` subcommand.

    ``argv=None`` lets argparse read ``sys.argv``. Any invocation without the
    ``run`` subcommand does nothing. Always returns exit status 0; errors
    surface as exceptions from ``run_daily_report`` or argparse.
    """
    args = build_parser().parse_args(argv)
    if args.command != "run":
        return 0
    sources_path = Path(args.sources_path) if args.sources_path else None
    run_daily_report(
        run_date=args.date,
        mode=args.mode,
        source_mode=args.source_mode,
        llm_mode=args.llm_mode,
        out_dir=Path(args.out_dir),
        base_url=args.base_url,
        sources_path=sources_path,
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,64 @@
from __future__ import annotations
import json
import urllib.request
from typing import Any
# User-Agent header value sent by fetch_text and by BlogApiClient requests.
UA = "Mozilla/5.0 (compatible; ai-daily-report/1.0)"
def fetch_text(url: str, timeout_seconds: int) -> str:
    """GET *url* and return the response body as text.

    Decoding strategy:
      1. strict UTF-8 (the result is identical to the previous behaviour for
         every body that is valid UTF-8),
      2. otherwise the charset declared in the response's Content-Type
         header, if any and if Python knows it,
      3. otherwise UTF-8 with undecodable bytes dropped.

    Raises whatever ``urllib.request.urlopen`` raises (e.g. ``URLError``,
    ``HTTPError``, timeouts).
    """
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    with urllib.request.urlopen(req, timeout=timeout_seconds) as response:
        body = response.read()
        declared = response.headers.get_content_charset()
    try:
        return body.decode("utf-8")
    except UnicodeDecodeError:
        pass
    if declared and declared not in ("utf-8", "utf8"):
        try:
            # Non-UTF-8 pages used to lose their non-ASCII text silently.
            return body.decode(declared)
        except (LookupError, UnicodeDecodeError):
            pass  # unknown or wrong declared charset: fall through
    return body.decode("utf-8", "ignore")
class OpenAICompatibleClient:
    """Minimal client for an OpenAI-style ``/chat/completions`` endpoint."""

    def __init__(self, *, api_key: str, base_url: str, model: str, timeout_seconds: int = 600):
        """Store connection settings; trailing slashes are trimmed from *base_url*."""
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.timeout_seconds = timeout_seconds

    def chat(self, prompt: str) -> str:
        """Send *prompt* as a single user message and return the reply text.

        The reply is the stripped ``content`` of the first choice. Network
        errors and unexpected response shapes propagate as exceptions.
        """
        body = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.2,
            "max_tokens": 8000,
        }
        request = urllib.request.Request(
            f"{self.base_url}/chat/completions",
            data=json.dumps(body, ensure_ascii=False).encode("utf-8"),
            headers={"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=self.timeout_seconds) as response:
            raw = response.read()
        reply = json.loads(raw.decode("utf-8"))
        return reply["choices"][0]["message"]["content"].strip()
class BlogApiClient:
    """Client for the blog's service API, authenticated with a bearer token."""

    def __init__(self, *, base_url: str, token: str, timeout_seconds: int = 25):
        """Store connection settings; trailing slashes are trimmed from *base_url*."""
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.timeout_seconds = timeout_seconds

    def _request(self, method: str, path: str, payload: dict[str, Any] | None = None) -> dict[str, Any]:
        """Issue *method* on *path* and return the decoded JSON response.

        When *payload* is given it is sent as a UTF-8 JSON body with a JSON
        content type; otherwise the request has no body.
        """
        headers = {"Authorization": f"Bearer {self.token}", "User-Agent": UA}
        body = None
        if payload is not None:
            body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
            headers["Content-Type"] = "application/json"
        request = urllib.request.Request(self.base_url + path, data=body, headers=headers, method=method)
        with urllib.request.urlopen(request, timeout=self.timeout_seconds) as response:
            raw = response.read()
        return json.loads(raw.decode("utf-8"))

    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST *payload* to the posts endpoint and return the API response."""
        return self._request("POST", "/api/service/posts", payload)

    def publish_post(self, slug: str) -> None:
        """POST to the publish endpoint of the post identified by *slug*."""
        self._request("POST", f"/api/service/posts/{slug}/publish")

View File

@@ -0,0 +1,95 @@
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from time import perf_counter
from typing import Callable, Iterable, Any
from .models import SourceConfig, SourceResult
# Signature of a source fetcher: called as fetcher(config, run_date) and
# expected to return the list of raw item dicts for that source.
Fetcher = Callable[[SourceConfig, str], list[dict[str, Any]]]
def _status_from_exception(exc: Exception) -> str:
    """Map a fetch failure to a status string: "timeout" or "error".

    An exception counts as a timeout when it is a ``TimeoutError`` itself or
    when it carries one in a ``reason`` attribute. The latter is how urllib
    reports timeouts (``URLError(reason=TimeoutError(...))``), which were
    previously reported as a generic "error".
    """
    reason = getattr(exc, "reason", None)
    if isinstance(exc, TimeoutError) or isinstance(reason, TimeoutError):
        return "timeout"
    return "error"
def _collect_one(config: SourceConfig, run_date: str, fetcher: Fetcher) -> SourceResult:
    """Fetch a single source and wrap the outcome in a ``SourceResult``.

    Never raises for ordinary exceptions: a disabled source yields status
    "disabled" without calling *fetcher*; a successful fetch yields "ok" or
    "empty" (no items); any ``Exception`` from the fetch is converted to a
    failed result carrying the error text and elapsed time.
    """
    identity = {
        "source": config.name,
        "role": config.role,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
    }
    if not config.enabled:
        return SourceResult(ok=False, status="disabled", **identity)

    t0 = perf_counter()

    def elapsed_ms() -> int:
        return int((perf_counter() - t0) * 1000)

    try:
        fetched = fetcher(config, run_date)
        duration = elapsed_ms()
        outcome = "ok" if fetched else "empty"
        return SourceResult(
            ok=outcome == "ok",
            status=outcome,
            items=fetched,
            elapsed_ms=duration,
            **identity,
        )
    except Exception as exc:
        duration = elapsed_ms()
        return SourceResult(
            ok=False,
            status=_status_from_exception(exc),
            error=f"{type(exc).__name__}: {exc}",
            elapsed_ms=duration,
            **identity,
        )
def collect_sources(
    configs: Iterable[SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    max_workers: int | None = None,
) -> tuple[list[SourceResult], dict[str, Any]]:
    """Fetch all sources concurrently and summarise the outcome.

    Args:
        configs: Source configurations; results are returned in this order.
        run_date: Passed through to *fetcher* for every source.
        fetcher: Callable performing the actual fetch for one source.
        max_workers: Thread pool size; defaults to ``min(8, number of sources)``.

    Returns:
        ``(results, report)``. ``results`` has exactly one entry per config,
        positionally aligned with *configs*. The report has the same keys
        whether or not any sources were given.
    """
    ordered_configs = list(configs)
    results: list[SourceResult] = []
    if ordered_configs:
        workers = max_workers or min(8, len(ordered_configs))
        with ThreadPoolExecutor(max_workers=workers) as executor:
            # Futures are kept positionally instead of in a dict keyed by
            # config.name: with a name-keyed dict, two configs sharing a name
            # overwrote each other and one result was reported twice.
            futures = [
                executor.submit(_collect_one, config, run_date, fetcher)
                for config in ordered_configs
            ]
            results = [future.result() for future in futures]
    report = {
        "input_source_count": len(results),
        "ok_source_count": sum(1 for result in results if result.ok),
        "failed_source_count": sum(1 for result in results if not result.ok),
        "raw_item_count": sum(len(result.items) for result in results),
        "source_counts": {result.source: len(result.items) for result in results},
        "statuses": {result.source: result.status for result in results},
    }
    return results, report

19
ai_daily_report/config.py Normal file
View File

@@ -0,0 +1,19 @@
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from .models import SourceConfig
from .pipeline import _source_config_from_dict
def load_json(path: Path) -> Any:
    """Parse the UTF-8 JSON file at *path* and return the decoded value."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def load_source_configs(path: Path) -> list[SourceConfig]:
    """Load the sources file at *path* into ``SourceConfig`` objects.

    Raises:
        ValueError: if the file's top-level JSON value is not a list.
    """
    entries = load_json(path)
    if isinstance(entries, list):
        return [_source_config_from_dict(entry) for entry in entries]
    raise ValueError("sources config must be a list")

100
ai_daily_report/dedupe.py Normal file
View File

@@ -0,0 +1,100 @@
from __future__ import annotations
import difflib
from typing import Any
from .models import NewsItem
def _item_score(item: NewsItem) -> int:
    """Score *item* for choosing which member of a duplicate group to keep.

    Higher is better: ``200 - source_priority`` (floored at 0), +20 for a
    canonical URL, up to +40 for raw summary length, +10 for a section hint,
    +10 for a "primary" source, and -10 per quality flag.
    """
    parts = (
        max(0, 200 - item.source_priority),
        20 if item.canonical_url else 0,
        min(40, len(item.summary_raw)) if item.summary_raw else 0,
        10 if item.section_hint else 0,
        10 if item.source_role == "primary" else 0,
        -10 * len(item.quality_flags),
    )
    return sum(parts)
def _merge_group(group: list[NewsItem], reason: str) -> tuple[NewsItem, list[NewsItem], dict[str, Any]]:
    """Collapse a group of duplicates into its best-scoring member.

    The kept item (highest ``_item_score``) is mutated: one entry per removed
    item is appended to its ``duplicate_sources``.

    Returns ``(kept_item, removed_items, report_entry)``.
    """
    keep = max(group, key=_item_score)
    removed = [candidate for candidate in group if candidate is not keep]
    keep.duplicate_sources.extend(
        {
            "id": dropped.id,
            "source_group": dropped.source_group,
            "source_label": dropped.source_label,
            "url": dropped.url,
            "reason": reason,
        }
        for dropped in removed
    )
    summary = {
        "reason": reason,
        "keep_id": keep.id,
        "removed_ids": [dropped.id for dropped in removed],
        "confidence": "high",
    }
    return keep, removed, summary
def _group_by_key(items: list[NewsItem], key_name: str) -> dict[str, list[NewsItem]]:
    """Bucket *items* by the attribute *key_name*, keeping only real groups.

    Items whose attribute value is falsy are ignored, and buckets with a
    single member are dropped, so every returned list has at least two items.
    """
    buckets: dict[str, list[NewsItem]] = {}
    for item in items:
        value = getattr(item, key_name)
        if not value:
            continue
        if value in buckets:
            buckets[value].append(item)
        else:
            buckets[value] = [item]
    return {value: members for value, members in buckets.items() if len(members) >= 2}
def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
    """List item pairs whose normalised titles look alike.

    Every unordered pair is compared once with ``difflib.SequenceMatcher``;
    pairs with a ratio of at least 0.65 are reported with medium confidence.
    Items with an empty ``title_norm`` never take part in a pair.
    """
    found: list[dict[str, Any]] = []
    for next_index, left in enumerate(items, start=1):
        if not left.title_norm:
            continue
        for right in items[next_index:]:
            if not right.title_norm:
                continue
            similarity = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
            if similarity < 0.65:
                continue
            found.append(
                {
                    "item_ids": [left.id, right.id],
                    "reason": "title_similarity",
                    "similarity": round(similarity, 3),
                    "confidence": "medium",
                }
            )
    return found
def hard_dedup_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, Any]]:
    """Remove exact duplicates, first by canonical URL and then by title.

    Two passes run in order; the second only sees items that survived the
    first. Within each duplicate group the best item is kept (see
    ``_merge_group``) and the rest are dropped. Removed items are tracked by
    object identity, so the input list itself is not modified.

    Returns ``(deduped_items, report)``; the report also lists fuzzy
    title matches among the survivors as ``possible_duplicates``.
    """
    remaining = list(items)
    dropped_ids: set[int] = set()
    groups_report: list[dict[str, Any]] = []
    passes = (
        ("canonical_url", "same_canonical_url"),
        ("title_norm", "same_title_norm"),
    )
    for key_name, reason in passes:
        survivors = [item for item in remaining if id(item) not in dropped_ids]
        for group in _group_by_key(survivors, key_name).values():
            active = [item for item in group if id(item) not in dropped_ids]
            if len(active) >= 2:
                _, removed, group_report = _merge_group(active, reason)
                dropped_ids.update(map(id, removed))
                groups_report.append(group_report)

    deduped = [item for item in remaining if id(item) not in dropped_ids]
    report = {
        "input_count": len(items),
        "output_count": len(deduped),
        "removed_count": len(dropped_ids),
        "groups": groups_report,
        "possible_duplicates": _possible_duplicates(deduped),
    }
    return deduped, report

143
ai_daily_report/env.py Normal file
View File

@@ -0,0 +1,143 @@
from __future__ import annotations
import os
import json
from pathlib import Path
# Parent of the directory that contains this file; load_env reads ".env" from here.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
def read_env_file(env_path: Path) -> dict[str, str]:
    """Parse a ``KEY=VALUE`` file into a dict; a missing file yields ``{}``.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
    Keys and values are whitespace-trimmed, and surrounding double then single
    quotes are stripped from values. Undecodable bytes are ignored.
    """
    if not env_path.exists():
        return {}
    parsed: dict[str, str] = {}
    for raw in env_path.read_text(encoding="utf-8", errors="ignore").splitlines():
        entry = raw.strip()
        if entry == "" or entry[0] == "#" or "=" not in entry:
            continue
        name, _, raw_value = entry.partition("=")
        parsed[name.strip()] = raw_value.strip().strip('"').strip("'")
    return parsed
def load_env() -> dict[str, str]:
    """Build the effective environment from files and the process environment.

    Layers are applied in increasing precedence: the project ``.env``, then
    ``~/.hermes/.env``, then the non-empty variables of ``os.environ``.
    """
    layers = (
        read_env_file(PROJECT_ROOT / ".env"),
        read_env_file(Path.home() / ".hermes" / ".env"),
        {name: value for name, value in os.environ.items() if value},
    )
    merged: dict[str, str] = {}
    for layer in layers:
        merged.update(layer)
    return merged
def first_env(env: dict[str, str], *names: str) -> str:
    """Return the first non-blank value among *names* in *env*, trimmed.

    Missing, ``None`` and whitespace-only values are skipped; ``""`` is
    returned when no name yields a value.
    """
    candidates = ((env.get(name) or "").strip() for name in names)
    return next((value for value in candidates if value), "")
def _load_simple_yaml(path: Path) -> dict[str, object]:
    """Parse a small subset of YAML: nested ``key: value`` mappings only.

    Nesting is derived from leading-space indentation. A key with an empty
    value opens a child mapping; any other value is stored as a string with
    surrounding quotes removed. Blank lines, ``#`` comment lines and lines
    without ``:`` are skipped. A missing file yields ``{}``.
    """
    root: dict[str, object] = {}
    if not path.exists():
        return root
    # Stack of (indent, mapping) for the mappings currently open; the root
    # uses indent -1 so it is never popped.
    open_maps: list[tuple[int, dict[str, object]]] = [(-1, root)]
    for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#") or ":" not in raw_line:
            continue
        indent = len(raw_line) - len(raw_line.lstrip(" "))
        name, _, rest = stripped.partition(":")
        name = name.strip()
        scalar = rest.strip().strip('"').strip("'")
        # Close every mapping that is not a parent of this line.
        while open_maps and indent <= open_maps[-1][0]:
            open_maps.pop()
        parent = open_maps[-1][1]
        if scalar:
            parent[name] = scalar
            continue
        nested: dict[str, object] = {}
        parent[name] = nested
        open_maps.append((indent, nested))
    return root
def _env_with_hermes(env: dict[str, str], hermes_dir: Path) -> dict[str, str]:
    """Return a new dict: ``<hermes_dir>/.env`` values overridden by *env*."""
    return {**read_env_file(hermes_dir / ".env"), **env}
def _provider_env_names(provider: str) -> tuple[str, str, str]:
    """Return the ``(API_KEY, BASE_URL, MODEL)`` variable names for *provider*.

    The provider name is upper-cased and dashes become underscores to form
    the common prefix.
    """
    prefix = provider.replace("-", "_").upper()
    api_key_name, base_url_name, model_name = (
        f"{prefix}_{suffix}" for suffix in ("API_KEY", "BASE_URL", "MODEL")
    )
    return api_key_name, base_url_name, model_name
def _auth_json_key(env: dict[str, str], hermes_dir: Path, provider: str) -> str:
    """Look up an API key for *provider* in ``<hermes_dir>/auth.json``.

    Only the first credential of the provider's ``credential_pool`` entry is
    considered (the provider name is tried as-is and with dashes replaced by
    underscores). A credential whose ``source`` is ``env:NAME`` resolves to
    that variable in *env*; otherwise its ``access_token`` is used. Returns
    ``""`` when the file is missing or unreadable, or nothing usable is found.
    """
    auth_path = hermes_dir / "auth.json"
    if not provider or not auth_path.exists():
        return ""
    try:
        auth = json.loads(auth_path.read_text(encoding="utf-8"))
    except Exception:
        # Best effort: an unreadable auth file just means "no key here".
        return ""
    pool = auth.get("credential_pool", {}) or {}
    for pool_key in (provider, provider.replace("-", "_")):
        credentials = pool.get(pool_key, []) or []
        if not credentials:
            continue
        first = credentials[0]
        source = str(first.get("source") or "")
        if source.startswith("env:"):
            from_env = first_env(env, source[4:])
            if from_env:
                return from_env
        token = str(first.get("access_token") or "").strip()
        if token:
            return token
    return ""
def resolve_llm_config(env: dict[str, str], *, hermes_dir: Path | None = None) -> dict[str, str]:
    """Resolve the LLM ``api_key``, ``base_url`` and ``model``.

    Each of the three settings is resolved independently, first match wins:
      1. the generic ``LLM_API_KEY`` / ``LLM_BASE_URL`` / ``LLM_MODEL``,
      2. if ``config.yaml`` in *hermes_dir* names a provider: the
         ``<PROVIDER>_*`` variable, then ``auth.json`` (key) or the
         ``model`` section of ``config.yaml`` (base URL, default model),
      3. the well-known SUB2API / XIAOMI / OPENROUTER variables.

    Args:
        env: Base environment; values from ``<hermes_dir>/.env`` fill gaps.
        hermes_dir: Hermes config directory, defaults to ``~/.hermes``.

    Raises:
        ValueError: ``missing_llm_config: ...`` listing unresolved settings.
    """
    hermes_dir = hermes_dir or Path.home() / ".hermes"
    env = _env_with_hermes(env, hermes_dir)
    hermes_config = _load_simple_yaml(hermes_dir / "config.yaml")
    model_section = hermes_config.get("model")
    model_config = model_section if isinstance(model_section, dict) else {}
    provider = str(model_config.get("provider") or "").strip()

    api_key = first_env(env, "LLM_API_KEY")
    base_url = first_env(env, "LLM_BASE_URL")
    model = first_env(env, "LLM_MODEL")

    if provider:
        provider_key, provider_base_url, provider_model = _provider_env_names(provider)
        if not api_key:
            api_key = first_env(env, provider_key) or _auth_json_key(env, hermes_dir, provider)
        if not base_url:
            base_url = first_env(env, provider_base_url) or str(model_config.get("base_url") or "").strip()
        if not model:
            model = first_env(env, provider_model) or str(model_config.get("default") or "").strip()

    if not api_key:
        api_key = first_env(env, "SUB2API_API_KEY", "XIAOMI_API_KEY", "OPENROUTER_API_KEY")
    if not base_url:
        base_url = first_env(env, "SUB2API_BASE_URL", "XIAOMI_BASE_URL", "OPENROUTER_BASE_URL")
    if not model:
        # OPENROUTER_MODEL added so the model fallback covers the same three
        # providers as the key and base-URL fallbacks above.
        model = first_env(env, "SUB2API_MODEL", "XIAOMI_MODEL", "OPENROUTER_MODEL")

    resolved = (
        ("LLM_API_KEY", api_key),
        ("LLM_BASE_URL", base_url),
        ("LLM_MODEL", model),
    )
    missing = [name for name, value in resolved if not value]
    if missing:
        raise ValueError("missing_llm_config: " + ",".join(missing))
    return {"api_key": api_key, "base_url": base_url, "model": model}
def resolve_blog_token(env: dict[str, str]) -> str:
    """Return the blog service token from *env*, or ``""`` when unset.

    ``BLOG_SERVICE_TOKEN`` takes precedence over ``EPHRON_SERVICE_TOKEN``.
    """
    token_names = ("BLOG_SERVICE_TOKEN", "EPHRON_SERVICE_TOKEN")
    return first_env(env, *token_names)

113
ai_daily_report/guide.py Normal file
View File

@@ -0,0 +1,113 @@
from __future__ import annotations
import json
import re
from typing import Any, Callable
from .llm import parse_json_object
from .models import NewsItem
# LLM callable used by generate_guide: receives the prompt string and returns
# the raw model output, which is then parsed with parse_json_object.
GuideLlmCall = Callable[[str], str]
def _clean_text(text: str, limit: int | None = None) -> str:
    """Normalise LLM-produced text to a single clean line.

    Drops a leading blockquote marker and ``[12]`` / ``[N]`` citation markers,
    collapses whitespace, and, when *limit* is given, truncates to at most
    *limit* characters (trailing whitespace removed after the cut).
    """
    unquoted = re.sub(r"^\s*>\s*", "", text or "").strip()
    without_refs = re.sub(r"\[\d+\]|\[N\]", "", unquoted)
    collapsed = re.sub(r"\s+", " ", without_refs).strip()
    if not limit or len(collapsed) <= limit:
        return collapsed
    return collapsed[:limit].rstrip()
def _build_prompt(items: list[NewsItem]) -> str:
    """Serialise the guide-generation request for the LLM as a JSON string.

    The payload has three keys: ``task`` (instructions), ``items`` (one
    compact record per news item, preferring rewritten title/summary over
    the raw ones) and ``output_schema`` (the JSON shape the model must return).
    Non-ASCII text is emitted verbatim, not escaped.
    """
    task = (
        "Generate a concise AI daily report guide. Return JSON only. Do not use 强信号/中信号/待验证. "
        "Use a short theme and 2-4 daily threads. Every thread must reference existing item_ids."
    )
    records = []
    for item in items:
        records.append(
            {
                "id": item.id,
                "title": item.title or item.title_raw,
                "summary": item.summary or item.summary_raw,
                "section": item.section,
                "source": item.source_label,
            }
        )
    thread_schema = {
        "title": "thread title",
        "text": "one or two sentences",
        "item_ids": ["existing item id"],
        "kind": "thread|uncertain",
    }
    output_schema = {
        "theme": "one sentence under 120 Chinese characters",
        "threads": [thread_schema],
    }
    return json.dumps(
        {"task": task, "items": records, "output_schema": output_schema},
        ensure_ascii=False,
    )
def _guide_report(
    input_count: int,
    *,
    theme_present: bool = False,
    thread_count: int = 0,
    dropped_thread_count: int = 0,
    fallback_used: bool = False,
    errors: list[str] | None = None,
) -> dict[str, Any]:
    """Build the guide-stage report in the one shape every return path uses."""
    return {
        "input_count": input_count,
        "theme_present": theme_present,
        "thread_count": thread_count,
        "dropped_thread_count": dropped_thread_count,
        "fallback_used": fallback_used,
        "errors": list(errors or []),
    }


def generate_guide(
    items: list[NewsItem],
    *,
    llm_call: GuideLlmCall,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Ask the LLM for a report theme and daily threads, then sanitise them.

    Args:
        items: Classified news items the guide should summarise.
        llm_call: Callable taking the prompt and returning raw model output.

    Returns:
        ``(guide, report)``. ``guide`` is ``{"theme": str, "threads": [...]}``
        where each thread has a cleaned ``title``/``text``, the subset of its
        ``item_ids`` that actually exist, and a ``kind`` of "thread" or
        "uncertain". With no items the LLM is not called. If the call or the
        JSON parsing fails, an empty guide is returned with
        ``fallback_used=True`` and the error recorded in the report.

    Malformed threads (not an object, no valid item ids, empty title or text)
    are dropped and counted in ``dropped_thread_count`` instead of raising.
    """
    if not items:
        return {"theme": "", "threads": []}, _guide_report(0)

    try:
        obj = parse_json_object(llm_call(_build_prompt(items)))
    except Exception as exc:
        # Deliberate catch-all: the guide is optional, so any LLM/parse
        # failure degrades to an empty guide and is surfaced in the report.
        return {"theme": "", "threads": []}, _guide_report(
            len(items),
            fallback_used=True,
            errors=[f"{type(exc).__name__}: {exc}"],
        )

    valid_ids = {item.id for item in items}
    raw_threads = obj.get("threads")
    if not isinstance(raw_threads, list):
        raw_threads = []

    threads: list[dict[str, Any]] = []
    dropped = 0
    for thread in raw_threads:
        # Model output is untrusted: previously a non-dict thread or a
        # null/non-list "item_ids" raised here, outside the try block above.
        if not isinstance(thread, dict):
            dropped += 1
            continue
        raw_ids = thread.get("item_ids")
        if not isinstance(raw_ids, list):
            raw_ids = []
        item_ids = [item_id for item_id in raw_ids if isinstance(item_id, str) and item_id in valid_ids]
        if not item_ids:
            dropped += 1
            continue
        title = _clean_text(str(thread.get("title") or ""), limit=80)
        text = _clean_text(str(thread.get("text") or ""), limit=220)
        if not title or not text:
            dropped += 1
            continue
        kind = thread.get("kind") if thread.get("kind") in ("thread", "uncertain") else "thread"
        threads.append({"title": title, "text": text, "item_ids": item_ids, "kind": kind})

    theme = _clean_text(str(obj.get("theme") or ""), limit=120)
    guide = {"theme": theme, "threads": threads}
    report = _guide_report(
        len(items),
        theme_present=bool(theme),
        thread_count=len(threads),
        dropped_thread_count=dropped,
    )
    return guide, report

18
ai_daily_report/llm.py Normal file
View File

@@ -0,0 +1,18 @@
from __future__ import annotations
import json
import re
from typing import Any, Callable
# Type alias for an LLM callable: takes one string and returns a string.
LlmCall = Callable[[str], str]
def parse_json_object(text: str) -> dict[str, Any]:
    """Extract the first JSON object embedded in LLM output.

    A leading/trailing markdown code fence is removed first. The text is then
    scanned for ``{`` and decoding is attempted from each one, so prose before
    *or after* the object is tolerated (the previous regex required the object
    to be the very last thing in the text).

    Raises:
        ValueError: if no position in the text starts a valid JSON object.
    """
    cleaned = re.sub(r"^```(?:json)?\s*\n?", "", text.strip())
    cleaned = re.sub(r"\n?```\s*$", "", cleaned)
    decoder = json.JSONDecoder()
    last_error: Exception | None = None
    start = cleaned.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value at `start` and ignores whatever follows.
            candidate, _end = decoder.raw_decode(cleaned, start)
        except json.JSONDecodeError as exc:
            last_error = exc
        else:
            if isinstance(candidate, dict):
                return candidate
        start = cleaned.find("{", start + 1)
    raise ValueError("LLM output does not contain a JSON object") from last_error

53
ai_daily_report/models.py Normal file
View File

@@ -0,0 +1,53 @@
from dataclasses import dataclass, field
from typing import Any
@dataclass(frozen=True)
class SourceConfig:
    """Immutable description of one news source to collect from."""

    # Identity
    name: str
    type: str

    # Ranking / importance
    role: str = "supplement"
    priority: int = 100
    required: bool = False

    # Collection behaviour
    enabled: bool = True
    timeout_seconds: int = 25
    retries: int = 0
    min_items: int = 0
    url: str = ""
@dataclass
class SourceResult:
    """Outcome of collecting one source: status, raw items and timing."""

    # Which source this result belongs to
    source: str
    role: str

    # Outcome
    ok: bool
    status: str
    items: list[dict[str, Any]] = field(default_factory=list)
    error: str | None = None

    # Bookkeeping
    elapsed_ms: int = 0
    retry_count: int = 0
    fetched_at: str = ""
@dataclass
class NewsItem:
    """One normalised news entry as it moves through the pipeline stages."""

    # Identity and provenance
    id: str
    source_group: str
    source_label: str
    source_role: str
    source_priority: int

    # Raw content as collected (plus the normalised title used for dedup)
    title_raw: str
    title_norm: str
    summary_raw: str
    url: str
    canonical_url: str

    # Optional metadata supplied by the source
    published_at: str | None = None
    collected_at: str = ""
    origin_type: str = ""
    section_hint: str = ""
    language_hint: str = ""

    # Fields with no value at construction; later stages may set them
    title: str | None = None
    summary: str | None = None
    section: str | None = None

    # Per-item lists; each instance gets its own
    quality_flags: list[str] = field(default_factory=list)
    duplicate_sources: list[dict[str, Any]] = field(default_factory=list)

View File

@@ -0,0 +1,132 @@
from __future__ import annotations
import hashlib
import html
import re
import unicodedata
from collections import Counter
from datetime import datetime, timezone
from typing import Any
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
from .models import NewsItem, SourceResult
# Query parameters removed by canonicalize_url. Matching is done on the
# lowercased parameter name: any name starting with one of the prefixes, or
# equal to one of the keys, is dropped.
TRACKING_QUERY_PREFIXES = ("utm_",)
TRACKING_QUERY_KEYS = {"fbclid", "gclid", "spm", "from", "ref"}
def clean_text(value: str) -> str:
    """Turn an HTML-ish snippet into plain single-line text.

    HTML entities are unescaped, tags are replaced by spaces, and all
    whitespace runs are collapsed. ``None`` is treated as empty.
    """
    unescaped = html.unescape(value or "")
    without_tags = re.sub(r"<[^>]+>", " ", unescaped)
    return re.sub(r"\s+", " ", without_tags).strip()
def canonicalize_url(url: str) -> str:
    """Reduce *url* to a canonical form for duplicate detection.

    Lower-cases scheme and host (defaulting the scheme to ``https``), strips
    a leading ``www.``, maps ``twitter.com`` to ``x.com``, removes tracking
    query parameters and the fragment, and trims trailing slashes from any
    path longer than ``/``. Empty input yields ``""``.
    """
    if not url:
        return ""
    parts = urlparse(url.strip())

    host = (parts.netloc or "").lower()
    if host.startswith("www."):
        host = host[4:]
    host = "x.com" if host == "twitter.com" else host

    def is_tracking(name: str) -> bool:
        lowered = name.lower()
        return lowered in TRACKING_QUERY_KEYS or any(
            lowered.startswith(prefix) for prefix in TRACKING_QUERY_PREFIXES
        )

    kept = [
        (name, value)
        for name, value in parse_qsl(parts.query, keep_blank_values=True)
        if not is_tracking(name)
    ]

    path = parts.path or ""
    if len(path) > 1:
        path = path.rstrip("/")
    scheme = (parts.scheme or "https").lower()
    return urlunparse((scheme, host, path, "", urlencode(kept), ""))
def normalize_title(title: str) -> str:
    """Return a comparison key for *title*.

    Applies NFKC normalisation and lower-casing, then removes every run of
    characters that are neither word characters nor CJK ideographs (so
    spaces and punctuation disappear). ``None`` is treated as empty.
    """
    folded = unicodedata.normalize("NFKC", title or "").lower()
    return re.sub(r"[^\w\u4e00-\u9fff]+", "", folded)
def _item_id(canonical_url: str, source_group: str, title_norm: str, published_at: str | None) -> str:
    """Derive a deterministic item id of the form ``item_<16 hex chars>``.

    The id hashes the canonical URL when there is one; otherwise it hashes
    ``source_group|title_norm|published_at`` (missing date as empty string).
    """
    if canonical_url:
        seed = canonical_url
    else:
        seed = "|".join((source_group, title_norm, published_at or ""))
    return "item_" + hashlib.sha1(seed.encode("utf-8")).hexdigest()[:16]
def _quality_flags(title: str, summary: str, url: str) -> list[str]:
    """Return the quality problems of an item, in a fixed order.

    Possible flags: ``missing_url``, ``missing_summary`` and ``short_title``
    (normalised title shorter than three characters).
    """
    checks = (
        ("missing_url", not url),
        ("missing_summary", not summary),
        ("short_title", len(normalize_title(title)) < 3),
    )
    return [flag for flag, failed in checks if failed]
def normalize_items(
    source_results: list[SourceResult],
    *,
    run_date: str,
    source_priorities: dict[str, int] | None = None,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Convert raw fetched dicts into ``NewsItem`` objects.

    For every raw item the title and summary are cleaned, the URL is
    canonicalised, quality flags are computed and a deterministic id is
    assigned; a repeated id gets a ``_<n>`` suffix from its second
    occurrence on. Sources missing from *source_priorities* get priority 100.

    Returns ``(items, report)`` where the report holds input/output counts
    and how often each quality flag occurred.
    """
    priorities = source_priorities or {}
    collected_at = datetime.now(timezone.utc).isoformat()
    normalized: list[NewsItem] = []
    flag_totals: Counter[str] = Counter()
    seen_ids: Counter[str] = Counter()
    raw_total = 0

    for result in source_results:
        priority = priorities.get(result.source, 100)
        for raw in result.items:
            raw_total += 1
            title = clean_text(str(raw.get("title_raw") or raw.get("title") or ""))
            summary = clean_text(str(raw.get("summary_raw") or raw.get("summary") or ""))
            url = str(raw.get("url") or "").strip()
            canonical_url = canonicalize_url(url)
            title_norm = normalize_title(title)

            flags = _quality_flags(title, summary, canonical_url)
            flag_totals.update(flags)

            source_label = clean_text(str(raw.get("source_label") or result.source))
            published_at = raw.get("published_at")

            base_id = _item_id(canonical_url, result.source, title_norm, published_at)
            seen_ids[base_id] += 1
            occurrence = seen_ids[base_id]
            item_id = base_id if occurrence == 1 else f"{base_id}_{occurrence}"

            normalized.append(
                NewsItem(
                    id=item_id,
                    source_group=result.source,
                    source_label=source_label,
                    source_role=result.role,
                    source_priority=priority,
                    title_raw=title,
                    title_norm=title_norm,
                    summary_raw=summary,
                    url=url,
                    canonical_url=canonical_url,
                    published_at=published_at,
                    collected_at=collected_at,
                    origin_type=str(raw.get("origin_type") or ""),
                    section_hint=str(raw.get("section_hint") or ""),
                    language_hint=str(raw.get("language_hint") or ""),
                    quality_flags=flags,
                )
            )

    report = {
        "run_date": run_date,
        "input_count": raw_total,
        "output_count": len(normalized),
        "quality_flag_counts": dict(flag_totals),
    }
    return normalized, report

219
ai_daily_report/pipeline.py Normal file
View File

@@ -0,0 +1,219 @@
from __future__ import annotations
from typing import Any
from .assemble import assemble_markdown
from .classify import classify_and_order_items
from .collect import Fetcher, collect_sources
from .dedupe import hard_dedup_items
from .guide import GuideLlmCall, generate_guide
from .models import SourceConfig
from .normalize import normalize_items
from .publish import BlogClient, publish_markdown
from .rewrite import RewriteLlmCall, rewrite_items
from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
    """Build a ``SourceConfig`` from a plain dict.

    ``name`` and ``type`` are required (``KeyError`` if absent); every other
    field falls back to its default and numeric/boolean fields are coerced
    with ``int()`` / ``bool()``.
    """
    get = value.get
    return SourceConfig(
        name=value["name"],
        type=value["type"],
        role=get("role", "supplement"),
        priority=int(get("priority", 100)),
        required=bool(get("required", False)),
        enabled=bool(get("enabled", True)),
        timeout_seconds=int(get("timeout_seconds", 25)),
        retries=int(get("retries", 0)),
        min_items=int(get("min_items", 0)),
        url=get("url", ""),
    )
def run_stage0_to_stage2(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
) -> dict[str, Any]:
    """Run collection (stage 0), normalisation (stage 1) and hard dedup (stage 2).

    *source_configs* may mix ``SourceConfig`` objects and plain dicts.

    Returns a dict with ``source_results``, the deduplicated ``items`` and the
    per-stage ``reports`` (keys ``stage0`` .. ``stage2``).
    """
    configs = []
    for entry in source_configs:
        configs.append(entry if isinstance(entry, SourceConfig) else _source_config_from_dict(entry))

    source_results, collect_report = collect_sources(configs, run_date, fetcher=fetcher)
    priorities = {cfg.name: cfg.priority for cfg in configs}
    normalized, normalize_report = normalize_items(
        source_results,
        run_date=run_date,
        source_priorities=priorities,
    )
    deduped, dedupe_report = hard_dedup_items(normalized)

    reports = {
        "stage0": collect_report,
        "stage1": normalize_report,
        "stage2": dedupe_report,
    }
    return {"source_results": source_results, "items": deduped, "reports": reports}
def run_stage0_to_stage4(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
) -> dict[str, Any]:
    """Run stages 0-2, then semantic dedup (stage 3) and rewriting (stage 4).

    The ``possible_duplicates`` found by stage 2 are the candidates handed to
    the semantic dedup step.

    Returns a dict with ``source_results``, the rewritten ``items`` and
    ``reports`` extended with ``stage3`` and ``stage4``.
    """
    previous = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
    candidates = previous["reports"]["stage2"].get("possible_duplicates", [])

    merged_items, semantic_report = semantic_dedup_items(
        previous["items"],
        candidates,
        llm_call=semantic_llm_call,
    )
    rewritten, rewrite_report = rewrite_items(merged_items, llm_call=rewrite_llm_call)

    reports = {**previous["reports"], "stage3": semantic_report, "stage4": rewrite_report}
    return {
        "source_results": previous["source_results"],
        "items": rewritten,
        "reports": reports,
    }
def run_stage0_to_stage5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
) -> dict[str, Any]:
    """Run stages 0-4, then classify and order the items (stage 5).

    Returns a dict with ``source_results``, the ordered ``items`` and
    ``reports`` extended with ``stage5``.
    """
    previous = run_stage0_to_stage4(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
    )
    ordered, classify_report = classify_and_order_items(previous["items"])
    return {
        "source_results": previous["source_results"],
        "items": ordered,
        "reports": {**previous["reports"], "stage5": classify_report},
    }
def run_stage0_to_stage6(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
) -> dict[str, Any]:
    """Run stages 0-5, then generate the report guide (stage 6).

    Returns a dict with ``source_results``, ``items``, the generated ``guide``
    and ``reports`` extended with ``stage6``.
    """
    previous = run_stage0_to_stage5(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
    )
    guide, guide_report = generate_guide(previous["items"], llm_call=guide_llm_call)
    return {
        "source_results": previous["source_results"],
        "items": previous["items"],
        "guide": guide,
        "reports": {**previous["reports"], "stage6": guide_report},
    }
def run_stage0_to_stage7(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
) -> dict[str, Any]:
    """Run stages 0-6, then assemble the markdown report (stage 7).

    Returns a dict with ``source_results``, ``items``, ``guide``, the rendered
    ``markdown`` and ``reports`` extended with ``stage7``.
    """
    previous = run_stage0_to_stage6(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
    )
    markdown, markdown_report = assemble_markdown(previous["items"], previous["guide"])
    return {
        "source_results": previous["source_results"],
        "items": previous["items"],
        "guide": previous["guide"],
        "markdown": markdown,
        "reports": {**previous["reports"], "stage7": markdown_report},
    }
def run_stage0_to_stage8(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    mode: str,
    base_url: str,
    client: BlogClient | None,
) -> dict[str, Any]:
    """Run the full pipeline: stages 0-7, then publish the Markdown (stage 8).

    Args:
        mode: Publish mode forwarded to ``publish_markdown``.
        base_url: Blog base URL forwarded to ``publish_markdown``.
        client: Blog client forwarded to ``publish_markdown``; may be ``None``.

    Returns:
        The stage 0-7 result extended with ``publish`` (the publish result
        object) and a ``stage8`` report entry summarising that result.
    """
    upstream = run_stage0_to_stage7(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
    )
    markdown = upstream["markdown"]
    outcome = publish_markdown(
        title=f"AI日报 · {run_date}",
        markdown=markdown,
        tags=["AI日报", "AI资讯", "人工智能"],
        slug=f"ai-{run_date}",
        base_url=base_url,
        mode=mode,
        # The stage 7 report carries the blocking errors that gate publishing.
        markdown_report=upstream["reports"]["stage7"],
        client=client,
    )
    report_fields = ("mode", "status", "slug", "blog_url", "public_ok", "error")
    stage8_report = {name: getattr(outcome, name) for name in report_fields}
    return {
        "source_results": upstream["source_results"],
        "items": upstream["items"],
        "guide": upstream["guide"],
        "markdown": markdown,
        "publish": outcome,
        "reports": {**upstream["reports"], "stage8": stage8_report},
    }

View File

@@ -0,0 +1,90 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Protocol
@dataclass
class PublishResult:
    """Outcome of one publish attempt, as reported by the publish stage."""

    # Publish mode the attempt ran under (e.g. "dry-run" or the caller's mode).
    mode: str
    # Result status; this module emits "ok", "blocked" and "failed".
    status: str
    # Slug of the post the result refers to.
    slug: str
    # URL built as "<base_url>/posts/<slug>".
    blog_url: str
    # Whether the post is considered publicly reachable.
    public_ok: bool = False
    # Error description when the attempt did not succeed, otherwise None.
    error: str | None = None
class BlogClient(Protocol):
    """Structural interface for the blog backend used when publishing."""

    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Create a post from ``payload`` and return the backend's response."""

    def publish_post(self, slug: str) -> None:
        """Publish the post identified by ``slug``."""
def dry_run_publish(slug: str, base_url: str) -> PublishResult:
    """Return a successful ``dry-run`` result without contacting any backend."""
    root = base_url.rstrip("/")
    preview_url = f"{root}/posts/{slug}"
    return PublishResult(mode="dry-run", status="ok", slug=slug, blog_url=preview_url, public_ok=True)
def publish_markdown(
    *,
    title: str,
    markdown: str,
    tags: list[str],
    slug: str,
    base_url: str,
    mode: str,
    markdown_report: dict[str, Any],
    client: BlogClient | None,
) -> PublishResult:
    """Publish the assembled Markdown, or explain why it was not published.

    Args:
        title: Post title.
        markdown: Post body.
        tags: Tags attached to the post.
        slug: Requested slug; the backend may answer with a different one.
        base_url: Blog base URL used to build the post URL.
        mode: ``"dry-run"`` skips the backend; ``"publish"`` creates and then
            publishes; any other mode only creates the post.
        markdown_report: Validation report; a non-empty ``blocking_errors``
            entry blocks publishing.
        client: Blog backend; required for every mode except ``"dry-run"``.

    Returns:
        A ``PublishResult`` whose status is ``"blocked"``, ``"failed"`` or
        ``"ok"``. Backend exceptions are captured in ``error``, never raised.
    """
    site_root = base_url.rstrip("/")

    def _post_url(post_slug: str) -> str:
        return f"{site_root}/posts/{post_slug}"

    blocking_errors = markdown_report.get("blocking_errors", []) or []
    if blocking_errors:
        return PublishResult(
            mode=mode,
            status="blocked",
            slug=slug,
            blog_url=_post_url(slug),
            public_ok=False,
            # str() keeps the join safe if a report carries non-string errors.
            error=";".join(str(err) for err in blocking_errors),
        )
    if mode == "dry-run":
        return dry_run_publish(slug, base_url)
    if client is None:
        return PublishResult(
            mode=mode,
            status="failed",
            slug=slug,
            blog_url=_post_url(slug),
            public_ok=False,
            error="missing_blog_client",
        )
    payload = {"title": title, "content": markdown, "tags": tags, "slug": slug}
    # Tracks the slug that actually exists on the backend, so a failure in
    # publish_post after a successful create_post still points at the created
    # post instead of the requested slug.
    effective_slug = slug
    try:
        create_resp = client.create_post(payload)
        effective_slug = create_resp.get("slug") or slug
        if mode == "publish":
            client.publish_post(effective_slug)
    except Exception as exc:
        return PublishResult(
            mode=mode,
            status="failed",
            slug=effective_slug,
            blog_url=_post_url(effective_slug),
            public_ok=False,
            error=f"{type(exc).__name__}: {exc}",
        )
    return PublishResult(
        mode=mode,
        status="ok",
        slug=effective_slug,
        blog_url=_post_url(effective_slug),
        public_ok=mode == "publish",
    )

103
ai_daily_report/rewrite.py Normal file
View File

@@ -0,0 +1,103 @@
from __future__ import annotations
import json
from typing import Any, Callable
from .llm import parse_json_object
from .models import NewsItem
# Callable that receives the JSON prompt string and returns the raw LLM reply text.
RewriteLlmCall = Callable[[str], str]
def _chunks(items: list[NewsItem], size: int) -> list[list[NewsItem]]:
    """Split ``items`` into consecutive slices of at most ``size`` elements."""
    starts = range(0, len(items), size)
    return [items[start : start + size] for start in starts]
def _build_prompt(batch: list[NewsItem]) -> str:
    """Serialise one rewrite batch into the JSON prompt sent to the LLM.

    The prompt carries the task text, the raw fields of every item in
    ``batch`` and the expected output schema. Non-ASCII text is kept as-is.
    """
    task = (
        "Rewrite AI news titles and summaries into concise Chinese. Preserve brand/model/API names "
        "such as GPT-5, Codex, Gemini, Claude, API, MCP. Do not add facts."
    )
    schema_entry = {
        "id": "item id",
        "title": "display title",
        "summary": "display summary",
        "flags": [],
    }
    item_payloads = []
    for news in batch:
        item_payloads.append(
            {
                "id": news.id,
                "title_raw": news.title_raw,
                "summary_raw": news.summary_raw,
                "source": news.source_label,
                "language_hint": news.language_hint,
            }
        )
    prompt = {
        "task": task,
        "items": item_payloads,
        "output_schema": {"rewrites": [schema_entry]},
    }
    return json.dumps(prompt, ensure_ascii=False)
def _fallback(item: NewsItem) -> None:
    """Copy the raw title/summary into the display fields when rewriting failed."""
    raw_summary = item.summary_raw
    item.title = item.title_raw
    # An empty raw summary is replaced by the fixed placeholder text.
    item.summary = raw_summary if raw_summary else "该条目暂无摘要。"
def _apply_rewrite_batch(batch: list[NewsItem], llm_call: RewriteLlmCall) -> int:
    """Ask the LLM to rewrite ``batch`` and apply the answers to the items.

    Returns:
        The number of items that received a new title and summary.

    Raises:
        ValueError: If ``rewrites`` is not a list, or if any item of the batch
            has no usable rewrite (non-empty title and summary).
    """
    response = parse_json_object(llm_call(_build_prompt(batch)))
    rewrites = response.get("rewrites", [])
    if not isinstance(rewrites, list):
        raise ValueError("rewrites is not a list")

    targets = {item.id: item for item in batch}
    applied: set[str] = set()
    for entry in rewrites:
        item_id = entry.get("id")
        new_title = str(entry.get("title") or "").strip()
        new_summary = str(entry.get("summary") or "").strip()
        # Entries for unknown ids or with an empty field are ignored.
        if item_id not in targets or not new_title or not new_summary:
            continue
        target = targets[item_id]
        target.title = new_title
        target.summary = new_summary
        applied.add(item_id)

    # Every item of the batch must have been rewritten; the first gap aborts.
    for item in batch:
        if item.id not in applied:
            raise ValueError(f"missing_rewrite_for_item: {item.id}")
    return len(applied)
def rewrite_items(
    items: list[NewsItem],
    *,
    llm_call: RewriteLlmCall,
    batch_size: int = 10,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Rewrite titles and summaries batch by batch, degrading gracefully.

    A failing batch is retried one item at a time; an item that still fails
    falls back to its raw title and summary. Items are modified in place.

    Returns:
        The same ``items`` list and a report with ``input_count``,
        ``rewritten_count``, ``fallback_count``, ``batch_count`` and ``errors``.
    """
    batches = _chunks(items, max(1, batch_size))
    rewritten = 0
    fallbacks = 0
    errors: list[str] = []
    for batch in batches:
        try:
            rewritten += _apply_rewrite_batch(batch, llm_call)
            continue
        except Exception as exc:
            errors.append(f"batch:{type(exc).__name__}: {exc}")
        # The batch failed as a whole: retry each item on its own.
        for item in batch:
            try:
                rewritten += _apply_rewrite_batch([item], llm_call)
            except Exception as item_exc:
                errors.append(f"item:{item.id}:{type(item_exc).__name__}: {item_exc}")
                _fallback(item)
                fallbacks += 1
    return items, {
        "input_count": len(items),
        "rewritten_count": rewritten,
        "fallback_count": fallbacks,
        "batch_count": len(batches),
        "errors": errors,
    }

156
ai_daily_report/runner.py Normal file
View File

@@ -0,0 +1,156 @@
from __future__ import annotations
import json
from dataclasses import asdict, is_dataclass
from pathlib import Path
from typing import Any
from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text
from .config import load_source_configs
from .env import load_env, resolve_blog_token, resolve_llm_config
from .models import SourceConfig
from .pipeline import run_stage0_to_stage8
from .sources.registry import get_source_fetcher
def _json_default(value: Any):
    """``json.dumps`` fallback: serialise dataclasses as dicts, reject the rest."""
    if not is_dataclass(value):
        raise TypeError(f"Object is not JSON serializable: {type(value).__name__}")
    return asdict(value)
def _mock_source_configs() -> list[SourceConfig]:
    """Return the single primary mock source used when ``source_mode`` is mock."""
    mock_source = SourceConfig(name="Mock AI HOT", type="mock", role="primary", priority=10)
    return [mock_source]
def _mock_fetcher(config: SourceConfig, run_date: str) -> list[dict[str, Any]]:
    """Return one fixed raw item; ``config`` and ``run_date`` are ignored."""
    sample = {
        "title_raw": "GPT-5 API 发布",
        "summary_raw": "OpenAI 发布 GPT-5 API用于本地 mock 测试。",
        "url": "https://example.com/gpt5",
        "source_label": "OpenAIBlog",
        "section_hint": "模型发布/更新",
        "origin_type": "mock",
        "language_hint": "zh",
    }
    return [sample]
def _mock_semantic_llm(prompt: str) -> str:
    """Return a fixed reply reporting no duplicates; ``prompt`` is ignored."""
    empty_verdict = {"duplicate_groups": [], "not_duplicates": [], "uncertain": []}
    return json.dumps(empty_verdict, ensure_ascii=False)
def _mock_rewrite_llm(prompt: str) -> str:
    """Echo each prompt item's raw title and summary back as its rewrite."""
    request = json.loads(prompt)
    rewrites = []
    for entry in request["items"]:
        rewrites.append(
            {
                "id": entry["id"],
                "title": entry["title_raw"],
                "summary": entry["summary_raw"],
                "flags": [],
            }
        )
    return json.dumps({"rewrites": rewrites}, ensure_ascii=False)
def _mock_guide_llm(prompt: str) -> str:
    """Return a fixed guide whose single thread cites the first three item ids."""
    request = json.loads(prompt)
    leading_ids = [entry["id"] for entry in request["items"][:3]]
    thread = {
        "title": "本地链路验证",
        "text": "采集、改写、分类、导览、Markdown 和发布报告都已通过 mock 数据串联。",
        "item_ids": leading_ids,
        "kind": "thread",
    }
    guide = {
        "theme": "本地 mock 模式已生成 AI 日报,用于验证流水线。",
        "threads": [thread],
    }
    return json.dumps(guide, ensure_ascii=False)
def run_daily_report(
    *,
    run_date: str,
    mode: str,
    source_mode: str,
    llm_mode: str,
    out_dir: Path,
    base_url: str,
    sources_path: Path | None = None,
    fetch_text=None,
    env: dict[str, str] | None = None,
    llm_client_factory=OpenAICompatibleClient,
    blog_client_factory=BlogApiClient,
) -> dict[str, Any]:
    """Wire up sources, LLM and blog client, run the pipeline, write artifacts.

    Args:
        run_date: Report date; also the name of the output sub-directory.
        mode: Publish mode; ``"draft"`` and ``"publish"`` require a blog token.
        source_mode: ``"mock"`` for built-in sample data, ``"live"`` for the
            sources listed in ``sources_path``.
        llm_mode: ``"mock"`` for canned replies, ``"live"`` for a real client.
        out_dir: Directory under which ``<run_date>/`` is created.
        base_url: Blog base URL.
        sources_path: Source config file; defaults to ``config/sources.json``
            in live mode.
        fetch_text: Text fetcher override; falls back to the default fetcher.
        env: Environment mapping; loaded via ``load_env`` when ``None``.
        llm_client_factory: Factory called with the resolved LLM config.
        blog_client_factory: Factory called with ``base_url`` and ``token``.

    Returns:
        A dict with ``run_dir``, ``markdown``, ``reports`` and ``publish``.

    Raises:
        ValueError: For an unknown ``source_mode``/``llm_mode`` or when a blog
            token is required but missing.
    """
    if not fetch_text:
        fetch_text = default_fetch_text
    if env is None:
        env = load_env()

    # --- sources -----------------------------------------------------------
    if source_mode == "mock":
        source_configs = _mock_source_configs()
        fetcher = _mock_fetcher
    elif source_mode == "live":
        config_file = Path("config") / "sources.json" if sources_path is None else sources_path
        source_configs = load_source_configs(config_file)

        def fetcher(config: SourceConfig, current_date: str) -> list[dict[str, Any]]:
            # Dispatch on the source type and inject the shared text fetcher.
            fetch_source = get_source_fetcher(config.type)
            return fetch_source(config, current_date, fetch_text)

    else:
        raise ValueError("source_mode must be 'mock' or 'live'")

    # --- LLM callables -----------------------------------------------------
    if llm_mode == "mock":
        semantic_llm_call = _mock_semantic_llm
        rewrite_llm_call = _mock_rewrite_llm
        guide_llm_call = _mock_guide_llm
    elif llm_mode == "live":
        llm_client = llm_client_factory(**resolve_llm_config(env))
        # One client serves all three LLM stages.
        semantic_llm_call = rewrite_llm_call = guide_llm_call = llm_client.chat
    else:
        raise ValueError("llm_mode must be 'mock' or 'live'")

    # --- blog client -------------------------------------------------------
    blog_client = None
    if mode in ("draft", "publish"):
        token = resolve_blog_token(env)
        if not token:
            raise ValueError("missing_blog_token: set BLOG_SERVICE_TOKEN or EPHRON_SERVICE_TOKEN")
        blog_client = blog_client_factory(base_url=base_url, token=token)

    result = run_stage0_to_stage8(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
        mode=mode,
        base_url=base_url,
        client=blog_client,
    )

    # --- artifacts ---------------------------------------------------------
    run_dir = out_dir / run_date
    run_dir.mkdir(parents=True, exist_ok=True)
    markdown = result["markdown"]
    reports = result["reports"]
    (run_dir / "blog_markdown.md").write_text(markdown, encoding="utf-8")
    report_json = json.dumps(reports, ensure_ascii=False, indent=2, default=_json_default)
    (run_dir / "run_report.json").write_text(report_json, encoding="utf-8")
    return {
        "run_dir": str(run_dir),
        "markdown": markdown,
        "reports": reports,
        "publish": result["publish"],
    }

View File

@@ -0,0 +1,167 @@
from __future__ import annotations
import json
from typing import Any, Callable
from .llm import parse_json_object
from .models import NewsItem
# Callable that receives the JSON prompt string and returns the raw LLM reply text.
SemanticLlmCall = Callable[[str], str]
def _build_prompt(items: list[NewsItem], candidates: list[dict[str, Any]]) -> str:
    """Serialise items and candidate groups into the semantic-dedup JSON prompt.

    Display title/summary are preferred; the raw fields are used when the
    display fields are empty. Non-ASCII text is kept as-is.
    """
    described_items = []
    for news in items:
        described_items.append(
            {
                "id": news.id,
                "title": news.title or news.title_raw,
                "summary": news.summary or news.summary_raw,
                "source": news.source_label,
                "section_hint": news.section_hint,
            }
        )
    group_schema = {
        "keep_id": "item id",
        "remove_ids": ["item id"],
        "confidence": "high|medium|low",
        "reason": "same concrete event reason",
    }
    request = {
        "task": "Identify only high-confidence semantic duplicates. Do not curate or remove by importance.",
        "items": described_items,
        "candidates": candidates,
        "output_schema": {
            "duplicate_groups": [group_schema],
            "not_duplicates": [],
            "uncertain": [],
        },
    }
    return json.dumps(request, ensure_ascii=False)
def _score(item: NewsItem) -> int:
    """Return the local keep-score of ``item``; higher means better to keep."""
    # Smaller source_priority values yield a larger base, floored at zero.
    base = max(0, 200 - item.source_priority)
    primary_bonus = 10 if item.source_role == "primary" else 0
    # Summary length counts, capped at 40 points.
    summary_bonus = min(40, len(item.summary_raw)) if item.summary_raw else 0
    url_bonus = 20 if item.canonical_url else 0
    flag_penalty = len(item.quality_flags) * 10
    return base + primary_bonus + summary_bonus + url_bonus - flag_penalty
def _choose_keep(group_items: list[NewsItem], suggested_keep_id: str) -> NewsItem:
    """Pick the item to keep from a duplicate group.

    The first item whose id matches the LLM's suggestion wins if its score is
    within 10 points of the best local score; otherwise the best-scoring item
    is kept.
    """
    best = max(group_items, key=_score)
    for candidate in group_items:
        if candidate.id != suggested_keep_id:
            continue
        if _score(candidate) >= _score(best) - 10:
            return candidate
        # Only the first match of the suggested id is considered.
        break
    return best
def semantic_dedup_items(
    items: list[NewsItem],
    candidates: list[dict[str, Any]],
    *,
    llm_call: SemanticLlmCall,
    max_deletion_ratio: float = 0.5,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Remove high-confidence semantic duplicates confirmed by the LLM.

    The LLM only proposes groups; local checks decide what is removed. A
    group is applied only when its confidence is ``"high"``, every id is a
    known item id, the group lies inside one candidate set, and it does not
    conflict with a group accepted earlier. The kept item is chosen by
    ``_choose_keep`` and receives a ``duplicate_sources`` entry per removal.

    Args:
        items: Items to deduplicate. Kept items are mutated in place.
        candidates: Candidate groups; each one's ``item_ids`` bounds what the
            LLM may merge.
        llm_call: Callable receiving the prompt and returning the raw reply.
        max_deletion_ratio: If removals would exceed this share of ``items``,
            nothing is removed.

    Returns:
        ``(items_after_dedup, report)``. ``items`` comes back unchanged when
        there is nothing to check, when the LLM call or reply parsing fails,
        or when the deletion ratio is exceeded.
    """

    def _report(
        *,
        removed_count: int,
        duplicate_groups: list[dict[str, Any]],
        uncertain: list[Any],
        errors: list[str],
        skipped: bool,
    ) -> dict[str, Any]:
        # Single place that fixes the report shape for every exit path.
        return {
            "input_count": len(items),
            "candidate_group_count": len(candidates),
            "removed_count": removed_count,
            "duplicate_groups": duplicate_groups,
            "uncertain": uncertain,
            "errors": errors,
            "skipped_for_deletion_ratio": skipped,
        }

    if not items or not candidates:
        return items, _report(removed_count=0, duplicate_groups=[], uncertain=[], errors=[], skipped=False)

    try:
        obj = parse_json_object(llm_call(_build_prompt(items, candidates)))
    except Exception as exc:
        # Any LLM or parsing failure keeps the input untouched.
        return items, _report(
            removed_count=0,
            duplicate_groups=[],
            uncertain=[],
            errors=[f"{type(exc).__name__}: {exc}"],
            skipped=False,
        )

    errors: list[str] = []
    by_id = {item.id: item for item in items}
    candidate_sets = {
        frozenset(item_id for item_id in candidate.get("item_ids", []) if isinstance(item_id, str))
        for candidate in candidates
    }
    candidate_removals: set[str] = set()
    # Ids chosen as the survivor of a group that removes at least one item.
    protected_keep_ids: set[str] = set()
    valid_groups: list[dict[str, Any]] = []
    for group in obj.get("duplicate_groups", []) or []:
        if not isinstance(group, dict):
            errors.append(f"invalid_group: {group}")
            continue
        if group.get("confidence") != "high":
            continue
        remove_ids = group.get("remove_ids") or []
        if not isinstance(remove_ids, list):
            errors.append(f"invalid_ids_in_group: {group}")
            continue
        raw_ids = [group.get("keep_id"), *remove_ids]
        if any(not isinstance(item_id, str) or item_id not in by_id for item_id in raw_ids):
            errors.append(f"invalid_ids_in_group: {group}")
            continue
        # Drop repeated ids so an item is never recorded as a duplicate twice.
        ids = list(dict.fromkeys(raw_ids))
        group_set = frozenset(ids)
        if not any(group_set.issubset(candidate_set) for candidate_set in candidate_sets):
            errors.append(f"group_outside_candidates: {group}")
            continue
        group_items = [by_id[item_id] for item_id in ids]
        keep = _choose_keep(group_items, str(group.get("keep_id")))
        remove_items = [item for item in group_items if item is not keep]
        # Reject groups that contradict an accepted one. Otherwise "A keeps,
        # B goes" plus "B keeps, A goes" would delete every copy of the event,
        # and provenance could be attached to an item that is itself removed.
        if keep.id in candidate_removals or any(item.id in protected_keep_ids for item in remove_items):
            errors.append(f"conflicting_group: {group}")
            continue
        if remove_items:
            protected_keep_ids.add(keep.id)
        candidate_removals.update(item.id for item in remove_items)
        valid_groups.append(
            {
                "keep_id": keep.id,
                "remove_ids": [item.id for item in remove_items],
                "confidence": "high",
                "reason": str(group.get("reason") or "semantic_duplicate"),
            }
        )

    uncertain = obj.get("uncertain", []) or []
    deletion_ratio = len(candidate_removals) / len(items) if items else 0
    if deletion_ratio > max_deletion_ratio:
        # Safety valve: an implausibly large deletion is skipped entirely.
        return items, _report(
            removed_count=0,
            duplicate_groups=valid_groups,
            uncertain=uncertain,
            errors=errors,
            skipped=True,
        )

    removed_ids: set[str] = set()
    for group in valid_groups:
        keep = by_id[group["keep_id"]]
        for remove_id in group["remove_ids"]:
            removed = by_id[remove_id]
            # Preserve provenance of the removed item on the survivor.
            keep.duplicate_sources.append(
                {
                    "id": removed.id,
                    "source_group": removed.source_group,
                    "source_label": removed.source_label,
                    "url": removed.url,
                    "reason": group["reason"],
                }
            )
            removed_ids.add(remove_id)
    deduped = [item for item in items if item.id not in removed_ids]
    return deduped, _report(
        removed_count=len(removed_ids),
        duplicate_groups=valid_groups,
        uncertain=uncertain,
        errors=errors,
        skipped=False,
    )

View File

@@ -0,0 +1,2 @@
"""Source adapters for the AI daily report pipeline."""

View File

@@ -0,0 +1,32 @@
from __future__ import annotations
import json
from typing import Any, Callable
from ai_daily_report.models import SourceConfig
# Text fetcher: called as fetch_text(url, timeout_seconds) and returns the response body.
FetchText = Callable[[str, int], str]
def fetch_aihot(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Fetch the AI HOT daily JSON for ``run_date`` and flatten it to raw items.

    Every item of every section becomes one dict. Missing fields fall back to
    empty strings, and a missing source name falls back to ``config.name``.
    All items share the feed-level ``generatedAt`` value as ``published_at``.
    """
    endpoint = f"https://aihot.virxact.com/api/public/daily/{run_date}"
    feed = json.loads(fetch_text(endpoint, config.timeout_seconds))
    generated_at = feed.get("generatedAt")
    return [
        {
            "source_group": config.name,
            "source_label": entry.get("sourceName") or config.name,
            "title_raw": entry.get("title") or "",
            "summary_raw": entry.get("summary") or "",
            "url": entry.get("sourceUrl") or "",
            "published_at": generated_at,
            "origin_type": "aihot_json",
            "section_hint": section.get("label") or "",
            "language_hint": "zh",
        }
        for section in feed.get("sections", []) or []
        for entry in section.get("items", []) or []
    ]

View File

@@ -0,0 +1,58 @@
from __future__ import annotations
import re
import xml.etree.ElementTree as ET
from typing import Any, Callable
from ai_daily_report.models import SourceConfig
from ai_daily_report.normalize import clean_text
from ai_daily_report.sources.labels import source_label_from_url
# Text fetcher: called as fetch_text(url, timeout_seconds) and returns the response body.
FetchText = Callable[[str, int], str]
def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]:
    """Extract the news entries of the Juya issue whose title equals ``run_date``.

    The RSS feed is searched for the first ``<item>`` whose title is exactly
    ``run_date``. Its ``content:encoded`` HTML is split into numbered ``<h2>``
    blocks, and each block with a non-empty title becomes one raw item.

    Args:
        config: Source configuration; ``config.name`` labels the items.
        xml_text: Raw RSS XML.
        run_date: Issue title to look for.

    Returns:
        Raw item dicts, or an empty list when no issue matches or the matching
        issue has no content.
    """
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    raw_items = channel.findall("item") if channel is not None else []
    article_html = ""
    for raw in raw_items:
        if (raw.findtext("title") or "").strip() != run_date:
            continue
        content_el = raw.find("{http://purl.org/rss/1.0/modules/content/}encoded")
        article_html = content_el.text if content_el is not None and content_el.text else ""
        break
    if not article_html:
        return []
    # The closing tag is written "(?:</a>)?" so it is optional as a whole,
    # matching the optional opening anchor. The earlier "</a>?" made only the
    # ">" optional, so headings without an anchor never matched.
    block_pattern = re.compile(
        r'<h2[^>]*>\s*(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?(?P<title_html>[^<]*?)(?:</a>)?\s*<code>#(?P<num>\d+)</code>\s*</h2>(?P<body>.*?)(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)',
        re.S | re.I,
    )
    items: list[dict[str, Any]] = []
    for match in block_pattern.finditer(article_html):
        title = clean_text(match.group("title_html") or "")
        body_html = match.group("body") or ""
        links = re.findall(r'<a[^>]*href="([^"]+)"[^>]*>', body_html, re.I)
        # Prefer the first link in the body, then fall back to the heading's
        # link. Both are href values, so "&amp;" is unescaped either way.
        raw_url = links[0] if links else (match.group("title_url") or "")
        url = raw_url.replace("&amp;", "&").strip()
        summary = clean_text(re.sub(r"<[^>]+>", " ", body_html))
        if title:
            items.append(
                {
                    "source_group": config.name,
                    "source_label": source_label_from_url(url, fallback=config.name),
                    "title_raw": title,
                    # Summaries are capped at 500 characters.
                    "summary_raw": summary[:500],
                    "url": url,
                    "published_at": None,
                    "origin_type": "juya_issue",
                    "section_hint": "",
                    "language_hint": "zh",
                }
            )
    return items
def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the Juya RSS feed at ``config.url`` and parse the ``run_date`` issue."""
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_juya_rss(config, feed_xml, run_date)

View File

@@ -0,0 +1,78 @@
from __future__ import annotations
from urllib.parse import urlparse
# Display label per known domain. _domain_label matches a host against these
# keys in insertion order, accepting the exact domain or any of its subdomains.
DOMAIN_LABELS = {
    "anthropic.com": "Anthropic",
    "arxiv.org": "arXiv",
    "bloomberg.com": "Bloomberg",
    "deepseek.com": "DeepSeek",
    "github.blog": "GitHub Blog",
    "github.com": "GitHub",
    "huggingface.co": "Hugging Face",
    "infoq.com": "InfoQ",
    "mp.weixin.qq.com": "微信公众号",
    "openai.com": "OpenAI",
    "platform.minimaxi.com": "MiniMaxDocs",
    "qbitai.com": "量子位",
    "techcrunch.com": "TechCrunch",
    "technologyreview.com": "MIT科技评论AI",
    "theverge.com": "The Verge",
    # Both X domains share the same label.
    "x.com": "X",
    "twitter.com": "X",
}
# Display name per X handle (the first path segment of an X URL), used by
# source_label_from_url; handles missing here are shown as the handle itself.
X_DISPLAY_NAMES = {
    "MiniMax_AI": "MiniMax",
    "OpenAIDevs": "OpenAI Developers",
    "openai": "OpenAI",
    "openclaw": "OpenClaw",
    "xai": "xAI",
    "krea_ai": "Krea AI",
    "nvidia": "NVIDIA",
    "NVIDIAAI": "NVIDIA AI",
    "alibaba_cloud": "阿里云 / Alibaba Cloud",
    "cb_doge": "cb_doge",
}
def _host(url: str) -> str:
    """Return the lowercase host of ``url`` without port, userinfo or ``www.``.

    ``hostname`` is used instead of ``netloc`` because ``netloc`` keeps any
    port or ``user@`` prefix; ``openai.com:443`` would then never match a
    known domain. URLs without a network location yield an empty string.
    """
    host = (urlparse(url).hostname or "").lower()
    return host[4:] if host.startswith("www.") else host
def _domain_label(host: str) -> str:
    """Return the label of the first known domain matching ``host``, else ``host``.

    A known domain matches when ``host`` equals it or is one of its subdomains.
    """
    matching_labels = (
        label
        for domain, label in DOMAIN_LABELS.items()
        if host == domain or host.endswith("." + domain)
    )
    return next(matching_labels, host)
def _x_handle(url: str) -> str:
    """Return the account handle of an X URL, or ``""`` when there is none.

    The handle is the first non-empty path segment, unless that segment is
    one of the site's own routes listed below.
    """
    site_routes = {"i", "search", "explore", "settings", "notifications", "home", "compose"}
    segments = urlparse(url).path.split("/")
    first_segment = next((segment for segment in segments if segment), "")
    return "" if first_segment in site_routes else first_segment
def source_label_from_url(url: str, *, fallback: str = "来源") -> str:
    """Derive a human-readable source label from ``url``.

    Args:
        url: Item URL; an empty value yields ``fallback``.
        fallback: Label used when nothing can be derived from ``url``.

    Returns:
        For X/Twitter links, a label built from the account's display name and
        handle, or plain ``"X"`` when the URL has no handle. Otherwise the
        domain label, with a ``Blog`` suffix when the host or path looks like
        a blog or research page, or ``fallback`` when the host is empty.
    """
    if not url:
        return fallback
    host = _host(url)
    if host in {"x.com", "twitter.com"}:
        handle = _x_handle(url)
        if not handle:
            return "X"
        display = X_DISPLAY_NAMES.get(handle)
        if display is None:
            # Handles are matched case-insensitively, so ".../OpenAI" resolves
            # the same display name as ".../openai".
            folded = handle.casefold()
            display = next(
                (name for key, name in X_DISPLAY_NAMES.items() if key.casefold() == folded),
                handle,
            )
        return f"X{display} (@{handle})"
    label = _domain_label(host)
    path = (urlparse(url).path or "").lower()
    looks_like_blog = "blog" in host or "/blog" in path or "/research" in path
    # Labels already ending in "Blog" (e.g. "GitHub Blog") get no second suffix.
    if label and looks_like_blog and not label.lower().endswith("blog"):
        return f"{label}Blog"
    return label or fallback

View File

@@ -0,0 +1,24 @@
from __future__ import annotations
from typing import Callable
from ai_daily_report.models import SourceConfig
from ai_daily_report.sources.aihot import fetch_aihot
from ai_daily_report.sources.juya import fetch_juya
from ai_daily_report.sources.rss import fetch_rss
# Source adapter signature: (config, run_date, fetch_text) -> list of raw item dicts.
SourceFetcher = Callable[[SourceConfig, str, Callable[[str, int], str]], list[dict]]
# Maps a source config "type" value to its adapter; looked up by get_source_fetcher.
SOURCE_FETCHERS: dict[str, SourceFetcher] = {
    "aihot": fetch_aihot,
    "rss": fetch_rss,
    "juya_rss": fetch_juya,
}
def get_source_fetcher(source_type: str) -> SourceFetcher:
    """Return the adapter registered for ``source_type``.

    Raises:
        KeyError: If no adapter is registered for ``source_type``.
    """
    fetcher = SOURCE_FETCHERS.get(source_type)
    if fetcher is None:
        raise KeyError(f"Unknown source type: {source_type}")
    return fetcher

View File

@@ -0,0 +1,51 @@
from __future__ import annotations
import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime
from typing import Any, Callable
from ai_daily_report.models import SourceConfig
from ai_daily_report.normalize import clean_text
# Text fetcher: called as fetch_text(url, timeout_seconds) and returns the response body.
FetchText = Callable[[str, int], str]
def _parse_pubdate(value: str) -> str | None:
    """Convert an RFC 2822 style ``pubDate`` to ISO 8601, or ``None`` if unusable."""
    if value:
        try:
            return parsedate_to_datetime(value).isoformat()
        except Exception:
            # Unparseable dates are treated the same as a missing date.
            pass
    return None
def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]:
    """Parse an RSS 2.0 document into raw item dicts.

    Only the first ``limit`` ``<item>`` elements of the channel are examined,
    and entries whose cleaned title is empty are skipped. ``language_hint`` is
    ``"en"`` for pure-ASCII titles and ``"zh"`` otherwise.
    """
    channel = ET.fromstring(xml_text).find("channel")
    entries = [] if channel is None else channel.findall("item")
    parsed: list[dict[str, Any]] = []
    for entry in entries[:limit]:
        title = clean_text(entry.findtext("title") or "")
        if not title:
            continue
        summary = clean_text(entry.findtext("description") or "")
        link = (entry.findtext("link") or "").strip()
        published = _parse_pubdate(entry.findtext("pubDate") or "")
        parsed.append(
            {
                "source_group": config.name,
                "source_label": config.name,
                "title_raw": title,
                "summary_raw": summary,
                "url": link,
                "published_at": published,
                "origin_type": "rss",
                "section_hint": "",
                "language_hint": "en" if title.isascii() else "zh",
            }
        )
    return parsed
def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the feed at ``config.url`` and parse it; ``run_date`` is unused."""
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_rss_items(config, feed_xml)

View File

@@ -0,0 +1,46 @@
from __future__ import annotations
import re
from typing import Any
from .classify import SECTION_ORDER
from .models import NewsItem
def validate_report_markdown(markdown: str, items: list[NewsItem]) -> dict[str, Any]:
    """Alias of ``validate_markdown``; returns its report unchanged."""
    report = validate_markdown(markdown, items)
    return report
def validate_markdown(markdown: str, items: list[NewsItem]) -> dict[str, Any]:
    """Check the assembled Markdown and its items before publishing.

    Args:
        markdown: Assembled report text; ``None`` is treated as empty.
        items: The items the report was built from.

    Returns:
        A report with ``item_count``, ``section_count``, ``markdown_length``,
        ``auto_fixes`` (detected cosmetic issues), ``warnings`` (one
        ``missing_url`` entry per item without a URL) and ``blocking_errors``
        (reasons the report must not be published).
    """
    # Normalise once so every check tolerates None. Previously the "## "
    # check used the raw value and raised TypeError for None.
    text = markdown or ""
    blocking_errors: list[str] = []
    auto_fixes: list[str] = []
    warnings: list[dict[str, str]] = []
    if not items:
        blocking_errors.append("no_items")
    if len(text.strip()) < 80:
        blocking_errors.append("markdown_too_short")
    if items and "## " not in text:
        blocking_errors.append("no_sections")
    # A brace pair is taken as leaked JSON from an LLM reply.
    if re.search(r"\{[^{}]*\}", text):
        blocking_errors.append("json_fragment_detected")
    if "> >" in text:
        auto_fixes.append("double_blockquote_detected")
    if re.search(r"\[\d+\]|\[N\]", text):
        auto_fixes.append("reference_marker_detected")
    has_invalid_section = False
    for item in items:
        # Keep scanning after an invalid section so every item without a URL
        # still gets its warning; the error itself is reported once.
        if not item.url:
            warnings.append({"type": "missing_url", "item_id": item.id})
        if item.section not in SECTION_ORDER:
            has_invalid_section = True
    if has_invalid_section:
        blocking_errors.append("invalid_section")
    return {
        "item_count": len(items),
        "section_count": len({item.section for item in items if item.section}),
        "markdown_length": len(text),
        "auto_fixes": auto_fixes,
        "warnings": warnings,
        "blocking_errors": blocking_errors,
    }

16
config/pipeline.json Normal file
View File

@@ -0,0 +1,16 @@
{
"sections": [
"模型与能力",
"产品与应用",
"开发与基础设施",
"公司与资本",
"政策与安全",
"论文与研究",
"观点与教程",
"人物与动态"
],
"rewrite_batch_size": 10,
"semantic_dedup_max_deletion_ratio": 0.5,
"default_mode": "dry-run"
}

58
config/sources.json Normal file
View File

@@ -0,0 +1,58 @@
[
{
"name": "AI HOT",
"type": "aihot",
"role": "primary",
"required": true,
"priority": 10,
"timeout_seconds": 25,
"retries": 2,
"min_items": 10,
"enabled": true
},
{
"name": "InfoQ AI",
"type": "rss",
"url": "https://feed.infoq.com/ai-ml-data-eng/",
"role": "supplement",
"required": false,
"priority": 40,
"timeout_seconds": 25,
"retries": 1,
"enabled": true
},
{
"name": "MIT科技评论AI",
"type": "rss",
"url": "https://www.technologyreview.com/topic/artificial-intelligence/feed",
"role": "supplement",
"required": false,
"priority": 50,
"timeout_seconds": 25,
"retries": 1,
"enabled": true
},
{
"name": "量子位",
"type": "rss",
"url": "https://www.qbitai.com/feed",
"role": "supplement",
"required": false,
"priority": 30,
"timeout_seconds": 25,
"retries": 1,
"enabled": true
},
{
"name": "橘鸦AI早报",
"type": "juya_rss",
"url": "https://imjuya.github.io/juya-ai-daily/rss.xml",
"role": "supplement",
"required": false,
"priority": 20,
"timeout_seconds": 45,
"retries": 2,
"enabled": true
}
]

View File

@@ -0,0 +1,786 @@
# AI Daily Report Pipeline Optimization Plan
## Objective
This project should become a stable, long-running AI daily report system for Hermes, OpenClaw, and similar agents. The goal is not only to keep the current script runnable, but to make the whole pipeline observable, replayable, maintainable, and safe to run on a daily schedule.
The recommended direction is:
```text
stable core library + CLI + skill wrapper
```
Core business logic should live in deterministic code. The skill should describe how agents run, diagnose, replay, publish, and extend the pipeline.
## Stage Model
Use this stage model going forward:
```text
Stage 0: Collect Sources
Stage 1: Normalize Items
Stage 2: Hard Dedup
Stage 3: Semantic Dedup
Stage 4: Rewrite Titles and Summaries
Stage 5: Classify and Order
Stage 6: Guide and Daily Threads
Stage 7: Assemble and Validate Markdown
Stage 8: Publish and Deliver
```
The current script names script-level deduplication as Stage 0. That should be treated as old terminology. In the long-term pipeline, the first stage is source collection.
## Architecture
Recommended structure:
```text
ai-daily-report/
├── ai_daily_report/
│ ├── models.py
│ ├── sources/
│ │ ├── aihot.py
│ │ ├── rss.py
│ │ ├── juya.py
│ │ └── registry.py
│ ├── collect.py
│ ├── normalize.py
│ ├── dedupe.py
│ ├── llm.py
│ ├── rewrite.py
│ ├── classify.py
│ ├── assemble.py
│ ├── validate.py
│ ├── publish.py
│ └── cli.py
├── config/
│ ├── sources.json
│ └── pipeline.json
├── docs/
├── skill/
│ ├── SKILL.md
│ ├── scripts/
│ └── references/
├── tests/
│ └── fixtures/
└── script/
└── ai_daily_blog_pipeline.py
```
Keep `script/ai_daily_blog_pipeline.py` as a compatibility entrypoint during migration, but move implementation into importable modules.
## Data Model
### SourceResult
Every data source should return a structured result:
```json
{
"source": "AI HOT",
"role": "primary",
"ok": true,
"status": "ok",
"items": [],
"error": null,
"elapsed_ms": 820,
"retry_count": 0,
"fetched_at": "2026-06-04T10:00:00+08:00"
}
```
Supported statuses:
```text
ok
empty
not_ready
timeout
http_error
parse_error
disabled
```
### NewsItem
All raw source items should be normalized into one structure:
```json
{
"id": "item_...",
"source_group": "AI HOT",
"source_label": "OpenAI: Blog",
"source_role": "primary",
"source_priority": 10,
"title_raw": "...",
"title_norm": "...",
"summary_raw": "...",
"title": null,
"summary": null,
"url": "...",
"canonical_url": "...",
"published_at": "...",
"collected_at": "...",
"origin_type": "aihot_json",
"section_hint": "...",
"section": null,
"language_hint": "zh",
"quality_flags": [],
"duplicate_sources": []
}
```
Do not overwrite raw fields with LLM output. Keep display fields separate.
## Stage 0: Collect Sources
### Goal
Collect candidate news from all configured sources in a stable, observable, and recoverable way.
### Design
Use a primary-plus-supplement model at the quality layer, and parallel execution at the scheduling layer.
```text
Quality layer:
AI HOT = primary source
RSS / Juya / InfoQ / QbitAI / MIT = supplement sources
Execution layer:
start all sources concurrently with per-source timeout, retry, and reporting
```
### Source Config
Example:
```json
{
"name": "AI HOT",
"type": "aihot",
"role": "primary",
"required": true,
"priority": 10,
"timeout_seconds": 20,
"retries": 2,
"min_items": 10,
"enabled": true
}
```
Supplement source example:
```json
{
"name": "Juya AI Daily",
"type": "juya_rss",
"url": "https://imjuya.github.io/juya-ai-daily/rss.xml",
"role": "supplement",
"required": false,
"priority": 20,
"timeout_seconds": 45,
"retries": 2,
"enabled": true
}
```
### Optimizations
- Run all sources, primary and supplement, concurrently.
- Do not let one slow source block the whole pipeline.
- Replace the fixed Juya `sleep(120)` with bounded short retries and a clear `not_ready` or `timeout` status.
- Treat AI HOT 404 as "not ready" rather than a generic failure.
- Allow degraded generation if the primary source has a temporary network failure and supplement sources are usable.
- Persist raw source results for replay.
### Artifacts
```text
source_results.json
raw_items.json
stage0_collect_report.json
```
## Stage 1: Normalize Items
### Goal
Convert heterogeneous source output into clean, comparable, traceable `NewsItem` objects.
### Optimizations
- Normalize text with HTML stripping, entity decoding, whitespace cleanup, and RSS boilerplate removal.
- Generate stable `id` values from canonical URL when possible, otherwise from source, normalized title, and date.
- Canonicalize URLs:
- Lowercase scheme and host.
- Remove `utm_*`, `fbclid`, `gclid`, `spm`, `from`, and fragments.
- Normalize trailing slashes.
- Normalize `twitter.com` and `x.com` URLs.
- Generate `title_norm`:
- Unicode NFKC normalization.
- Lowercase English text.
- Normalize whitespace and weak punctuation.
- Preserve numbers, versions, model names, and product names.
- Standardize source labels:
- X links as `X:@username`.
- Official blogs as `OpenAI: Blog`, `Google Research: Blog`, etc.
- Avoid generic labels such as "technology media" when a domain label is available.
- Add `quality_flags` instead of silently dropping items:
- `missing_url`
- `missing_summary`
- `short_title`
- `bad_url`
- `old_item`
- `parse_suspect`
### Non-goals
- Do not dedupe.
- Do not rewrite content.
- Do not call the LLM.
- Do not remove items based on importance.
### Artifacts
```text
normalized_items.json
stage1_normalize_report.json
```
## Stage 2: Hard Dedup
### Goal
Remove only high-confidence duplicates with deterministic rules. Mark uncertain similarities for Stage 3.
### Rules
High-confidence removal:
- Same canonical URL.
- Same normalized title.
- Same platform entity, such as the same X status ID.
- Same source and same exact normalized title.
Uncertain cases:
- Similar title but different URL.
- Same company or model, but unclear whether the event is identical.
- Same topic across multiple sources with different factual details.
Uncertain cases should be recorded in `possible_duplicates` and left in place, not removed.
### Replacement for Current Logic
The current `SequenceMatcher > 0.7` direct deletion is too risky. Replace it with:
- Exact deterministic deletion.
- Similarity-based candidate marking only.
### Keep Item Selection
When merging a duplicate group, keep the item with the highest local score:
```text
official source bonus
+ primary source bonus
+ source priority
+ has URL
+ has summary
+ has section hint
+ newer published_at
- quality flag penalty
```
Attach removed items to `duplicate_sources` on the kept item.
### Artifacts
```text
deduped_items.json
stage2_dedupe_report.json
```
## Stage 3: Semantic Dedup
### Goal
Use the LLM to identify semantic duplicates that deterministic rules cannot safely remove.
### Principles
- The LLM judges duplicate candidates; local code enforces safety.
- The LLM must not select, curate, or remove items by importance.
- Only remove `confidence = high` duplicate groups.
- Treat medium, low, or uncertain results as non-removal: keep every item in such groups.
### Input
Prefer candidate groups from Stage 2. Avoid sending all items at once unless the item count is small.
Example item payload:
```json
{
"id": "item_123",
"title": "...",
"summary": "...",
"source": "QbitAI",
"url_host": "qbitai.com",
"published_at": "...",
"section_hint": "Company and Capital"
}
```
### Output Schema
```json
{
"duplicate_groups": [
{
"keep_id": "item_123",
"remove_ids": ["item_456"],
"confidence": "high",
"reason": "Both items report the same concrete event."
}
],
"not_duplicates": [],
"uncertain": []
}
```
### Safety Checks
- Validate all IDs exist.
- Validate confidence values.
- Apply local keep-item scoring instead of blindly trusting `keep_id`.
- Skip deletion if the deletion ratio exceeds a configured threshold.
- Skip deletion when versions, product names, or dates conflict.
### Failure Behavior
If timeout, JSON parse failure, or schema validation failure occurs, keep Stage 2 output and continue.
### Artifacts
```text
semantic_dedup_input.json
semantic_dedup_output.json
stage3_semantic_dedup_report.json
```
## Stage 4: Rewrite Titles and Summaries
### Goal
Produce concise, accurate Chinese display titles and summaries.
### Rules
- Keep `title_raw` and `summary_raw` unchanged.
- Write display fields to `title` and `summary`.
- Preserve brand names, model names, API names, and common technical acronyms in English.
- Translate the rest into natural Chinese.
- Avoid marketing words such as "heavyweight", "explosive", or "just now" unless they are factual and necessary.
- Summaries should be factual, concise, and usually 80-140 Chinese characters.
- Do not add facts not present in the raw title or summary.
- Do not write advice or commentary.
### Batch Strategy
- Process 8-12 items per batch.
- Allow limited parallel batches.
- Retry a failed batch once.
- Fall back per item or per batch if needed.
### Validation
Check:
- Non-empty title and summary.
- No markdown links in title.
- No URL in summary.
- No `[N]` or reference markers.
- No emoji.
- Summary length under limit.
- Key numbers, versions, and model names are preserved when present in raw input.
### Artifacts
```text
rewritten_items.json
rewrite_llm_outputs.json
stage4_rewrite_report.json
```
## Stage 5: Classify and Order
### Goal
Place each item into a stable section and order items for readable scanning.
### Recommended Sections
Use a fixed section whitelist:
```text
模型与能力
产品与应用
开发与基础设施
公司与资本
政策与安全
论文与研究
观点与教程
人物与动态
```
Hide empty sections. Do not create dynamic section names.
### Classification Strategy
Use a three-layer approach:
1. Source hint mapping.
2. Local rule fallback.
3. LLM classification for ambiguous items only.
Example alias mapping:
```text
模型发布/更新 -> 模型与能力
产品发布/更新 -> 产品与应用
产品与工具 -> 产品与应用
开发与工程 -> 开发与基础设施
行业动态 -> 公司与资本
行业与公司 -> 公司与资本
论文研究 -> 论文与研究
技巧与观点 -> 观点与教程
人物与花絮 -> 人物与动态
```
### Ordering Strategy
Do not let the LLM freely order all items. Use local scoring:
```text
rank_score =
source priority
+ official source bonus
+ primary source bonus
+ recency score
+ key metric bonus
+ duplicate source bonus
- quality flag penalty
```
Ordering is for readability only. It must not remove items.
### Artifacts
```text
classified_items.json
stage5_classify_order_report.json
```
## Stage 6: Guide and Daily Threads
### Goal
Generate a concise top guide and a bottom "daily threads" section that helps readers understand the day's shape without turning the report into an investment memo.
### Replace Current Summary Style
Do not use:
```text
强信号 / 中信号 / 待验证
```
This style feels too much like an industry rating or investment brief.
Use:
```text
导览
今日脉络
仍待确认, when needed
```
### Output Schema
The LLM should output structured JSON, not Markdown:
```json
{
"theme": "One concise daily theme.",
"threads": [
{
"title": "模型能力继续向长上下文、实时语音、多模态生成推进",
"text": "MiniMax M3、Miso One、Ideogram v4.0 分别从长上下文解码、语音克隆和图像生成质量上更新能力边界。",
"item_ids": ["item_1", "item_2", "item_3"],
"kind": "thread"
},
{
"title": "仍待确认",
"text": "融资传闻、排行榜和单源爆料类消息需要等待官方或更多来源确认。",
"item_ids": ["item_8"],
"kind": "uncertain"
}
]
}
```
### Rules
- Theme should be one paragraph under 120 Chinese characters.
- Output 2-4 threads in total.
- Each thread must bind to existing `item_ids`.
- Do not add facts absent from the item list.
- Do not write advice.
- Do not include reference numbers.
- Do not include Markdown blockquote syntax. Stage 7 will render Markdown.
### Failure Behavior
- If theme generation fails, omit the guide or use a conservative fallback.
- If threads fail, omit `今日脉络`.
- Invalid thread IDs should drop that thread.
### Artifacts
```text
guide_input.json
guide_output.json
stage6_guide_report.json
```
## Stage 7: Assemble and Validate Markdown
### Goal
Render final Markdown deterministically and validate it before publishing.
### Recommended Structure
```markdown
## 导览
> 一句话主线。
## 模型与能力
**1. 新闻标题**
> 新闻摘要。[来源 ↗](https://example.com)
## 今日脉络
- **主题**
说明...
```
### Rendering Rules
- Render Markdown in code only.
- Use global continuous numbering.
- Hide empty sections.
- Add blockquote syntax for the guide in code.
- Strip any leading `>` from LLM-provided theme text before rendering.
- Use source links consistently:
```markdown
[OpenAI: Blog ↗](https://example.com)
```
If URL is unavailable, render the source label without a link.
### Auto-fixes
- Remove `> >`.
- Remove `[N]` and numeric reference markers.
- Remove code fences from guide/thread text.
- Normalize extra blank lines.
- Add missing Chinese punctuation to summaries.
- Remove `主线判断:` prefixes if present.
### Blocking Checks
Block publish or downgrade to draft when:
- Item count is zero.
- No sections are rendered.
- Markdown is abnormally short.
- Section name is outside the whitelist.
- JSON fragments remain in Markdown.
- Link formatting is broadly broken.
- Forbidden advisory language appears in guide/thread text.
### Artifacts
```text
blog_markdown.md
stage7_markdown_report.json
```
## Stage 8: Publish and Deliver
### Goal
Publish only validated Markdown, verify the public page, and make the operation idempotent and recoverable.
### Modes
```text
dry-run
draft
publish
```
### Requirements
- Do not publish when Stage 7 has blocking errors.
- Use a deterministic slug such as `ai-YYYY-MM-DD`.
- Check whether the slug already exists before creating a new post.
- Support existence strategies:
- `skip`
- `update-draft`
- `replace`
- `republish`
- Verify the public URL with retries.
- Preserve Markdown and reports when publishing fails.
- Support publishing from an existing run directory.
### Artifacts
```text
stage8_publish_report.json
run_report.json
```
## Run Directory
Every run should write to an isolated directory:
```text
runs/2026-06-04/
source_results.json
raw_items.json
stage0_collect_report.json
normalized_items.json
stage1_normalize_report.json
deduped_items.json
stage2_dedupe_report.json
semantic_dedup_input.json
semantic_dedup_output.json
stage3_semantic_dedup_report.json
rewritten_items.json
rewrite_llm_outputs.json
stage4_rewrite_report.json
classified_items.json
stage5_classify_order_report.json
guide_input.json
guide_output.json
stage6_guide_report.json
blog_markdown.md
stage7_markdown_report.json
stage8_publish_report.json
run_report.json
```
This makes the pipeline replayable and debuggable.
## CLI
Provide agent-friendly commands:
```bash
ai-daily-report run --date today --mode publish
ai-daily-report run --date today --mode dry-run
ai-daily-report run --date 2026-06-04 --mode draft
ai-daily-report replay --run-id 2026-06-04 --from-stage 4
ai-daily-report publish --from-run 2026-06-04
ai-daily-report status --date 2026-06-04
```
The current cron can keep invoking the compatibility script, which should delegate to the CLI.
## Skill Strategy
Create or update an `ai-daily-report` skill for Hermes/OpenClaw. The skill should not contain business logic. It should provide:
- How to run daily generation.
- How to dry-run.
- How to replay from an existing run.
- How to publish already generated Markdown.
- How to diagnose source, LLM, Markdown, or publish failures.
- How to add a new RSS source.
- How to adjust output style without breaking the pipeline.
Suggested skill references:
```text
skill/references/sources.md
skill/references/output-style.md
skill/references/troubleshooting.md
skill/references/llm-config.md
```
## Testing
Add fixtures and tests for:
- AI HOT sample parsing.
- RSS parsing.
- Juya `content:encoded` parsing.
- URL canonicalization.
- Title normalization.
- Deterministic deduplication.
- LLM JSON schema validation.
- Rewrite output validation.
- Section alias mapping.
- Markdown rendering.
- Markdown validation.
- Publish dry-run behavior.
Start with local fixture tests. They will give most of the stability benefit without needing live network calls.
## Migration Plan
### Phase 1: Stabilize Current Script
- Add run directories.
- Add SourceResult and stage reports.
- Add URL canonicalization.
- Replace risky Stage 0 dedupe with hard dedup.
- Add Markdown validation and auto-fixes.
### Phase 2: Improve Quality
- Add semantic dedup schema and safety checks.
- Batch rewrite title and summary.
- Add section alias mapping and rule-first classification.
- Replace the current summary with `今日脉络`.
### Phase 3: Modularize
- Extract modules under `ai_daily_report/`.
- Add CLI.
- Keep old script as compatibility entrypoint.
- Add fixture tests.
### Phase 4: Skill Integration
- Update `skill/SKILL.md`.
- Add references for sources, style, troubleshooting, and LLM config.
- Make Hermes/OpenClaw call the CLI.
## Success Criteria
The optimized pipeline should satisfy:
- A usable Markdown report is generated whenever enough source data exists.
- Optional source failures degrade the run but do not stop it.
- LLM failures degrade individual stages but do not destroy the whole report.
- No non-duplicate item is removed by importance or editorial selection.
- Every removed duplicate has a reason.
- Every stage writes inspectable artifacts.
- A failed publish can be retried from an existing run.
- Agents can run, diagnose, replay, and publish via stable commands.

View File

@@ -0,0 +1,159 @@
# Local Dry-Run Foundation Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
**Goal:** Make the current pipeline testable on a local machine without Hermes credentials, blog credentials, or live LLM calls.
**Architecture:** Keep the existing single script as the compatibility entrypoint. Add small, tested helpers for project `.env` loading, dry-run token behavior, and mock LLM responses. This creates a safe base for later Stage 0-8 modularization.
**Tech Stack:** Python standard library, `unittest`, current `script/ai_daily_blog_pipeline.py`.
---
### Task 1: Add Local `.env` Loading
**Files:**
- Modify: `script/ai_daily_blog_pipeline.py`
- Create: `tests/test_env_loading.py`
**Step 1: Write the failing test**
Test that `load_env()` reads project-root `.env` values when Hermes env is absent, and that real process environment variables override file values.
**Step 2: Run test to verify it fails**
Run: `python -m unittest tests.test_env_loading -v`
Expected: FAIL because the script currently only reads `~/.hermes/.env`.
**Step 3: Implement minimal code**
Add a helper to parse env files and update `load_env()` to read:
1. Project `.env`
2. `~/.hermes/.env`
3. process environment
Later sources override earlier ones.
**Step 4: Run test to verify it passes**
Run: `python -m unittest tests.test_env_loading -v`
Expected: PASS.
### Task 2: Let Dry-Run Skip Blog Token Requirement
**Files:**
- Modify: `script/ai_daily_blog_pipeline.py`
- Create: `tests/test_dry_run_config.py`
**Step 1: Write the failing test**
Extract small helpers such as `is_dry_run(env)` and `requires_blog_token(env)`, then test:
- `AI_DAILY_DRY_RUN=1` does not require `BLOG_SERVICE_TOKEN`.
- normal publish mode still requires a token.
**Step 2: Run test to verify it fails**
Run: `python -m unittest tests.test_dry_run_config -v`
Expected: FAIL because no helper exists and `main()` checks token before dry-run.
**Step 3: Implement minimal code**
Move dry-run detection before token validation in `main()`.
**Step 4: Run test to verify it passes**
Run: `python -m unittest tests.test_dry_run_config -v`
Expected: PASS.
### Task 3: Add Mock LLM Mode
**Files:**
- Modify: `script/ai_daily_blog_pipeline.py`
- Create: `tests/test_mock_llm.py`
**Step 1: Write the failing test**
Test that `llm_call(prompt, {"AI_DAILY_LLM_MODE": "mock"})` returns valid JSON for:
- semantic dedup prompts
- summary rewrite prompts
- classify prompts
Also test that guide generation can get a non-empty mock response.
**Step 2: Run test to verify it fails**
Run: `python -m unittest tests.test_mock_llm -v`
Expected: FAIL because mock mode does not exist.
**Step 3: Implement minimal code**
Add `AI_DAILY_LLM_MODE=mock` support in `llm_call()`.
**Step 4: Run test to verify it passes**
Run: `python -m unittest tests.test_mock_llm -v`
Expected: PASS.
### Task 4: Add Markdown Smoke Test
**Files:**
- Create: `tests/test_markdown_rendering.py`
- Modify: `script/ai_daily_blog_pipeline.py` only if necessary.
**Step 1: Write the failing or characterization test**
Test that `blog_markdown()` renders:
- `## 导览`
- at least one section
- source links
- no `> >`
- no `[N]`
**Step 2: Run test**
Run: `python -m unittest tests.test_markdown_rendering -v`
Expected: If it already passes, keep it as characterization coverage. If it fails because of `> >`, implement a focused fix.
**Step 3: Implement minimal fix if needed**
Strip leading `>` from guide text before adding blockquote syntax.
**Step 4: Run test to verify it passes**
Run: `python -m unittest tests.test_markdown_rendering -v`
Expected: PASS.
### Task 5: Run Full Verification
**Files:**
- No new files.
**Step 1: Run unit tests**
Run: `python -m unittest discover -s tests -v`
Expected: PASS.
**Step 2: Run compile check**
Run: `python -m py_compile script/ai_daily_blog_pipeline.py`
Expected: exit code 0.
**Step 3: Check git status**
Run: `git status --short`
Expected: only intended files are modified or added.

File diff suppressed because it is too large Load Diff

View File

@@ -1,198 +0,0 @@
## 导览
> > 微软与OpenAI正式分家、Anthropic提交招股书、DeepSeek计划融500亿——AI行业正在从“联盟军”转向“诸侯争霸”。
## 模型发布/更新
**1. Grok Imagine 1.5 预览版发布**
> Grok Imagine 1.5 预览版即日起在 API 中上线SpaceXAI 持续发力。[X@cb_doge ↗](https://x.com/cb_doge/status/2062242490745594085)
**2. MiniMax M3 1M token 解码加速 15.6 倍**
> MiniMax M3 在 1M token 下解码加速 15.6 倍FireworksAI_HQ 提供推理支持。[X@MiniMax_AI ↗](https://x.com/MiniMax_AI/status/2062316914618388758)
**3. Miso One 开源语音模型8B 参数、110ms 延迟、一次语音克隆**
> Miso One 发布 8B 参数开源语音模型,支持一次语音克隆(短样本),推理延迟 110ms权重已开源可自托管API 即将推出,演示已上线。[X@kimmonismus ↗](https://x.com/kimmonismus/status/2062210845308780639)
**4. Ideogram v4.0 发布2K 分辨率和 JSON 提示支持**
> Ideogram v4.0 发布,原生 2K 分辨率,文字渲染出色,支持 JSON 提示词,可在 Krea 中体验。[X@krea_ai ↗](https://x.com/krea_ai/status/2062227837130887567)
## 产品与工具
**5. Meta 面向 WhatsApp Business 的 AI 智能体现已全球上线**
> Meta 为 WhatsApp Business 推出的 AI 智能体面向全球商家开放,按模型 token 使用量收费。[TechCrunch ↗](https://techcrunch.com/2026/06/03/metas-ai-agent-for-whatsapp-business-is-now-available-globally)
**6. NousResearch 发布 Hermes Agent 桌面应用公测版**
> NousResearch 推出 Hermes Agent 桌面应用公测版。[X@SiliconFlowAI ↗](https://x.com/SiliconFlowAI/status/2062042813852995899)
**7. xAI Grok 语音模型上线 Vapi 平台**
> xAI 的 Grok STT 和 TTS 语音模型登陆企业语音 AI 平台 Vapi可用于构建自定义语音智能体。[X@xai ↗](https://x.com/xai/status/2062209374039499178)
**8. Grok 模型登陆 Cloudflare AI Gateway**
> Grok 模型现已可在 Cloudflare AI Gateway 上试用。[X@xai ↗](https://x.com/xai/status/2062294202625696081)
**9. OpenShell v0.0.55 发布:新增 Vertex AI 推理支持**
> OpenShell v0.0.55 发布,新增 Google Vertex AI 推理支持改进策略可见性、Podman 检测和 GPU 沙箱行为。[X@NVIDIAAI ↗](https://x.com/NVIDIAAI/status/2062210034109677665)
**10. Replit 上线 SEO Agent 助应用被发现**
> Replit 推出 SEO Agent扫描应用并提供修复建议帮助应用在网页和 AI 搜索中被发现。[X@Replit ↗](https://x.com/Replit/status/2062211976995188871)
**11. OpenClaw 2026.6.1 发布:新增 Windows 节点与技能工坊**
> OpenClaw 2026.6.1 发布,新增原生 Windows 节点主机、技能工坊和工作板编排,支持 MiniMax M3。[X@openclaw ↗](https://x.com/openclaw/status/2062288421406785710)
**12. Reachy Mini 添加 MCP 工具**
> Reachy Mini 推出公开 MCP canary Space支持远程工具调用。[Hugging FaceBlog ↗](https://huggingface.co/blog/adding-mcp-tools-to-reachy-mini)
**13. 刚刚Meta Skill 来了**
> GitHub 热门仓库 OpenSquilla 发布,代表 Meta Skill 新动向。[量子位 ↗](https://www.qbitai.com/2026/06/428335.html)
## 开发与工程
**14. Qwen Cloud 全球 AI 黑客马拉松启动**
> 首届 Qwen Cloud 全球 AI 黑客马拉松启动5 大赛道,总奖金超 7 万美元(赛道冠军 1 万美元Devpost 报名。[X@alibaba_cloud ↗](https://x.com/alibaba_cloud/status/2062113338994172169)
**15. 洪水韧性新篇章Google 开源水文建模框架**
> Google Research 开源基于 PyTorch 的水文建模框架,采用 Flood Hub 相同架构,允许各国气象部门在本地训练 AI 洪水预报模型。[Google ResearchBlog ↗](https://research.google/blog/the-next-chapter-in-flood-resilience-open-sourcing-googles-hydrology-framework)
**16. 文章:导致 Spark 在 Kubernetes 上 OOM 失败的两个错误配置**
> 迁移 Spark 到 AKS 后,两个配置交互导致 OOMspark.kubernetes.local.dirs.tmpfs 使 shuffle spill 改用 RAM 而非磁盘。[InfoQ AI ↗](https://www.infoq.com/articles/spark-oom-kubernetes-misconfigurations/?utm_campaign=infoq_content&utm_source=infoq&utm_medium=feed&utm_term=AI%2C+ML+%26+Data+Engineering)
## 行业与公司
**17. 微软与 OpenAI 分道扬镳——如今双方准备正面交锋**
> 微软与 OpenAI 合作关系破裂,进入直接竞争。微软 AI 主管 Mustafa Suleyman 称微软需独立证明能力。[The Verge ↗](https://www.theverge.com/ai-artificial-intelligence/942242/microsoft-build-ai-agents-openai-competition)
**18. 欧盟公布全面技术主权计划,推动芯片与 AI 自主发展**
> 欧盟推出技术主权计划扩大本土半导体、AI 和云计算供应链,减少对美亚依赖。[Bloomberg ↗](https://www.bloomberg.com/news/articles/2026-06-03/europe-unveils-sweeping-tech-sovereignty-plan-to-boost-chips-ai)
**19. Sensor TowerOpenAI 旗下 ChatGPT 月活已破 10 亿,史上最快**
> Sensor Tower 估计 ChatGPT 月活于 2025 年 5 月突破 10 亿增速史上最快Claude 月活 5600 万,同比增 640%。[IT之家 ↗](https://www.ithome.com/0/959/083.htm)
**20. 消息称 DeepSeek 首轮融资拟筹集 500 亿元,腾讯、宁德时代等参投**
> DeepSeek 首轮拟融资 500 亿元,投后估值 3500-4000 亿元。创始人梁文峰出资 200 亿,腾讯拟投 100 亿,宁德时代 50 亿。[IT之家 ↗](https://www.ithome.com/0/959/249.htm)
**21. Suno 完成 4 亿美元 D 轮融资**
> Suno 完成 4 亿美元 D 轮融资,估值 54 亿美元,致力于让更多人体验音乐制作。[X@suno ↗](https://x.com/suno/status/2062183524887675243)
**22. 宏利香港与阿里云达成 AI 战略合作**
> 宏利香港与阿里云建立战略合作,共建负责任 AI 创新框架,加速 AI 部署。[X@alibaba_cloud ↗](https://x.com/alibaba_cloud/status/2062006591377829922)
**23. 优步每月 1,500 美元的 AI 使用上限为 AI 工具定价提供参考**
> 优步将 AI 工具月使用上限设为 1500 美元,为行业 AI 定价提供参考信号。[Simon Willison ↗](https://simonwillison.net/2026/Jun/3/uber-caps-usage)
**24. 世界模型榜首易主!跨维智能登顶 WorldArena**
> 跨维智能在 WorldArena 上登顶,成为世界模型新榜首。[量子位 ↗](https://www.qbitai.com/2026/06/428435.html)
**25. 刚刚Anthropic 提交了招股书!**
> Anthropic 已提交招股书,预计最快 Q4 上市。[量子位 ↗](https://www.qbitai.com/2026/06/428407.html)
## 论文与研究
**26. 斯坦福大学法学院研究:人工智能的表现优于法学教授**
> 斯坦福大学法学院研究显示AI 表现优于法学教授,该结果在 Hacker News 获 104 个 Points。[law.stanford.edu ↗](https://law.stanford.edu/press/ai-outperforms-law-professors-in-stanford-law-study)
**27. NVIDIA Research 在 CVPR 2026 发表三篇论文:规模化训练实现抓取、自动驾驶与智能体泛化**
> NVIDIA Research 在 CVPR 2026 发表三篇论文:零样本抓取模型 GraspGen-X、自动驾驶 LCDrive、具身智能体 NitroGen均基于大规模训练。[blogs.nvidia.comBlog ↗](https://blogs.nvidia.com/blog/cvpr-research-grasping-driving-agent-training)
**28. Anthropic 分析 832 个 AI 恶意账户:中高风险攻击者半年从 33% 跃至 56%**
> Anthropic 分析 832 个被封恶意账户67.3% 使用 AI 编写恶意软件,中高风险占比半年内从 33% 升至 56%,传统威胁评估失效。[Anthropic ↗](https://www.anthropic.com/news/AI-enabled-cyber-threats-mitre-attack)
**29. 微软研究:装瓶厂 AI 从聊天到决策**
> 微软在中西部装瓶厂试点三个月显示AI 超越聊天进入决策领域,需应对真实风险和可靠性要求。[X@MSFTResearch ↗](https://x.com/MSFTResearch/status/2062204914223169635)
**30. 世界模型的功能分类**
> World Labs 与李飞飞发文梳理“世界模型”概念,基于 POMDP 框架分类,指出当前所谓世界模型本质是同一循环的不同投影(如渲染器)。[X@drfeifei ↗](https://x.com/drfeifei/status/2062247238143996275)
**31. 从看懂世界到做对动作,卧安机器人 OneModel 1.7 用一条「隐式通路」打通了具身智能的关键断层**
> 卧安机器人 OneModel 1.7 通过隐式通路在潜在空间完成信息传导,打通具身智能关键断层。[量子位 ↗](https://www.qbitai.com/2026/06/428703.html)
## 人物与花絮
**32. 黄仁勋与纳德拉共议智能体 AI 时代**
> 黄仁勋与纳德拉在台北 MSBuild 同台,展示 NVIDIA 与微软从 Windows 到 AI 工厂的协作。[X@nvidia ↗](https://x.com/nvidia/status/2062228974273716457)
**33. Satya Nadella 谈微软 Build 大会主旨演讲**
> Satya Nadella 在 Microsoft Build 主旨演讲,强调共同构建前沿智能生态系统。[X@satyanadella ↗](https://x.com/satyanadella/status/2062022060176801826)
**34. Karpathy 的 llm-wiki 项目获超五千星**
> @karpathy 的 llm-wiki 项目几周内获 5000+ 星,理念是让 LLM 构建并维护可持续进化的维基知识库。[X@SiliconFlowAI ↗](https://x.com/SiliconFlowAI/status/2062054848762450324)
## 观点与教程
**35. 智能体工程实战窍门全录**
> @mvanhorn 分享智能体工程方法论:人主导方向、智能体执行,核心为 plan.md 约束行为,总结 22 条实战技巧及完整工具栈。[X@shao__meng ↗](https://x.com/shao__meng/status/2061974983094755575)
**36. Anthropic 用 Claude 赋能自助数据分析**
> Anthropic 用 Claude 自动化 95% 业务分析查询,准确率约 95%,通过智能体分析栈解决概念-实体歧义等三大错误来源。[ClaudeBlog ↗](https://claude.com/blog/how-anthropic-enables-self-service-data-analytics-with-claude)
**37. 超越聊天机器人的直接偏好优化**
> Dharma-AI 在 Hugging Face 博客发文探讨直接偏好优化DPO在聊天机器人之外的广泛应用。[Hugging FaceBlog ↗](https://huggingface.co/blog/Dharma-AI/direct-preference-optimization-beyond-chatbots)
**38. 演讲:选择你的 AI 副驾驶:最大化开发效率**
> Sepehr Khosravi 探讨开发效率工具演变,评估 Cursor 和 Claude Code 等优势,为高级工程师提供可行技巧。[InfoQ AI ↗](https://www.infoq.com/presentations/choosing-ai-copilot/?utm_campaign=infoq_content&utm_source=infoq&utm_medium=feed&utm_term=AI%2C+ML+%26+Data+Engineering)
## 总结
**强信号**
- **微软与OpenAI分道扬镳双方开始正面竞争**
合作终结后微软AI主管Mustafa Suleyman称公司必须独立证明能力这意味着微软将不再依赖OpenAI的模型而是全力押注自研OpenAI也失去最大云盟友。
- **Anthropic提交招股书预计最快Q4上市**
这标志着安全派AI公司正式进入资本市场与OpenAI争夺投资者注意Claude的月活同比增长640%也为其估值提供了底气。
- **ChatGPT月活突破10亿成为史上增长最快的应用**
Sensor Tower数据显示ChatGPT在2025年5月达到这一里程碑Claude月活5600万两家头部消费级AI应用的用户粘性正在拉开差距。
**中信号**
- **Miso One发布8B开源语音模型支持一次语音克隆且延迟仅110ms**
权重已开放、可自托管意味着实时语音克隆的门槛从专有API降到了个人部署可能加速语音交互在开发者中的普及。
- **欧盟公布全面技术主权计划推动芯片与AI自主发展**
计划扩大本土半导体、AI和云计算供应链目标减少对美亚依赖——这将对全球AI公司的合规、市场准入和数据主权产生实质影响。
**待验证**
- **DeepSeek首轮融资拟筹500亿元腾讯、宁德时代参投**
投后估值高达3500-4000亿元但融资消息来源为IT之家未见官方确认。如此大体量的AI融资在国内市场是否顺利落地存在不确定性。
- **跨维智能登顶WorldArena世界模型榜首**
WorldArena的评测权威性尚未被广泛验证且“世界模型”概念本身缺乏统一标准需要看后续是否有独立第三方复现其能力。

View File

@@ -1,35 +0,0 @@
{
"date": "2026-06-04",
"slug": "ai-2026-06-04",
"blog_url": "https://blog.ephron.ren/posts/ai-2026-06-04",
"public_ok": true,
"errors": [
"橘鸦AI早报(重试): TimeoutError"
],
"aihot_sections": [
"模型发布/更新",
"产品发布/更新",
"行业动态",
"论文研究",
"技巧与观点"
],
"raw_item_count": 39,
"stage0_count": 39,
"final_item_count": 38,
"has_juya": false,
"source_counts": {
"AI HOT": 32,
"InfoQ AI": 2,
"MIT科技评论AI": 0,
"量子位": 5,
"橘鸦AI早报": 0
},
"featured_titles": [
"Grok Imagine 1.5 预览版发布",
"MiniMax M3 1M token 解码加速 15.6 倍",
"Miso One 开源语音模型8B 参数、110ms 延迟、一次语音克隆",
"Ideogram v4.0 发布2K 分辨率和 JSON 提示支持",
"Meta 面向 WhatsApp Business 的 AI 智能体现已全球上线",
"NousResearch 发布 Hermes Agent 桌面应用公测版"
]
}

1
skill/scripts/.gitkeep Normal file
View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env python3
from ai_daily_report.cli import main
# Script entry point: run the packaged CLI and exit with the status it returns.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)

1
tests/fixtures/.gitkeep vendored Normal file
View File

@@ -0,0 +1 @@

47
tests/test_cli.py Normal file
View File

@@ -0,0 +1,47 @@
import unittest
from pathlib import Path
from tempfile import TemporaryDirectory
from ai_daily_report.cli import build_parser, main
class CliTests(unittest.TestCase):
    """Exercise the argument parser and the ``main`` entry point of the CLI."""

    def test_run_command_parses_date_and_mode(self):
        """Each option passed to ``run`` lands on the matching namespace attribute."""
        argv = [
            "run",
            "--date", "2026-06-04",
            "--mode", "dry-run",
            "--source-mode", "live",
            "--llm-mode", "live",
            "--sources-path", "config/sources.json",
        ]
        parsed = build_parser().parse_args(argv)

        expected_attrs = {
            "command": "run",
            "date": "2026-06-04",
            "mode": "dry-run",
            "source_mode": "live",
            "llm_mode": "live",
            "sources_path": "config/sources.json",
        }
        for attr_name, expected_value in expected_attrs.items():
            self.assertEqual(getattr(parsed, attr_name), expected_value)

    def test_main_returns_zero_for_parseable_command(self):
        """``main`` returns 0 for a well-formed dry-run invocation."""
        # NOTE(review): no --source-mode / --llm-mode / --out-dir is given, so this
        # relies on the CLI defaults -- confirm they do not trigger live calls.
        status = main(["run", "--date", "2026-06-04", "--mode", "dry-run"])
        self.assertEqual(status, 0)

    def test_main_mock_run_writes_outputs(self):
        """A fully mocked dry run writes ``blog_markdown.md`` under ``<out-dir>/<date>``."""
        with TemporaryDirectory() as temp_dir:
            argv = [
                "run",
                "--date", "2026-06-04",
                "--mode", "dry-run",
                "--source-mode", "mock",
                "--llm-mode", "mock",
                "--out-dir", temp_dir,
            ]
            exit_code = main(argv)

            self.assertEqual(exit_code, 0)
            # Checked inside the ``with`` block: the directory is removed on exit.
            markdown_path = Path(temp_dir) / "2026-06-04" / "blog_markdown.md"
            self.assertTrue(markdown_path.exists())


if __name__ == "__main__":
    unittest.main()

47
tests/test_clients.py Normal file
View File

@@ -0,0 +1,47 @@
import json
import unittest
from unittest.mock import patch
from ai_daily_report.clients import BlogApiClient, OpenAICompatibleClient, fetch_text
class FakeResponse:
    """Minimal stand-in for the object returned by ``urllib.request.urlopen``.

    It can be used as a context manager, exposes a ``status`` of 200 and
    returns the payload it was constructed with from ``read()``.
    """

    status = 200

    def __init__(self, body):
        # Payload handed back unchanged by read().
        self.body = body

    def read(self):
        """Return the stored payload."""
        return self.body

    def __enter__(self):
        """Enter the ``with`` block, yielding this response itself."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Leave the ``with`` block; returning False lets exceptions propagate."""
        return False
class ClientTests(unittest.TestCase):
    """Check the HTTP helpers with ``urllib.request.urlopen`` patched out."""

    def test_fetch_text_decodes_response(self):
        """``fetch_text`` turns the UTF-8 response bytes into ``str``."""
        canned = FakeResponse("ok".encode("utf-8"))
        with patch("urllib.request.urlopen", return_value=canned):
            text = fetch_text("https://example.com", 1)
            self.assertEqual(text, "ok")

    def test_openai_compatible_client_returns_message_content(self):
        """``chat`` returns the first choice's message content."""
        payload = {"choices": [{"message": {"content": "hello"}}]}
        canned = FakeResponse(json.dumps(payload).encode("utf-8"))
        with patch("urllib.request.urlopen", return_value=canned):
            llm = OpenAICompatibleClient(
                api_key="key",
                base_url="https://llm.example/v1",
                model="model",
            )
            self.assertEqual(llm.chat("prompt"), "hello")

    def test_blog_api_client_create_and_publish(self):
        """Create then publish a post, consuming one canned response per request."""
        create_reply = FakeResponse(json.dumps({"slug": "ai-2026-06-04"}).encode("utf-8"))
        publish_reply = FakeResponse(json.dumps({"ok": True}).encode("utf-8"))
        # side_effect hands out the replies in call order.
        with patch("urllib.request.urlopen", side_effect=[create_reply, publish_reply]):
            blog = BlogApiClient(base_url="https://blog.example", token="token")
            created = blog.create_post({"title": "t"})
            self.assertEqual(created["slug"], "ai-2026-06-04")
            blog.publish_post("ai-2026-06-04")


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,27 @@
import unittest
from pathlib import Path
from ai_daily_report.config import load_source_configs
from ai_daily_report.sources.registry import get_source_fetcher
ROOT = Path(__file__).resolve().parents[1]
class ConfigLoadingTests(unittest.TestCase):
    """Validate ``config/sources.json`` against the loader and the fetcher registry."""

    def test_load_source_configs_from_json(self):
        """At least five sources are loaded and the first entry is AI HOT."""
        sources_path = ROOT / "config" / "sources.json"
        loaded = load_source_configs(sources_path)

        self.assertGreaterEqual(len(loaded), 5)
        first = loaded[0]
        self.assertEqual(first.name, "AI HOT")
        self.assertEqual(first.type, "aihot")

    def test_all_configured_source_types_are_registered(self):
        """The registry returns a callable for every configured source type."""
        sources_path = ROOT / "config" / "sources.json"
        for source in load_source_configs(sources_path):
            fetcher = get_source_fetcher(source.type)
            self.assertTrue(callable(fetcher))


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,33 @@
import importlib.util
import unittest
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py"
def load_pipeline_module():
    """Import the legacy pipeline script from ``SCRIPT`` and return the module.

    The module is built from a file-location spec and executed on every call;
    it is not registered in ``sys.modules`` by this function.
    """
    script_spec = importlib.util.spec_from_file_location("ai_daily_blog_pipeline", SCRIPT)
    pipeline = importlib.util.module_from_spec(script_spec)
    script_spec.loader.exec_module(pipeline)
    return pipeline
class DryRunConfigTests(unittest.TestCase):
    """Check how the legacy script's dry-run flag affects the blog token requirement."""

    def test_dry_run_does_not_require_blog_token(self):
        """With ``AI_DAILY_DRY_RUN=1`` the run is a dry run and needs no token."""
        pipeline = load_pipeline_module()
        dry_env = {"AI_DAILY_DRY_RUN": "1"}

        self.assertTrue(pipeline.is_dry_run(dry_env))
        self.assertFalse(pipeline.requires_blog_token(dry_env))

    def test_publish_mode_requires_blog_token(self):
        """With an empty environment the run is not a dry run and needs a token."""
        pipeline = load_pipeline_module()
        empty_env = {}

        self.assertFalse(pipeline.is_dry_run(empty_env))
        self.assertTrue(pipeline.requires_blog_token(empty_env))


if __name__ == "__main__":
    unittest.main()

87
tests/test_env_config.py Normal file
View File

@@ -0,0 +1,87 @@
import unittest
from pathlib import Path
from tempfile import TemporaryDirectory
from ai_daily_report.env import resolve_blog_token, resolve_llm_config
class EnvConfigTests(unittest.TestCase):
    """Cover LLM configuration and blog token resolution from environment mappings."""

    # Hermes ``config.yaml`` fixture shared by the Hermes-based tests below.
    # NOTE(review): the keys after ``model:`` carry no nesting indentation in this
    # fixture -- confirm this is the layout the Hermes config reader expects.
    HERMES_CONFIG_TEXT = """
model:
provider: sub2api
default: findmini/gpt-5.5
base_url: http://sub2api.example/v1
""".strip()

    def test_resolve_llm_config_prefers_generic_values(self):
        """Generic ``LLM_*`` variables win over the ``SUB2API_*`` ones."""
        env = {
            "LLM_API_KEY": "generic-key",
            "LLM_BASE_URL": "https://generic.example/v1",
            "LLM_MODEL": "generic-model",
            "SUB2API_API_KEY": "sub-key",
            "SUB2API_BASE_URL": "https://sub.example/v1",
            "SUB2API_MODEL": "sub-model",
        }
        expected = {
            "api_key": "generic-key",
            "base_url": "https://generic.example/v1",
            "model": "generic-model",
        }
        self.assertEqual(resolve_llm_config(env), expected)

    def test_resolve_llm_config_reports_missing_fields(self):
        """A partial configuration raises ``ValueError`` naming the missing fields."""
        self.assertRaisesRegex(
            ValueError,
            "missing_llm_config: LLM_BASE_URL,LLM_MODEL",
            resolve_llm_config,
            {"LLM_API_KEY": "key"},
        )

    def test_resolve_llm_config_follows_hermes_provider_config(self):
        """With an empty env, values come from the Hermes ``config.yaml`` and ``.env``."""
        with TemporaryDirectory() as temp_dir:
            hermes_dir = Path(temp_dir)
            (hermes_dir / "config.yaml").write_text(self.HERMES_CONFIG_TEXT, encoding="utf-8")
            (hermes_dir / ".env").write_text("SUB2API_API_KEY=hermes-key\n", encoding="utf-8")

            resolved = resolve_llm_config({}, hermes_dir=hermes_dir)

        expected = {
            "api_key": "hermes-key",
            "base_url": "http://sub2api.example/v1",
            "model": "findmini/gpt-5.5",
        }
        self.assertEqual(resolved, expected)

    def test_resolve_llm_config_uses_hermes_auth_json_env_source(self):
        """An ``env:`` credential source in ``auth.json`` is resolved from the env mapping."""
        with TemporaryDirectory() as temp_dir:
            hermes_dir = Path(temp_dir)
            (hermes_dir / "config.yaml").write_text(self.HERMES_CONFIG_TEXT, encoding="utf-8")
            (hermes_dir / "auth.json").write_text(
                '{"credential_pool": {"sub2api": [{"source": "env:SUB2API_API_KEY"}]}}',
                encoding="utf-8",
            )

            resolved = resolve_llm_config({"SUB2API_API_KEY": "auth-env-key"}, hermes_dir=hermes_dir)

        self.assertEqual(resolved["api_key"], "auth-env-key")
        self.assertEqual(resolved["base_url"], "http://sub2api.example/v1")
        self.assertEqual(resolved["model"], "findmini/gpt-5.5")

    def test_resolve_blog_token_uses_supported_names(self):
        """``EPHRON_SERVICE_TOKEN`` is accepted as a blog token variable."""
        token = resolve_blog_token({"EPHRON_SERVICE_TOKEN": "token"})
        self.assertEqual(token, "token")


if __name__ == "__main__":
    unittest.main()

39
tests/test_env_loading.py Normal file
View File

@@ -0,0 +1,39 @@
import importlib.util
import os
import unittest
from pathlib import Path
from unittest.mock import patch
ROOT = Path(__file__).resolve().parents[1]
SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py"
def load_pipeline_module():
    """Execute the legacy script at ``SCRIPT`` and return it as a module object.

    Every call builds and executes a new module from the file, so tests can
    patch attributes on the returned object without touching a shared import.
    """
    spec = importlib.util.spec_from_file_location("ai_daily_blog_pipeline", SCRIPT)
    loaded = importlib.util.module_from_spec(spec)
    loader = spec.loader
    loader.exec_module(loaded)
    return loaded
class EnvLoadingTests(unittest.TestCase):
    """Check the precedence rules of the legacy script's ``load_env``."""

    def test_project_env_is_loaded_and_process_env_wins(self):
        """Project ``.env`` values are loaded, and process variables override them.

        The env file defines both ``LLM_MODEL`` and ``LLM_BASE_URL``; the process
        environment defines only ``LLM_MODEL``. The result must take the base
        URL from the file and the model from the process environment.
        """
        # Local import: this module's top-level imports do not include tempfile.
        from tempfile import TemporaryDirectory

        module = load_pipeline_module()
        env_text = "LLM_MODEL=file-model\nLLM_BASE_URL=https://file.example/v1\n"

        # Use a throwaway directory so the test never writes into the repository
        # and needs no manual cleanup.
        with TemporaryDirectory() as temp_dir:
            temp_root = Path(temp_dir)
            env_path = temp_root / ".env.test"
            env_path.write_text(env_text, encoding="utf-8")

            # Point the home directory at a path that does not exist.
            with patch.object(module.Path, "home", return_value=temp_root / "missing-home"):
                with patch.dict(os.environ, {"LLM_MODEL": "process-model"}, clear=False):
                    # An ambient LLM_BASE_URL would override the file value and
                    # break the assertion below; patch.dict restores it on exit.
                    os.environ.pop("LLM_BASE_URL", None)
                    with patch.object(module, "PROJECT_ENV_PATH", env_path):
                        env = module.load_env()

        self.assertEqual(env["LLM_BASE_URL"], "https://file.example/v1")
        self.assertEqual(env["LLM_MODEL"], "process-model")


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,57 @@
import importlib.util
import unittest
from pathlib import Path
from unittest.mock import patch
ROOT = Path(__file__).resolve().parents[1]
SCRIPT = ROOT / "script" / "ai_daily_blog_pipeline.py"
def load_pipeline_module():
    """Return the legacy pipeline script, loaded from ``SCRIPT``, as a module.

    The script is executed from its file path on each call rather than
    imported by package name.
    """
    module_name = "ai_daily_blog_pipeline"
    file_spec = importlib.util.spec_from_file_location(module_name, SCRIPT)
    legacy_module = importlib.util.module_from_spec(file_spec)
    file_spec.loader.exec_module(legacy_module)
    return legacy_module
class LegacyScriptDelegationTests(unittest.TestCase):
    """Verify that the legacy script's ``main`` hands off to ``run_daily_report``."""

    def _run_main_with_env(self, env):
        """Run the legacy ``main`` with *env* as its loaded environment.

        ``load_env`` and ``ai_daily_report.runner.run_daily_report`` are patched;
        the keyword arguments of each runner call are recorded and returned.
        """
        module = load_pipeline_module()
        recorded_calls = []

        def fake_run_daily_report(**kwargs):
            recorded_calls.append(kwargs)
            return {"reports": {"stage8": {"status": "ok"}}}

        with patch.object(module, "load_env", return_value=env), patch(
            "ai_daily_report.runner.run_daily_report",
            side_effect=fake_run_daily_report,
        ):
            module.main()
        return recorded_calls

    def test_main_delegates_to_new_pipeline_by_default(self):
        """A dry-run env yields one runner call in dry-run mode with live source and LLM."""
        calls = self._run_main_with_env({"AI_DAILY_DRY_RUN": "1"})

        self.assertEqual(len(calls), 1)
        runner_kwargs = calls[0]
        self.assertEqual(runner_kwargs["mode"], "dry-run")
        self.assertEqual(runner_kwargs["source_mode"], "live")
        self.assertEqual(runner_kwargs["llm_mode"], "live")

    def test_main_allows_mock_modes_for_local_test(self):
        """The source and LLM mode variables are forwarded to the runner as ``mock``."""
        calls = self._run_main_with_env(
            {
                "AI_DAILY_DRY_RUN": "1",
                "AI_DAILY_SOURCE_MODE": "mock",
                "AI_DAILY_LLM_MODE": "mock",
            }
        )

        runner_kwargs = calls[0]
        self.assertEqual(runner_kwargs["source_mode"], "mock")
        self.assertEqual(runner_kwargs["llm_mode"], "mock")


if __name__ == "__main__":
    unittest.main()

17
tests/test_llm_utils.py Normal file
View File

@@ -0,0 +1,17 @@
import unittest
from ai_daily_report.llm import parse_json_object
class LlmUtilsTests(unittest.TestCase):
    """Check ``parse_json_object`` on fenced JSON and on non-JSON text."""

    def test_parse_json_object_strips_markdown_fence(self):
        """A JSON object wrapped in a Markdown code fence is parsed to a dict."""
        fenced_reply = '```json\n{"ok": true}\n```'
        parsed = parse_json_object(fenced_reply)
        self.assertEqual(parsed, {"ok": True})

    def test_parse_json_object_raises_without_json(self):
        """Text without a JSON object raises ``ValueError``."""
        self.assertRaises(ValueError, parse_json_object, "not json")


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,39 @@
import unittest
from ai_daily_report.assemble import assemble_markdown
from ai_daily_report.models import NewsItem
class MarkdownRenderingTests(unittest.TestCase):
    """Smoke-test ``assemble_markdown`` output for one item plus a noisy guide theme."""

    def test_blog_markdown_strips_double_blockquote_and_reference_markers(self):
        """The guide, section and source link render; known noise is removed.

        The theme carries a leading ``>``, a ``主线判断`` prefix and a ``[1]``
        marker, none of which may survive into the rendered Markdown.
        """
        news_item = NewsItem(
            id="a",
            source_group="AI HOT",
            source_label="OpenAIBlog",
            source_role="primary",
            source_priority=10,
            title_raw="测试模型发布",
            title_norm="测试模型发布",
            summary_raw="测试摘要",
            title="测试模型发布",
            summary="测试摘要",
            url="https://openai.com/blog/test",
            canonical_url="https://openai.com/blog/test",
            section="模型与能力",
        )
        noisy_guide = {"theme": "> 主线判断:测试主线[1]", "threads": []}

        rendered, _report = assemble_markdown([news_item], noisy_guide)

        required_fragments = (
            "## 导览",
            "## 模型与能力",
            "[OpenAIBlog ↗](https://openai.com/blog/test)",
        )
        for fragment in required_fragments:
            self.assertIn(fragment, rendered)

        forbidden_fragments = ("> >", "[1]", "主线判断")
        for fragment in forbidden_fragments:
            self.assertNotIn(fragment, rendered)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,33 @@
import unittest
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
class ProjectStructureTests(unittest.TestCase):
    """Guards the on-disk layout expected under ``ROOT``."""

    def test_pipeline_plan_structure_exists(self):
        """Every listed relative path must exist below ``ROOT``."""
        required = (
            "ai_daily_report/sources/__init__.py",
            "ai_daily_report/sources/aihot.py",
            "ai_daily_report/sources/rss.py",
            "ai_daily_report/sources/juya.py",
            "ai_daily_report/sources/registry.py",
            "ai_daily_report/llm.py",
            "ai_daily_report/validate.py",
            "ai_daily_report/publish.py",
            "ai_daily_report/cli.py",
            "config/sources.json",
            "config/pipeline.json",
            "tests/fixtures/.gitkeep",
            "skill/scripts/.gitkeep",
            "skill/scripts/run_daily_report.py",
        )
        absent = []
        for relative in required:
            if not (ROOT / relative).exists():
                absent.append(relative)
        # Comparing against [] makes the failure message list what is missing.
        self.assertEqual(absent, [])
if __name__ == "__main__":
unittest.main()

132
tests/test_runner.py Normal file
View File

@@ -0,0 +1,132 @@
import unittest
import json
from pathlib import Path
from tempfile import TemporaryDirectory
from ai_daily_report.runner import run_daily_report
class RunnerTests(unittest.TestCase):
    """Dry-run checks of ``run_daily_report`` with mock and live modes."""

    def test_run_daily_report_mock_mode_writes_markdown_and_reports(self):
        """Fully mocked run writes the markdown and the run report into ``run_dir``."""
        with TemporaryDirectory() as workspace:
            outcome = run_daily_report(
                run_date="2026-06-04",
                mode="dry-run",
                source_mode="mock",
                llm_mode="mock",
                out_dir=Path(workspace),
                base_url="https://blog.example",
            )
            run_dir = Path(outcome["run_dir"])
            for artifact in ("blog_markdown.md", "run_report.json"):
                self.assertTrue((run_dir / artifact).exists())
            self.assertEqual(outcome["reports"]["stage8"]["status"], "ok")

    def test_run_daily_report_live_sources_can_use_config_and_fetch_text(self):
        """Live sources are read from ``sources_path`` and fetched via ``fetch_text``."""
        feed_xml = """<?xml version="1.0"?><rss><channel><item><title>GPT-5 API 发布</title><link>https://example.com/gpt5</link><description>OpenAI 发布 GPT-5 API。</description></item></channel></rss>"""
        source_entry = {
            "name": "InfoQ AI",
            "type": "rss",
            "url": "https://feed.example/rss",
            "role": "supplement",
            "priority": 40,
            "enabled": True,
        }

        def fetch_text(url, timeout):
            # Same canned feed for every URL; no network access.
            return feed_xml

        with TemporaryDirectory() as workspace:
            out_dir = Path(workspace) / "out"
            source_config = Path(workspace) / "sources.json"
            source_config.write_text(json.dumps([source_entry]), encoding="utf-8")

            outcome = run_daily_report(
                run_date="2026-06-04",
                mode="dry-run",
                source_mode="live",
                llm_mode="mock",
                out_dir=out_dir,
                base_url="https://blog.example",
                sources_path=source_config,
                fetch_text=fetch_text,
            )

            self.assertEqual(outcome["reports"]["stage0"]["raw_item_count"], 1)
            self.assertTrue((out_dir / "2026-06-04" / "blog_markdown.md").exists())

    def test_run_daily_report_live_llm_uses_env_config_in_dry_run(self):
        """Live LLM mode builds its client from the ``LLM_*`` entries of ``env``."""

        class FakeLlmClient:
            """Records prompts and answers by inspecting the prompt text."""

            def __init__(self):
                self.prompts = []

            def chat(self, prompt):
                self.prompts.append(prompt)
                if "duplicate_groups" in prompt:
                    return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})
                if "rewrites" in prompt:
                    payload = json.loads(prompt)
                    # Echo every item back unchanged.
                    rewrites = [
                        {
                            "id": entry["id"],
                            "title": entry["title_raw"],
                            "summary": entry["summary_raw"],
                            "flags": [],
                        }
                        for entry in payload["items"]
                    ]
                    return json.dumps({"rewrites": rewrites})
                # Any other prompt gets a guide that references the first item.
                first_id = json.loads(prompt)["items"][0]["id"]
                thread = {
                    "title": "模型 API 更新",
                    "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。",
                    "item_ids": [first_id],
                    "kind": "thread",
                }
                return json.dumps({"theme": "模型能力继续进入产品入口。", "threads": [thread]})

        fake_client = FakeLlmClient()
        captured_config = {}

        def llm_client_factory(**config):
            captured_config.update(config)
            return fake_client

        llm_env = {
            "LLM_API_KEY": "test-key",
            "LLM_BASE_URL": "https://llm.example/v1",
            "LLM_MODEL": "test-model",
        }
        with TemporaryDirectory() as workspace:
            outcome = run_daily_report(
                run_date="2026-06-04",
                mode="dry-run",
                source_mode="mock",
                llm_mode="live",
                out_dir=Path(workspace),
                base_url="https://blog.example",
                env=llm_env,
                llm_client_factory=llm_client_factory,
            )

            self.assertEqual(captured_config["api_key"], "test-key")
            self.assertEqual(captured_config["base_url"], "https://llm.example/v1")
            self.assertEqual(captured_config["model"], "test-model")
            self.assertGreaterEqual(len(fake_client.prompts), 2)
            self.assertEqual(outcome["reports"]["stage8"]["status"], "ok")
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,55 @@
import unittest
from ai_daily_report.models import SourceConfig
from ai_daily_report.sources.juya import parse_juya_rss
from ai_daily_report.sources.labels import source_label_from_url
class SourceLabelTests(unittest.TestCase):
    """Checks how source labels are derived from item URLs."""

    FALLBACK = "橘鸦AI早报"

    def test_source_label_from_x_url_includes_handle(self):
        """An x.com status URL yields a label that includes the account handle."""
        label = source_label_from_url("https://x.com/MiniMax_AI/status/123", fallback=self.FALLBACK)
        self.assertEqual(label, "XMiniMax (@MiniMax_AI)")

    def test_source_label_from_blog_url_marks_blog(self):
        """A ``/blog/`` URL on openai.com is labelled as the OpenAI blog."""
        label = source_label_from_url("https://openai.com/blog/example", fallback=self.FALLBACK)
        self.assertEqual(label, "OpenAIBlog")

    def test_source_label_from_known_non_blog_domains(self):
        """Known non-blog hosts map to their dedicated labels."""
        expectations = (
            ("https://mp.weixin.qq.com/s/example", "微信公众号"),
            ("https://platform.minimaxi.com/docs/token-plan/migration", "MiniMaxDocs"),
        )
        for url, expected in expectations:
            self.assertEqual(source_label_from_url(url, fallback=self.FALLBACK), expected)

    def test_parse_juya_rss_uses_item_url_as_source_label(self):
        """Parsed items take their label from the linked URL, not from the feed name."""
        config = SourceConfig(name="橘鸦AI早报", type="juya_rss", url="https://juya.example/rss")
        feed_xml = """<?xml version="1.0"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<item>
<title>2026-06-04</title>
<content:encoded><![CDATA[
<h2><a href="https://x.com/MiniMax_AI/status/123">MiniMax M3 加速</a> <code>#1</code></h2>
<p>MiniMax M3 加速。</p>
<p><a href="https://x.com/MiniMax_AI/status/123">来源</a></p>
<hr/>
]]></content:encoded>
</item>
</channel>
</rss>"""

        parsed = parse_juya_rss(config, feed_xml, "2026-06-04")
        first_label = parsed[0]["source_label"]

        self.assertEqual(first_label, "XMiniMax (@MiniMax_AI)")
        self.assertNotEqual(first_label, "橘鸦AI早报")
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,49 @@
import unittest
from ai_daily_report.collect import collect_sources
from ai_daily_report.models import SourceConfig
class Stage0CollectTests(unittest.TestCase):
    """Tests for ``collect_sources`` using an injected fetcher."""

    def test_collect_sources_returns_structured_results_for_each_source(self):
        """Each configured source yields one successful result, in config order."""
        configs = [
            SourceConfig(name="Primary", type="fake", role="primary", priority=10),
            SourceConfig(name="Supplement", type="fake", role="supplement", priority=20),
        ]

        def fake_fetcher(config, run_date):
            entry = {"title_raw": f"{config.name} item", "url": f"https://example.com/{config.name}"}
            return [entry]

        outcomes, report = collect_sources(configs, "2026-06-04", fetcher=fake_fetcher)

        source_names = [outcome.source for outcome in outcomes]
        total_items = sum(len(outcome.items) for outcome in outcomes)
        self.assertEqual(source_names, ["Primary", "Supplement"])
        self.assertTrue(all(outcome.ok for outcome in outcomes))
        self.assertEqual(total_items, 2)
        self.assertEqual(report["input_source_count"], 2)
        self.assertEqual(report["ok_source_count"], 2)
        self.assertEqual(report["raw_item_count"], 2)

    def test_collect_sources_records_failed_source_without_blocking_others(self):
        """A source that raises is recorded as failed while the other still succeeds."""
        configs = [
            SourceConfig(name="Broken", type="fake", role="supplement", priority=20),
            SourceConfig(name="Healthy", type="fake", role="supplement", priority=30),
        ]

        def fake_fetcher(config, run_date):
            if config.name != "Broken":
                return [{"title_raw": "healthy item", "url": "https://example.com/healthy"}]
            raise TimeoutError("timed out")

        outcomes, report = collect_sources(configs, "2026-06-04", fetcher=fake_fetcher)
        by_source = {outcome.source: outcome for outcome in outcomes}
        broken = by_source["Broken"]

        self.assertFalse(broken.ok)
        self.assertEqual(broken.status, "timeout")
        self.assertIn("TimeoutError", broken.error)
        self.assertTrue(by_source["Healthy"].ok)
        self.assertEqual(report["failed_source_count"], 1)
        self.assertEqual(report["raw_item_count"], 1)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,32 @@
import unittest
from ai_daily_report.pipeline import run_stage0_to_stage2
class Stage0To2PipelineTests(unittest.TestCase):
    """Tests for ``run_stage0_to_stage2``."""

    def test_run_stage0_to_stage2_returns_deduped_items_and_reports(self):
        """Two sources returning the same URL end up as a single item."""
        configs = [
            {"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10},
            {"name": "RSS", "type": "fake", "role": "supplement", "priority": 50},
        ]

        def fake_fetcher(config, run_date):
            # Identical title and URL for every source; only the summary differs.
            story = {
                "title_raw": "OpenAI 发布 GPT-5",
                "summary_raw": f"{config.name} summary",
                "url": "https://openai.com/blog/gpt-5?utm_source=test",
                "source_label": config.name,
            }
            return [story]

        outcome = run_stage0_to_stage2(configs, "2026-06-04", fetcher=fake_fetcher)
        reports = outcome["reports"]

        self.assertEqual(len(outcome["items"]), 1)
        self.assertEqual(reports["stage0"]["raw_item_count"], 2)
        self.assertEqual(reports["stage1"]["output_count"], 2)
        self.assertEqual(reports["stage2"]["removed_count"], 1)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,66 @@
import json
import unittest
from ai_daily_report.pipeline import run_stage0_to_stage4
class Stage0To4PipelineTests(unittest.TestCase):
    """Tests for ``run_stage0_to_stage4``."""

    def test_run_stage0_to_stage4_semantic_dedupes_and_rewrites(self):
        """Both items survive and receive the rewritten title from the fake LLM."""
        configs = [
            {"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10},
            {"name": "RSS", "type": "fake", "role": "supplement", "priority": 50},
        ]

        def fake_fetcher(config, run_date):
            story = {
                "title_raw": f"{config.name} Anthropic IPO",
                "summary_raw": f"{config.name} reports Anthropic IPO filing.",
                "url": f"https://example.com/{config.name}",
                "source_label": config.name,
            }
            return [story]

        def semantic_llm_call(prompt):
            # The fake model never reports a duplicate.
            verdict = {
                "duplicate_groups": [],
                "not_duplicates": [],
                "uncertain": [],
            }
            return json.dumps(verdict)

        def rewrite_llm_call(prompt):
            payload = json.loads(prompt)
            rewrites = [
                {
                    "id": entry["id"],
                    "title": "Anthropic 提交 IPO 文件",
                    "summary": "Anthropic 被报道提交 IPO 文件。",
                    "flags": [],
                }
                for entry in payload["items"]
            ]
            return json.dumps({"rewrites": rewrites}, ensure_ascii=False)

        outcome = run_stage0_to_stage4(
            configs,
            "2026-06-04",
            fetcher=fake_fetcher,
            semantic_llm_call=semantic_llm_call,
            rewrite_llm_call=rewrite_llm_call,
        )
        reports = outcome["reports"]

        self.assertEqual(len(outcome["items"]), 2)
        self.assertEqual(outcome["items"][0].title, "Anthropic 提交 IPO 文件")
        self.assertIn("stage3", reports)
        self.assertIn("stage4", reports)
        self.assertEqual(reports["stage4"]["rewritten_count"], 2)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,62 @@
import json
import unittest
from ai_daily_report.pipeline import run_stage0_to_stage5
class Stage0To5PipelineTests(unittest.TestCase):
    """Tests for ``run_stage0_to_stage5``."""

    def test_run_stage0_to_stage5_classifies_and_orders_items(self):
        """The model item is expected ahead of the capital item, one per section."""
        configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]

        def fake_fetcher(config, run_date):
            ipo_story = {
                "title_raw": "Anthropic 提交 IPO 文件",
                "summary_raw": "Anthropic 被报道提交 IPO 文件。",
                "url": "https://example.com/ipo",
                "source_label": config.name,
            }
            model_story = {
                "title_raw": "GPT-5 API 发布,延迟降低 30%",
                "summary_raw": "OpenAI 发布 GPT-5 API。",
                "url": "https://example.com/gpt5",
                "source_label": config.name,
                "section_hint": "模型发布/更新",
            }
            return [ipo_story, model_story]

        def semantic_llm_call(prompt):
            return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})

        def rewrite_llm_call(prompt):
            payload = json.loads(prompt)
            # Echo the raw title and summary back unchanged.
            rewrites = [
                {
                    "id": entry["id"],
                    "title": entry["title_raw"],
                    "summary": entry["summary_raw"],
                    "flags": [],
                }
                for entry in payload["items"]
            ]
            return json.dumps({"rewrites": rewrites}, ensure_ascii=False)

        outcome = run_stage0_to_stage5(
            configs,
            "2026-06-04",
            fetcher=fake_fetcher,
            semantic_llm_call=semantic_llm_call,
            rewrite_llm_call=rewrite_llm_call,
        )
        sections = [entry.section for entry in outcome["items"]]
        section_counts = outcome["reports"]["stage5"]["section_counts"]

        self.assertEqual(sections, ["模型与能力", "公司与资本"])
        self.assertEqual(section_counts["模型与能力"], 1)
        self.assertEqual(section_counts["公司与资本"], 1)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,75 @@
import json
import unittest
from ai_daily_report.pipeline import run_stage0_to_stage6
class Stage0To6PipelineTests(unittest.TestCase):
    """Tests for ``run_stage0_to_stage6``."""

    def test_run_stage0_to_stage6_generates_guide(self):
        """The guide returned by the fake LLM is surfaced in the pipeline result."""
        configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]

        def fake_fetcher(config, run_date):
            story = {
                "title_raw": "GPT-5 API 发布",
                "summary_raw": "OpenAI 发布 GPT-5 API。",
                "url": "https://example.com/gpt5",
                "source_label": config.name,
                "section_hint": "模型发布/更新",
            }
            return [story]

        def semantic_llm_call(prompt):
            return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})

        def rewrite_llm_call(prompt):
            payload = json.loads(prompt)
            rewrites = [
                {
                    "id": entry["id"],
                    "title": entry["title_raw"],
                    "summary": entry["summary_raw"],
                    "flags": [],
                }
                for entry in payload["items"]
            ]
            return json.dumps({"rewrites": rewrites}, ensure_ascii=False)

        def guide_llm_call(prompt):
            payload = json.loads(prompt)
            # Reference the first item so the thread points at a real id.
            item_id = payload["items"][0]["id"]
            thread = {
                "title": "模型能力更新",
                "text": "GPT-5 API 发布,体现模型能力继续产品化。",
                "item_ids": [item_id],
                "kind": "thread",
            }
            guide = {"theme": "模型 API 能力继续更新。", "threads": [thread]}
            return json.dumps(guide, ensure_ascii=False)

        outcome = run_stage0_to_stage6(
            configs,
            "2026-06-04",
            fetcher=fake_fetcher,
            semantic_llm_call=semantic_llm_call,
            rewrite_llm_call=rewrite_llm_call,
            guide_llm_call=guide_llm_call,
        )
        guide = outcome["guide"]

        self.assertEqual(guide["theme"], "模型 API 能力继续更新。")
        self.assertEqual(len(guide["threads"]), 1)
        self.assertTrue(outcome["reports"]["stage6"]["theme_present"])
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,76 @@
import json
import unittest
from ai_daily_report.pipeline import run_stage0_to_stage7
class Stage0To7PipelineTests(unittest.TestCase):
    """Tests for ``run_stage0_to_stage7``."""

    def test_run_stage0_to_stage7_assembles_markdown(self):
        """The result markdown contains the expected headings and no blocking errors."""
        configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]

        def fake_fetcher(config, run_date):
            story = {
                "title_raw": "GPT-5 API 发布",
                "summary_raw": "OpenAI 发布 GPT-5 API。",
                "url": "https://example.com/gpt5",
                "source_label": "OpenAIBlog",
                "section_hint": "模型发布/更新",
            }
            return [story]

        def semantic_llm_call(prompt):
            return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})

        def rewrite_llm_call(prompt):
            payload = json.loads(prompt)
            rewrites = [
                {
                    "id": entry["id"],
                    "title": entry["title_raw"],
                    "summary": entry["summary_raw"],
                    "flags": [],
                }
                for entry in payload["items"]
            ]
            return json.dumps({"rewrites": rewrites}, ensure_ascii=False)

        def guide_llm_call(prompt):
            payload = json.loads(prompt)
            item_id = payload["items"][0]["id"]
            thread = {
                "title": "模型能力产品化",
                "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。",
                "item_ids": [item_id],
                "kind": "thread",
            }
            guide = {"theme": "模型 API 能力继续更新。", "threads": [thread]}
            return json.dumps(guide, ensure_ascii=False)

        outcome = run_stage0_to_stage7(
            configs,
            "2026-06-04",
            fetcher=fake_fetcher,
            semantic_llm_call=semantic_llm_call,
            rewrite_llm_call=rewrite_llm_call,
            guide_llm_call=guide_llm_call,
        )
        markdown = outcome["markdown"]

        for heading in ("## 导览", "## 模型与能力", "## 今日脉络"):
            self.assertIn(heading, markdown)
        self.assertEqual(outcome["reports"]["stage7"]["blocking_errors"], [])
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,79 @@
import json
import unittest
from ai_daily_report.pipeline import run_stage0_to_stage8
class Stage0To8PipelineTests(unittest.TestCase):
    """Tests for ``run_stage0_to_stage8``."""

    def test_run_stage0_to_stage8_dry_run_publishes_report(self):
        """A dry run with ``client=None`` reports status ok and the expected blog URL."""
        configs = [{"name": "AI HOT", "type": "fake", "role": "primary", "priority": 10}]

        def fake_fetcher(config, run_date):
            story = {
                "title_raw": "GPT-5 API 发布",
                "summary_raw": "OpenAI 发布 GPT-5 API。",
                "url": "https://example.com/gpt5",
                "source_label": "OpenAIBlog",
                "section_hint": "模型发布/更新",
            }
            return [story]

        def semantic_llm_call(prompt):
            return json.dumps({"duplicate_groups": [], "not_duplicates": [], "uncertain": []})

        def rewrite_llm_call(prompt):
            payload = json.loads(prompt)
            rewrites = [
                {
                    "id": entry["id"],
                    "title": entry["title_raw"],
                    "summary": entry["summary_raw"],
                    "flags": [],
                }
                for entry in payload["items"]
            ]
            return json.dumps({"rewrites": rewrites}, ensure_ascii=False)

        def guide_llm_call(prompt):
            payload = json.loads(prompt)
            item_id = payload["items"][0]["id"]
            thread = {
                "title": "模型能力产品化",
                "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。",
                "item_ids": [item_id],
                "kind": "thread",
            }
            guide = {"theme": "模型 API 能力继续更新。", "threads": [thread]}
            return json.dumps(guide, ensure_ascii=False)

        outcome = run_stage0_to_stage8(
            configs,
            "2026-06-04",
            fetcher=fake_fetcher,
            semantic_llm_call=semantic_llm_call,
            rewrite_llm_call=rewrite_llm_call,
            guide_llm_call=guide_llm_call,
            mode="dry-run",
            base_url="https://blog.example",
            client=None,
        )
        publish = outcome["publish"]
        reports = outcome["reports"]

        self.assertEqual(publish.status, "ok")
        self.assertEqual(publish.blog_url, "https://blog.example/posts/ai-2026-06-04")
        self.assertIn("stage8", reports)
        self.assertEqual(reports["stage8"]["status"], "ok")
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,85 @@
import unittest
from ai_daily_report.models import SourceResult
from ai_daily_report.normalize import canonicalize_url, normalize_items, normalize_title
class Stage1NormalizeTests(unittest.TestCase):
    """Tests for ``canonicalize_url``, ``normalize_title`` and ``normalize_items``."""

    RUN_DATE = "2026-06-04"

    def test_canonicalize_url_removes_tracking_and_normalizes_x_host(self):
        """Tracking parameters, the fragment and the trailing slash go; twitter.com becomes x.com."""
        noisy = "HTTPS://Twitter.com/OpenAI/status/123/?utm_source=newsletter&fbclid=abc#fragment"
        self.assertEqual(canonicalize_url(noisy), "https://x.com/OpenAI/status/123")

    def test_normalize_items_builds_news_items_with_ids_and_norms(self):
        """A raw entry becomes one item with an id, a canonical URL and cleaned text."""
        raw_entry = {
            "title_raw": " GPT-5 发布:速度提升 2x ",
            "summary_raw": " <p>OpenAI 发布更新。</p> ",
            "url": "https://openai.com/blog/gpt-5?utm_campaign=test",
            "source_label": "OpenAIBlog",
            "section_hint": "模型发布/更新",
        }
        source_result = SourceResult(
            source="AI HOT",
            role="primary",
            ok=True,
            status="ok",
            items=[raw_entry],
        )

        items, report = normalize_items([source_result], run_date=self.RUN_DATE)

        self.assertEqual(len(items), 1)
        normalized = items[0]
        self.assertTrue(normalized.id.startswith("item_"))
        self.assertEqual(normalized.canonical_url, "https://openai.com/blog/gpt-5")
        self.assertEqual(normalized.title_norm, normalize_title("GPT-5 发布:速度提升 2x"))
        self.assertEqual(normalized.summary_raw, "OpenAI 发布更新。")
        self.assertEqual(normalized.source_role, "primary")
        self.assertEqual(report["input_count"], 1)
        self.assertEqual(report["output_count"], 1)

    def test_normalize_items_marks_quality_flags_without_dropping_item(self):
        """An entry with empty fields is kept and tagged with quality flags."""
        source_result = SourceResult(
            source="RSS",
            role="supplement",
            ok=True,
            status="ok",
            items=[{"title_raw": "", "summary_raw": "", "url": ""}],
        )

        items, report = normalize_items([source_result], run_date=self.RUN_DATE)

        self.assertEqual(len(items), 1)
        flags = items[0].quality_flags
        for expected_flag in ("missing_url", "missing_summary", "short_title"):
            self.assertIn(expected_flag, flags)
        self.assertEqual(report["quality_flag_counts"]["missing_url"], 1)

    def test_normalize_items_keeps_ids_unique_for_same_canonical_url(self):
        """Two entries sharing a canonical URL still receive different ids."""
        with_tracking = {
            "title_raw": "OpenAI 发布 GPT-5",
            "summary_raw": "summary a",
            "url": "https://example.com/news?utm_source=a",
        }
        without_tracking = {
            "title_raw": "OpenAI 发布 GPT-5",
            "summary_raw": "summary b",
            "url": "https://example.com/news",
        }
        source_result = SourceResult(
            source="AI HOT",
            role="primary",
            ok=True,
            status="ok",
            items=[with_tracking, without_tracking],
        )

        items, _report = normalize_items([source_result], run_date=self.RUN_DATE)

        distinct_ids = {entry.id for entry in items}
        self.assertEqual(len(distinct_ids), 2)
        self.assertEqual(items[0].canonical_url, items[1].canonical_url)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,63 @@
import unittest
from ai_daily_report.dedupe import hard_dedup_items
from ai_daily_report.models import NewsItem
def item(
    item_id,
    title,
    title_norm,
    url,
    canonical_url,
    source_group="AI HOT",
    source_label="AI HOT",
    source_priority=100,
    summary="summary",
):
    """Build a ``NewsItem`` fixture for the hard de-duplication tests.

    The source role is derived from ``source_group``: "primary" for
    "AI HOT", "supplement" for anything else.
    """
    role = "primary" if source_group == "AI HOT" else "supplement"
    fields = {
        "id": item_id,
        "source_group": source_group,
        "source_label": source_label,
        "source_role": role,
        "source_priority": source_priority,
        "title_raw": title,
        "title_norm": title_norm,
        "summary_raw": summary,
        "url": url,
        "canonical_url": canonical_url,
    }
    return NewsItem(**fields)
class Stage2DedupeTests(unittest.TestCase):
    """Tests for ``hard_dedup_items``."""

    def test_hard_dedup_merges_same_canonical_url_and_keeps_better_item(self):
        """Items sharing a canonical URL collapse to "b"; the dropped source is recorded."""
        rss_copy = item(
            "a",
            "OpenAI 发布 GPT-5",
            "openai发布gpt5",
            "https://example.com/a?utm_source=x",
            "https://example.com/a",
            source_group="RSS",
            source_priority=50,
            summary="short",
        )
        aihot_copy = item(
            "b",
            "OpenAI 发布 GPT-5",
            "openai发布gpt5",
            "https://example.com/a",
            "https://example.com/a",
            source_group="AI HOT",
            source_priority=10,
            summary="longer summary",
        )

        deduped, report = hard_dedup_items([rss_copy, aihot_copy])

        kept_ids = [entry.id for entry in deduped]
        self.assertEqual(kept_ids, ["b"])
        self.assertEqual(report["input_count"], 2)
        self.assertEqual(report["output_count"], 1)
        self.assertEqual(report["removed_count"], 1)
        self.assertEqual(report["groups"][0]["reason"], "same_canonical_url")
        self.assertEqual(deduped[0].duplicate_sources[0]["source_group"], "RSS")

    def test_hard_dedup_marks_similar_titles_without_removing(self):
        """Similar titles on different URLs are only reported as possible duplicates."""
        first = item(
            "a",
            "Grok API 上线 Cloudflare Gateway",
            "grokapi上线cloudflaregateway",
            "https://x.com/a",
            "https://x.com/a",
        )
        second = item(
            "b",
            "Grok 模型登陆 Cloudflare AI Gateway",
            "grok模型登陆cloudflareaigateway",
            "https://x.com/b",
            "https://x.com/b",
        )

        deduped, report = hard_dedup_items([first, second])

        possible = report["possible_duplicates"]
        self.assertEqual(len(deduped), 2)
        self.assertEqual(report["removed_count"], 0)
        self.assertEqual(len(possible), 1)
        self.assertEqual(set(possible[0]["item_ids"]), {"a", "b"})
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,129 @@
import json
import unittest
from ai_daily_report.models import NewsItem
from ai_daily_report.semantic_dedupe import semantic_dedup_items
def news_item(item_id, title, source_group="AI HOT"):
    """Build a ``NewsItem`` fixture for the semantic de-duplication tests.

    "AI HOT" items are primary with priority 10; every other group is a
    supplement with priority 50. URL and summary are derived from the
    id and title.
    """
    is_primary = source_group == "AI HOT"
    link = f"https://example.com/{item_id}"
    return NewsItem(
        id=item_id,
        source_group=source_group,
        source_label=source_group,
        source_role="primary" if is_primary else "supplement",
        source_priority=10 if is_primary else 50,
        title_raw=title,
        title_norm=title.lower(),
        summary_raw=f"{title} summary",
        url=link,
        canonical_url=link,
    )
class Stage3SemanticDedupeTests(unittest.TestCase):
    """Tests for ``semantic_dedup_items`` driven by a canned LLM response."""

    def test_semantic_dedup_removes_only_high_confidence_duplicates(self):
        """A high-confidence group inside the candidates removes "b" and keeps "a"."""
        items = [
            news_item("a", "Anthropic 提交 IPO 招股书", "AI HOT"),
            news_item("b", "刚刚Anthropic 提交了招股书", "量子位"),
            news_item("c", "Grok 上线 Cloudflare Gateway", "AI HOT"),
        ]
        candidates = [{"item_ids": ["a", "b"], "reason": "title_similarity"}]
        verdict = {
            "duplicate_groups": [
                {
                    "keep_id": "a",
                    "remove_ids": ["b"],
                    "confidence": "high",
                    "reason": "same IPO filing event",
                }
            ],
            "not_duplicates": [],
            "uncertain": [],
        }

        def llm_call(prompt):
            return json.dumps(verdict)

        deduped, report = semantic_dedup_items(items, candidates, llm_call=llm_call)

        kept_ids = [entry.id for entry in deduped]
        self.assertEqual(kept_ids, ["a", "c"])
        self.assertEqual(report["removed_count"], 1)
        self.assertEqual(report["duplicate_groups"][0]["reason"], "same IPO filing event")
        self.assertEqual(deduped[0].duplicate_sources[0]["id"], "b")

    def test_semantic_dedup_skips_deletion_when_ratio_exceeds_limit(self):
        """Removing two of three items exceeds the 0.5 limit, so nothing is removed."""
        items = [
            news_item("a", "A"),
            news_item("b", "B"),
            news_item("c", "C"),
        ]
        candidates = [{"item_ids": ["a", "b", "c"], "reason": "llm_candidate"}]
        verdict = {
            "duplicate_groups": [
                {
                    "keep_id": "a",
                    "remove_ids": ["b", "c"],
                    "confidence": "high",
                    "reason": "too broad",
                }
            ],
            "not_duplicates": [],
            "uncertain": [],
        }

        def llm_call(prompt):
            return json.dumps(verdict)

        deduped, report = semantic_dedup_items(
            items,
            candidates,
            llm_call=llm_call,
            max_deletion_ratio=0.5,
        )

        self.assertEqual(len(deduped), 3)
        self.assertEqual(report["removed_count"], 0)
        self.assertTrue(report["skipped_for_deletion_ratio"])

    def test_semantic_dedup_ignores_groups_outside_candidate_sets(self):
        """A group not covered by any candidate set is rejected and logged as an error."""
        items = [
            news_item("a", "Suno 完成融资"),
            news_item("b", "Suno 完成 D 轮融资"),
            news_item("c", "Ideogram 发布 v4"),
            news_item("d", "OpenClaw 发布新版"),
        ]
        candidates = [{"item_ids": ["a", "b"], "reason": "title_similarity"}]
        inside_candidates = {
            "keep_id": "a",
            "remove_ids": ["b"],
            "confidence": "high",
            "reason": "same Suno event",
        }
        outside_candidates = {
            "keep_id": "c",
            "remove_ids": ["d"],
            "confidence": "high",
            "reason": "not part of candidates",
        }
        verdict = {
            "duplicate_groups": [inside_candidates, outside_candidates],
            "not_duplicates": [],
            "uncertain": [],
        }

        def llm_call(prompt):
            return json.dumps(verdict)

        deduped, report = semantic_dedup_items(items, candidates, llm_call=llm_call)

        kept_ids = [entry.id for entry in deduped]
        self.assertEqual(kept_ids, ["a", "c", "d"])
        self.assertEqual(report["removed_count"], 1)
        self.assertIn("group_outside_candidates", report["errors"][0])
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,96 @@
import json
import unittest
from ai_daily_report.models import NewsItem
from ai_daily_report.rewrite import rewrite_items
def news_item(item_id="a"):
    """Build a fixed English-language ``NewsItem`` fixture; only the id varies."""
    link = "https://example.com/a"
    fields = dict(
        id=item_id,
        source_group="AI HOT",
        source_label="AI HOT",
        source_role="primary",
        source_priority=10,
        title_raw="OpenAI launches GPT-5 API",
        title_norm="openailaunchesgpt5api",
        summary_raw="OpenAI launched the GPT-5 API with better latency.",
        url=link,
        canonical_url=link,
    )
    return NewsItem(**fields)
class Stage4RewriteTests(unittest.TestCase):
    """Tests for ``rewrite_items`` with a stubbed LLM callable."""

    def test_rewrite_items_writes_display_fields_without_overwriting_raw(self):
        """Rewritten text lands in ``title``/``summary``; ``title_raw`` is untouched."""
        reply = {
            "rewrites": [
                {
                    "id": "a",
                    "title": "OpenAI 发布 GPT-5 API",
                    "summary": "OpenAI 发布 GPT-5 API延迟表现更好。",
                    "flags": [],
                }
            ]
        }

        def llm_call(prompt):
            return json.dumps(reply, ensure_ascii=False)

        rewritten, report = rewrite_items([news_item("a")], llm_call=llm_call, batch_size=10)
        first = rewritten[0]

        self.assertEqual(first.title, "OpenAI 发布 GPT-5 API")
        self.assertEqual(first.summary, "OpenAI 发布 GPT-5 API延迟表现更好。")
        self.assertEqual(first.title_raw, "OpenAI launches GPT-5 API")
        self.assertEqual(report["rewritten_count"], 1)
        self.assertEqual(report["fallback_count"], 0)

    def test_rewrite_items_falls_back_when_llm_fails(self):
        """When the LLM raises, the raw title and summary are used and the error is reported."""

        def llm_call(prompt):
            raise TimeoutError("slow")

        rewritten, report = rewrite_items([news_item("a")], llm_call=llm_call, batch_size=10)
        first = rewritten[0]

        self.assertEqual(first.title, "OpenAI launches GPT-5 API")
        self.assertEqual(first.summary, "OpenAI launched the GPT-5 API with better latency.")
        self.assertEqual(report["rewritten_count"], 0)
        self.assertEqual(report["fallback_count"], 1)
        self.assertIn("TimeoutError", report["errors"][0])

    def test_rewrite_items_retries_failed_batch_as_single_items(self):
        """A batch that returns invalid JSON is retried one item at a time."""
        items = [news_item("a"), news_item("b")]
        calls = []

        def llm_call(prompt):
            payload = json.loads(prompt)
            ids = [entry["id"] for entry in payload["items"]]
            calls.append(ids)
            if len(ids) > 1:
                # Multi-item batches always get an unparseable reply.
                return "not json"
            only_id = ids[0]
            single = {
                "id": only_id,
                "title": f"title {only_id}",
                "summary": f"summary {only_id}",
                "flags": [],
            }
            return json.dumps({"rewrites": [single]})

        rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=2)

        titles = [entry.title for entry in rewritten]
        self.assertEqual(titles, ["title a", "title b"])
        self.assertEqual(report["rewritten_count"], 2)
        self.assertEqual(report["fallback_count"], 0)
        self.assertEqual(calls, [["a", "b"], ["a"], ["b"]])
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,61 @@
import unittest
from ai_daily_report.classify import SECTION_ORDER, classify_and_order_items
from ai_daily_report.models import NewsItem
def news_item(item_id, title, summary="", section_hint="", source_priority=50):
    """Build a ``NewsItem`` fixture for the classification tests.

    An empty ``summary`` falls back to "<title> summary"; the same text is
    used for both the raw and the display summary.
    """
    text = summary or f"{title} summary"
    link = f"https://example.com/{item_id}"
    return NewsItem(
        id=item_id,
        source_group="AI HOT",
        source_label="AI HOT",
        source_role="primary",
        source_priority=source_priority,
        title_raw=title,
        title_norm=title.lower(),
        summary_raw=text,
        title=title,
        summary=text,
        url=link,
        canonical_url=link,
        section_hint=section_hint,
    )
class Stage5ClassifyTests(unittest.TestCase):
    """Tests for ``classify_and_order_items``."""

    def test_classify_maps_legacy_section_hints_to_new_sections(self):
        """A legacy section hint is translated to a section listed in ``SECTION_ORDER``."""
        hinted = news_item("a", "GPT-5 发布", section_hint="模型发布/更新")

        classified, report = classify_and_order_items([hinted])

        self.assertEqual(classified[0].section, "模型与能力")
        self.assertEqual(report["hint_classified"], 1)
        self.assertIn("模型与能力", SECTION_ORDER)

    def test_classify_uses_rules_when_hint_is_missing(self):
        """Without a hint, both items are classified by rules."""
        items = [
            news_item("a", "Anthropic 提交 IPO 文件", summary="Anthropic 计划上市并提交文件。"),
            news_item("b", "MCP SDK 发布新版", summary="开发者可用新版 SDK 构建工具。"),
        ]

        classified, report = classify_and_order_items(items)

        section_of = {entry.id: entry.section for entry in classified}
        self.assertEqual(section_of["a"], "公司与资本")
        self.assertEqual(section_of["b"], "开发与基础设施")
        self.assertEqual(report["rule_classified"], 2)

    def test_classify_orders_items_by_local_rank_score_within_sections(self):
        """Within one section, the "high" item is expected ahead of the "low" one."""
        items = [
            news_item("low", "普通模型更新", section_hint="模型发布/更新", source_priority=80),
            news_item("high", "GPT-5 API 发布,延迟降低 30%", section_hint="模型发布/更新", source_priority=10),
        ]

        classified, report = classify_and_order_items(items)

        ordered_ids = [entry.id for entry in classified]
        self.assertEqual(ordered_ids, ["high", "low"])
        self.assertEqual(report["section_counts"]["模型与能力"], 2)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,77 @@
import json
import unittest
from ai_daily_report.guide import generate_guide
from ai_daily_report.models import NewsItem
def news_item(item_id, title, section="模型与能力"):
    """Build an already-classified ``NewsItem`` fixture for the guide tests."""
    text = f"{title} summary"
    link = f"https://example.com/{item_id}"
    return NewsItem(
        id=item_id,
        source_group="AI HOT",
        source_label="AI HOT",
        source_role="primary",
        source_priority=10,
        title_raw=title,
        title_norm=title.lower(),
        summary_raw=text,
        title=title,
        summary=text,
        url=link,
        canonical_url=link,
        section=section,
    )
class Stage6GuideTests(unittest.TestCase):
    """Tests for ``generate_guide`` with a stubbed LLM callable."""

    def test_generate_guide_returns_theme_and_valid_threads(self):
        """A thread that references an unknown item id is dropped; the valid one is kept."""
        items = [
            news_item("a", "GPT-5 API 发布"),
            news_item("b", "Miso One 开源语音模型"),
        ]
        valid_thread = {
            "title": "模型能力继续推进",
            "text": "GPT-5 API 和 Miso One 分别代表 API 能力和语音模型更新。",
            "item_ids": ["a", "b"],
            "kind": "thread",
        }
        # "missing" does not match any id in ``items``.
        invalid_thread = {
            "title": "无效脉络",
            "text": "这条引用了不存在的条目。",
            "item_ids": ["missing"],
            "kind": "thread",
        }
        reply = {
            "theme": "模型能力继续向 API 和实时语音两端推进。",
            "threads": [valid_thread, invalid_thread],
        }

        def llm_call(prompt):
            return json.dumps(reply, ensure_ascii=False)

        guide, report = generate_guide(items, llm_call=llm_call)

        self.assertEqual(guide["theme"], "模型能力继续向 API 和实时语音两端推进。")
        self.assertEqual(len(guide["threads"]), 1)
        self.assertEqual(guide["threads"][0]["item_ids"], ["a", "b"])
        self.assertEqual(report["dropped_thread_count"], 1)

    def test_generate_guide_falls_back_when_llm_fails(self):
        """When the LLM raises, the guide is empty and the report flags the fallback."""

        def llm_call(prompt):
            raise TimeoutError("slow")

        guide, report = generate_guide([news_item("a", "GPT-5 API 发布")], llm_call=llm_call)

        self.assertEqual(guide["theme"], "")
        self.assertEqual(guide["threads"], [])
        self.assertTrue(report["fallback_used"])
        self.assertIn("TimeoutError", report["errors"][0])
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,65 @@
import unittest
from ai_daily_report.assemble import assemble_markdown, validate_markdown
from ai_daily_report.models import NewsItem
def news_item(item_id, title, section):
    """Build a ``NewsItem`` fixture labelled "OpenAIBlog" for the assembly tests."""
    text = f"{title} summary"
    link = f"https://example.com/{item_id}"
    return NewsItem(
        id=item_id,
        source_group="AI HOT",
        source_label="OpenAIBlog",
        source_role="primary",
        source_priority=10,
        title_raw=title,
        title_norm=title.lower(),
        summary_raw=text,
        title=title,
        summary=text,
        url=link,
        canonical_url=link,
        section=section,
    )
class Stage7AssembleTests(unittest.TestCase):
    """Tests for ``assemble_markdown`` and ``validate_markdown``."""

    def test_assemble_markdown_renders_sections_and_daily_threads(self):
        """Guide, numbered items and threads are rendered; quote and ``[1]`` noise is removed."""
        items = [
            news_item("a", "GPT-5 API 发布", "模型与能力"),
            news_item("b", "Anthropic 提交 IPO 文件", "公司与资本"),
        ]
        thread = {
            "title": "模型能力产品化",
            "text": "GPT-5 API 发布,说明模型能力继续进入产品入口。",
            "item_ids": ["a"],
            "kind": "thread",
        }
        guide = {
            "theme": "> 模型和资本两条线都在推进。[1]",
            "threads": [thread],
        }

        rendered, report = assemble_markdown(items, guide)

        expected_fragments = (
            "## 导览",
            "> 模型和资本两条线都在推进。",
            "## 模型与能力",
            "**1. GPT-5 API 发布**",
            "**2. Anthropic 提交 IPO 文件**",
            "## 今日脉络",
            "- **模型能力产品化**",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, rendered)
        for fragment in ("> >", "[1]"):
            self.assertNotIn(fragment, rendered)
        self.assertEqual(report["item_count"], 2)
        self.assertEqual(report["blocking_errors"], [])

    def test_validate_markdown_blocks_empty_report(self):
        """Empty markdown with no items produces both blocking errors."""
        blocking = validate_markdown("", [])["blocking_errors"]

        self.assertIn("no_items", blocking)
        self.assertIn("markdown_too_short", blocking)
# Run the tests in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,76 @@
import unittest
from ai_daily_report.publish import publish_markdown
class FakeBlogClient:
    """In-memory stand-in for a blog client that records what it was asked to do."""

    def __init__(self):
        # Both stay None until the corresponding method is called, which lets
        # tests assert that a call did or did not happen.
        self.created_payload = self.published_slug = None

    def create_post(self, payload):
        """Remember the payload and answer with a fixed slug."""
        self.created_payload = payload
        response = {"slug": "ai-2026-06-04"}
        return response

    def publish_post(self, slug):
        """Remember which slug was requested for publishing."""
        self.published_slug = slug
class Stage8PublishTests(unittest.TestCase):
    """Tests for publish_markdown in dry-run, blocked and publish scenarios."""

    @staticmethod
    def _publish(markdown, mode, blocking_errors, client):
        # Every scenario shares the same post metadata; only the body, the
        # mode, the validation outcome and the client differ between tests.
        return publish_markdown(
            title="AI日报 · 2026-06-04",
            markdown=markdown,
            tags=["AI日报"],
            slug="ai-2026-06-04",
            base_url="https://blog.example",
            mode=mode,
            markdown_report={"blocking_errors": blocking_errors},
            client=client,
        )

    def test_publish_markdown_dry_run_does_not_call_client(self):
        # No client object is supplied at all for the dry run.
        outcome = self._publish("## 导览\n\n> ok", "dry-run", [], None)

        self.assertEqual(outcome.status, "ok")
        self.assertEqual(outcome.mode, "dry-run")
        self.assertEqual(outcome.blog_url, "https://blog.example/posts/ai-2026-06-04")
        self.assertTrue(outcome.public_ok)

    def test_publish_markdown_blocks_when_markdown_has_errors(self):
        # A blocking validation error must stop the post from being created.
        fake_client = FakeBlogClient()
        outcome = self._publish("bad", "publish", ["markdown_too_short"], fake_client)

        self.assertEqual(outcome.status, "blocked")
        self.assertIsNone(fake_client.created_payload)
        self.assertIn("markdown_too_short", outcome.error)

    def test_publish_markdown_publish_mode_calls_client(self):
        # With a clean report the fake should see both the create and the
        # publish call.
        fake_client = FakeBlogClient()
        outcome = self._publish("## 导览\n\n> ok", "publish", [], fake_client)

        self.assertEqual(outcome.status, "ok")
        self.assertEqual(fake_client.created_payload["title"], "AI日报 · 2026-06-04")
        self.assertEqual(fake_client.published_slug, "ai-2026-06-04")
        self.assertEqual(outcome.blog_url, "https://blog.example/posts/ai-2026-06-04")
# Run the tests in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()

14
tests/test_validate.py Normal file
View File

@@ -0,0 +1,14 @@
import unittest
from ai_daily_report.validate import validate_report_markdown
class ValidateTests(unittest.TestCase):
    """Tests for the validate_report_markdown entry point."""

    def test_validate_report_markdown_delegates_markdown_checks(self):
        # Empty markdown with no items is expected to report "no_items".
        blocking_errors = validate_report_markdown("", [])["blocking_errors"]
        self.assertIn("no_items", blocking_errors)
# Run the tests in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()