Refactor AI daily report pipeline
This commit is contained in:
2
ai_daily_report/__init__.py
Normal file
2
ai_daily_report/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
"""Core package for the AI daily report pipeline."""
|
||||
|
||||
77
ai_daily_report/assemble.py
Normal file
77
ai_daily_report/assemble.py
Normal file
@@ -0,0 +1,77 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from .classify import SECTION_ORDER
|
||||
from .models import NewsItem
|
||||
from .validate import validate_markdown
|
||||
|
||||
|
||||
# Characters accepted as a sentence terminator: _ensure_sentence appends "。"
# when cleaned text does not already end with one of these.
# NOTE(review): the ASCII marks "!?;" appear twice in this literal — presumably
# one set was originally full-width and got normalized in transit; confirm.
END_PUNCTUATION = "。!?;.!?;"
|
||||
|
||||
|
||||
def _clean_text(text: str) -> str:
    """Strip markdown wrappers and citation markers, then collapse whitespace.

    Applied in order: a leading code fence, a trailing code fence, a leading
    ``>`` quote marker, ``[digits]`` / ``[N]`` markers, the "主线判断" label,
    and finally runs of whitespace. A falsy *text* yields ``""``.
    """
    cleaned = (text or "").strip()
    substitutions = (
        (r"^```(?:\w+)?\s*\n?", ""),
        (r"\n?```\s*$", ""),
        (r"^\s*>\s*", ""),
        (r"\[\d+\]|\[N\]", ""),
        (r"主线判断[::]\s*", ""),
        (r"\s+", " "),
    )
    # Order matters: fences and the quote marker are anchored to the text edges.
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
|
||||
|
||||
|
||||
def _ensure_sentence(text: str) -> str:
    """Clean *text* and append "。" when a non-empty result lacks end punctuation."""
    sentence = _clean_text(text)
    if not sentence:
        # Empty input stays empty; no lone terminator is produced.
        return sentence
    if sentence[-1] in END_PUNCTUATION:
        return sentence
    return sentence + "。"
|
||||
|
||||
|
||||
def _source_link(item: NewsItem) -> str:
    """Return the item's source as a markdown link, or as plain text without a URL.

    The label is the first non-empty of ``source_label``, ``source_group`` and
    the literal "来源".
    """
    label = item.source_label or item.source_group or "来源"
    return f"[{label} ↗]({item.url})" if item.url else label
|
||||
|
||||
|
||||
def assemble_markdown(items: list[NewsItem], guide: dict[str, Any] | None = None) -> tuple[str, dict[str, Any]]:
    """Render the report markdown and validate it.

    Layout: an optional "导览" block with the guide theme, one ``##`` section
    per entry of SECTION_ORDER that has items (numbered continuously across
    sections), then an optional "今日脉络" list built from the guide threads.

    Returns:
        ``(markdown, report)`` where *report* is what ``validate_markdown``
        returns for the rendered text and *items*.
    """
    if not guide:
        guide = {"theme": "", "threads": []}
    out: list[str] = []

    theme = _clean_text(str(guide.get("theme") or ""))
    if theme:
        out += ["## 导览", "", f"> {theme}", ""]

    counter = 0
    for section in SECTION_ORDER:
        in_section = [entry for entry in items if entry.section == section]
        if in_section:
            out += [f"## {section}", ""]
        for entry in in_section:
            counter += 1
            heading = _clean_text(entry.title or entry.title_raw)
            body = _ensure_sentence(entry.summary or entry.summary_raw or "该条目暂无摘要。")
            out += [
                f"**{counter}. {heading}**",
                "",
                f"> {body}{_source_link(entry)}",
                "",
            ]

    threads = guide.get("threads", []) or []
    if threads:
        out += ["## 今日脉络", ""]
    for thread in threads:
        thread_title = _clean_text(str(thread.get("title") or ""))
        thread_text = _ensure_sentence(str(thread.get("text") or ""))
        # Threads missing either part are skipped rather than half-rendered.
        if thread_title and thread_text:
            out += [f"- **{thread_title}**", f" {thread_text}", ""]

    markdown = "\n".join(out).strip()
    return markdown, validate_markdown(markdown, items)
|
||||
109
ai_daily_report/classify.py
Normal file
109
ai_daily_report/classify.py
Normal file
@@ -0,0 +1,109 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import Counter
|
||||
from typing import Any
|
||||
|
||||
from .models import NewsItem
|
||||
|
||||
|
||||
# Canonical report sections, in the order they are rendered and sorted.
SECTION_ORDER = [
    "模型与能力", "产品与应用",
    "开发与基础设施", "公司与资本",
    "政策与安全", "论文与研究",
    "观点与教程", "人物与动态",
]
|
||||
|
||||
# Maps alternative section names (as they may arrive in a section hint) to the
# canonical names listed in SECTION_ORDER.
SECTION_ALIASES = {
    # -> 模型与能力
    "模型发布/更新": "模型与能力",
    # -> 产品与应用
    "产品发布/更新": "产品与应用",
    "产品与工具": "产品与应用",
    # -> 开发与基础设施
    "开发与工程": "开发与基础设施",
    # -> 公司与资本
    "行业动态": "公司与资本",
    "行业与公司": "公司与资本",
    # -> 论文与研究
    "论文研究": "论文与研究",
    "论文与研究": "论文与研究",
    # -> 观点与教程
    "技巧与观点": "观点与教程",
    "观点与教程": "观点与教程",
    # -> 人物与动态
    "人物与花絮": "人物与动态",
}
|
||||
|
||||
|
||||
# Keyword rules consulted by rule_classify: the first rule with any keyword
# contained in the lowercased title+summary decides the section, so the order
# of this list is the rule priority.
RULES = [
    (
        "政策与安全",
        ("监管", "政策", "安全", "风险", "滥用", "攻击", "合规", "版权"),
    ),
    (
        "论文与研究",
        ("论文", "研究", "arxiv", "cvpr", "benchmark", "评测", "实验"),
    ),
    (
        "开发与基础设施",
        ("sdk", "api", "mcp", "kubernetes", "框架", "开源", "github", "部署", "基础设施"),
    ),
    (
        "公司与资本",
        ("融资", "ipo", "上市", "招股书", "合作", "估值", "收购", "资本"),
    ),
    (
        "模型与能力",
        ("模型", "gpt", "claude", "gemini", "grok", "token", "参数", "多模态", "语音", "推理"),
    ),
    (
        "产品与应用",
        ("agent", "应用", "产品", "平台", "上线", "工具", "智能体"),
    ),
    (
        "观点与教程",
        ("教程", "观点", "方法论", "guide", "实践", "技巧"),
    ),
    (
        "人物与动态",
        ("黄仁勋", "纳德拉", "访谈", "演讲", "人物"),
    ),
]
|
||||
|
||||
|
||||
def normalize_section_hint(section_hint: str) -> str:
    """Map a raw section hint to a canonical section name.

    Returns the stripped hint when it already is a canonical section, its
    alias target when one is registered, and ``""`` otherwise.
    """
    candidate = (section_hint or "").strip()
    return candidate if candidate in SECTION_ORDER else SECTION_ALIASES.get(candidate, "")
|
||||
|
||||
|
||||
def rule_classify(item: NewsItem) -> str:
    """Pick a section by substring-matching RULES keywords against title and summary.

    Rules are tried in list order and the first hit wins; when nothing matches
    the item falls back to "公司与资本".
    """
    heading = item.title or item.title_raw
    body = item.summary or item.summary_raw
    haystack = f"{heading} {body}".lower()
    for section, keywords in RULES:
        for keyword in keywords:
            if keyword.lower() in haystack:
                return section
    return "公司与资本"
|
||||
|
||||
|
||||
def rank_score(item: NewsItem) -> int:
    """Score an item for ordering inside its section (higher ranks first).

    The score is the sum of: ``200 - source_priority`` floored at 0, +10 for a
    primary source, +10 for a canonical URL, +10 when title/summary contain a
    digit, and +5 per merged duplicate capped at 20; each quality flag
    subtracts 10.
    """
    text = f"{item.title or item.title_raw} {item.summary or item.summary_raw}"
    components = [
        max(0, 200 - item.source_priority),
        10 if item.source_role == "primary" else 0,
        10 if item.canonical_url else 0,
        10 if any(ch.isdigit() for ch in text) else 0,
        min(20, len(item.duplicate_sources) * 5) if item.duplicate_sources else 0,
    ]
    penalty = len(item.quality_flags) * 10
    return sum(components) - penalty
|
||||
|
||||
|
||||
def classify_and_order_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, Any]]:
    """Assign a section to every item (in place) and return them sorted with a report.

    A section hint that normalizes to a canonical section is used as-is;
    otherwise the keyword rules decide. Items are then sorted by section
    position, descending rank score, and title.
    """
    by_hint = 0
    by_rule = 0
    for entry in items:
        section_from_hint = normalize_section_hint(entry.section_hint)
        if not section_from_hint:
            entry.section = rule_classify(entry)
            by_rule += 1
            continue
        entry.section = section_from_hint
        by_hint += 1

    position = {name: rank for rank, name in enumerate(SECTION_ORDER)}
    unknown_position = len(SECTION_ORDER)

    def sort_key(entry: NewsItem) -> tuple[int, int, str]:
        # Sections outside SECTION_ORDER sort after every known section.
        return (
            position.get(entry.section or "", unknown_position),
            -rank_score(entry),
            entry.title or entry.title_raw,
        )

    ordered = sorted(items, key=sort_key)
    per_section = Counter(entry.section for entry in ordered if entry.section)
    invalid = sum(1 for entry in ordered if entry.section not in SECTION_ORDER)
    report = {
        "input_count": len(items),
        "section_counts": dict(per_section),
        "hint_classified": by_hint,
        "rule_classified": by_rule,
        "llm_classified": 0,
        "fallback_classified": 0,
        "invalid_section_count": invalid,
    }
    return ordered, report
|
||||
40
ai_daily_report/cli.py
Normal file
40
ai_daily_report/cli.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from .runner import run_daily_report
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the ``ai-daily-report`` parser with its single ``run`` subcommand."""
    parser = argparse.ArgumentParser(prog="ai-daily-report")
    run_command = parser.add_subparsers(dest="command").add_parser("run")
    # (flag, add_argument keyword arguments), registered in this order.
    options = (
        ("--date", {"default": "today"}),
        ("--mode", {"choices": ["dry-run", "draft", "publish"], "default": "dry-run"}),
        ("--source-mode", {"choices": ["mock", "live"], "default": "mock"}),
        ("--llm-mode", {"choices": ["mock", "live"], "default": "mock"}),
        ("--out-dir", {"default": "runs"}),
        ("--base-url", {"default": "https://blog.ephron.ren"}),
        ("--sources-path", {"default": None}),
    )
    for flag, spec in options:
        run_command.add_argument(flag, **spec)
    return parser
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 after the ``run`` subcommand completes, 2 when no subcommand was given.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    if args.command != "run":
        # Without a subcommand there is nothing to do. Exiting 0 silently would
        # make a misconfigured invocation look like a successful run.
        parser.print_help()
        return 2
    run_daily_report(
        run_date=args.date,
        mode=args.mode,
        source_mode=args.source_mode,
        llm_mode=args.llm_mode,
        out_dir=Path(args.out_dir),
        base_url=args.base_url,
        sources_path=Path(args.sources_path) if args.sources_path else None,
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
64
ai_daily_report/clients.py
Normal file
64
ai_daily_report/clients.py
Normal file
@@ -0,0 +1,64 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import urllib.request
|
||||
from typing import Any
|
||||
|
||||
|
||||
# User-Agent header value sent by fetch_text and by BlogApiClient requests.
UA = "Mozilla/5.0 (compatible; ai-daily-report/1.0)"
|
||||
|
||||
|
||||
def fetch_text(url: str, timeout_seconds: int) -> str:
    """GET *url* with the module User-Agent and return the body decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``).
    """
    request = urllib.request.Request(url)
    request.add_header("User-Agent", UA)
    with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
        raw = response.read()
    return raw.decode("utf-8", "ignore")
|
||||
|
||||
|
||||
class OpenAICompatibleClient:
    """Minimal client that POSTs one chat request to ``{base_url}/chat/completions``."""

    def __init__(self, *, api_key: str, base_url: str, model: str, timeout_seconds: int = 600):
        # Trailing slashes are trimmed so the endpoint path can be appended directly.
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.model = model
        self.timeout_seconds = timeout_seconds

    def chat(self, prompt: str) -> str:
        """Send *prompt* as a single user message and return the first choice's content, stripped."""
        body = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.2,
            "max_tokens": 8000,
        }
        request = urllib.request.Request(
            f"{self.base_url}/chat/completions",
            data=json.dumps(body, ensure_ascii=False).encode("utf-8"),
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
        )
        with urllib.request.urlopen(request, timeout=self.timeout_seconds) as response:
            raw = response.read()
        reply = json.loads(raw.decode("utf-8"))
        first_choice = reply["choices"][0]
        return first_choice["message"]["content"].strip()
|
||||
|
||||
|
||||
class BlogApiClient:
    """Small JSON-over-HTTP client for the blog service API using bearer-token auth."""

    def __init__(self, *, base_url: str, token: str, timeout_seconds: int = 25):
        # Trailing slashes are trimmed because request paths start with "/".
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.timeout_seconds = timeout_seconds

    def _request(self, method: str, path: str, payload: dict[str, Any] | None = None) -> dict[str, Any]:
        """Issue one request and return the decoded JSON response.

        Args:
            method: HTTP method name.
            path: Path appended verbatim to ``base_url``.
            payload: Optional JSON-serializable body; when given it is sent as
                UTF-8 JSON with a matching Content-Type header.

        Returns:
            The parsed JSON body, or ``{}`` when the response body is empty.
        """
        data = None
        headers = {"Authorization": f"Bearer {self.token}", "User-Agent": UA}
        if payload is not None:
            data = json.dumps(payload, ensure_ascii=False).encode("utf-8")
            headers["Content-Type"] = "application/json"
        req = urllib.request.Request(f"{self.base_url}{path}", data=data, headers=headers, method=method)
        with urllib.request.urlopen(req, timeout=self.timeout_seconds) as response:
            body = response.read()
        # An empty body (e.g. an HTTP 204 reply) is not valid JSON; return an
        # empty mapping instead of raising on a request that succeeded.
        if not body.strip():
            return {}
        return json.loads(body.decode("utf-8"))

    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST *payload* to ``/api/service/posts`` and return the response JSON."""
        return self._request("POST", "/api/service/posts", payload)

    def publish_post(self, slug: str) -> None:
        """POST to the publish endpoint of the post identified by *slug*."""
        self._request("POST", f"/api/service/posts/{slug}/publish")
|
||||
95
ai_daily_report/collect.py
Normal file
95
ai_daily_report/collect.py
Normal file
@@ -0,0 +1,95 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime, timezone
|
||||
from time import perf_counter
|
||||
from typing import Callable, Iterable, Any
|
||||
|
||||
from .models import SourceConfig, SourceResult
|
||||
|
||||
|
||||
# Signature of a source fetcher: it is called as fetcher(config, run_date) and
# returns the raw item dicts for that source.
Fetcher = Callable[[SourceConfig, str], list[dict[str, Any]]]
|
||||
|
||||
|
||||
def _status_from_exception(exc: Exception) -> str:
    """Classify a fetch failure as ``"timeout"`` or ``"error"``.

    A timeout is recognized on the exception itself, on its ``reason``
    attribute (urllib wraps connection failures in ``URLError(reason=...)``),
    and on its explicit ``__cause__``.
    """
    import socket

    # socket.timeout is an alias of TimeoutError on recent Pythons; both are
    # listed so older interpreters classify socket timeouts too.
    timeout_types = (TimeoutError, socket.timeout)
    candidates = (exc, getattr(exc, "reason", None), exc.__cause__)
    if any(isinstance(candidate, timeout_types) for candidate in candidates):
        return "timeout"
    return "error"
|
||||
|
||||
|
||||
def _collect_one(config: SourceConfig, run_date: str, fetcher: Fetcher) -> SourceResult:
    """Run *fetcher* for one source and wrap the outcome in a SourceResult.

    Disabled sources are not fetched and get status "disabled". A fetch that
    returns a falsy value gets status "empty". Any exception is captured as a
    failed result instead of propagating.
    """
    stamp = datetime.now(timezone.utc).isoformat()
    identity = {"source": config.name, "role": config.role, "fetched_at": stamp}

    if not config.enabled:
        return SourceResult(ok=False, status="disabled", **identity)

    def elapsed_since(start: float) -> int:
        # Milliseconds elapsed since *start*, truncated to an int.
        return int((perf_counter() - start) * 1000)

    begun = perf_counter()
    try:
        fetched = fetcher(config, run_date)
        took = elapsed_since(begun)
        outcome = "ok" if fetched else "empty"
        return SourceResult(
            ok=outcome == "ok",
            status=outcome,
            items=fetched,
            elapsed_ms=took,
            **identity,
        )
    except Exception as exc:
        # Deliberate catch-all: one failing source must not abort the collection.
        took = elapsed_since(begun)
        return SourceResult(
            ok=False,
            status=_status_from_exception(exc),
            error=f"{type(exc).__name__}: {exc}",
            elapsed_ms=took,
            **identity,
        )
|
||||
|
||||
|
||||
def collect_sources(
    configs: Iterable[SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    max_workers: int | None = None,
) -> tuple[list[SourceResult], dict[str, Any]]:
    """Fetch every configured source concurrently.

    Args:
        configs: Source configurations; results come back in the same order.
        run_date: Passed through to *fetcher*.
        fetcher: Callable invoked as ``fetcher(config, run_date)``.
        max_workers: Thread pool size; defaults to ``min(8, number of configs)``.

    Returns:
        ``(results, report)``. The report always carries the same keys, also
        for an empty *configs*.
    """
    ordered_configs = list(configs)
    if not ordered_configs:
        return [], {
            "input_source_count": 0,
            "ok_source_count": 0,
            "failed_source_count": 0,
            "raw_item_count": 0,
            "source_counts": {},
            "statuses": {},
        }

    workers = max_workers or min(8, len(ordered_configs))
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # Futures are kept by position rather than by config name, so two
        # configs sharing a name no longer overwrite each other's result.
        futures = [
            executor.submit(_collect_one, config, run_date, fetcher)
            for config in ordered_configs
        ]
        results = [future.result() for future in futures]

    # A fetcher may return None; count that as zero items instead of raising.
    item_counts = [len(result.items or []) for result in results]
    report = {
        "input_source_count": len(results),
        "ok_source_count": sum(1 for result in results if result.ok),
        "failed_source_count": sum(1 for result in results if not result.ok),
        "raw_item_count": sum(item_counts),
        # NOTE: these two maps are keyed by source name, so duplicate names
        # still collapse to the last entry here.
        "source_counts": {result.source: count for result, count in zip(results, item_counts)},
        "statuses": {result.source: result.status for result in results},
    }
    return results, report
|
||||
19
ai_daily_report/config.py
Normal file
19
ai_daily_report/config.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .models import SourceConfig
|
||||
from .pipeline import _source_config_from_dict
|
||||
|
||||
|
||||
def load_json(path: Path) -> Any:
    """Parse the UTF-8 JSON file at *path* and return the decoded value."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def load_source_configs(path: Path) -> list[SourceConfig]:
    """Load the sources file at *path* and convert each entry to a SourceConfig.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    entries = load_json(path)
    if isinstance(entries, list):
        return list(map(_source_config_from_dict, entries))
    raise ValueError("sources config must be a list")
|
||||
100
ai_daily_report/dedupe.py
Normal file
100
ai_daily_report/dedupe.py
Normal file
@@ -0,0 +1,100 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
from typing import Any
|
||||
|
||||
from .models import NewsItem
|
||||
|
||||
|
||||
def _item_score(item: NewsItem) -> int:
    """Score how good a copy of a story *item* is; the highest score is kept on merge.

    Sum of: ``200 - source_priority`` floored at 0, +20 for a canonical URL,
    the raw summary length capped at 40, +10 for a section hint and +10 for a
    primary source; each quality flag subtracts 10.
    """
    components = (
        max(0, 200 - item.source_priority),
        20 if item.canonical_url else 0,
        min(40, len(item.summary_raw)) if item.summary_raw else 0,
        10 if item.section_hint else 0,
        10 if item.source_role == "primary" else 0,
    )
    return sum(components) - 10 * len(item.quality_flags)
|
||||
|
||||
|
||||
def _merge_group(group: list[NewsItem], reason: str) -> tuple[NewsItem, list[NewsItem], dict[str, Any]]:
    """Keep the best-scoring item of *group* and record the others on it.

    Returns:
        ``(kept, removed, report_entry)``. ``kept.duplicate_sources`` gains one
        record per removed item, tagged with *reason*.
    """
    winner = max(group, key=_item_score)
    losers = [candidate for candidate in group if candidate is not winner]
    winner.duplicate_sources.extend(
        {
            "id": loser.id,
            "source_group": loser.source_group,
            "source_label": loser.source_label,
            "url": loser.url,
            "reason": reason,
        }
        for loser in losers
    )
    summary = {
        "reason": reason,
        "keep_id": winner.id,
        "removed_ids": [loser.id for loser in losers],
        "confidence": "high",
    }
    return winner, losers, summary
|
||||
|
||||
|
||||
def _group_by_key(items: list[NewsItem], key_name: str) -> dict[str, list[NewsItem]]:
    """Bucket *items* by attribute *key_name*, returning only buckets of two or more.

    Items whose attribute value is falsy are not bucketed.
    """
    buckets: dict[str, list[NewsItem]] = {}
    for item in items:
        value = getattr(item, key_name)
        if not value:
            continue
        if value in buckets:
            buckets[value].append(item)
        else:
            buckets[value] = [item]
    return {value: members for value, members in buckets.items() if len(members) >= 2}
|
||||
|
||||
|
||||
def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
    """List item pairs whose normalized titles are at least 0.65 similar.

    Every unordered pair is compared with ``difflib.SequenceMatcher``; pairs
    where either normalized title is empty are skipped.
    """
    pairs = (
        (left, right)
        for position, left in enumerate(items)
        for right in items[position + 1 :]
    )
    matches: list[dict[str, Any]] = []
    for left, right in pairs:
        if not (left.title_norm and right.title_norm):
            continue
        similarity = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
        if similarity < 0.65:
            continue
        matches.append(
            {
                "item_ids": [left.id, right.id],
                "reason": "title_similarity",
                "similarity": round(similarity, 3),
                "confidence": "medium",
            }
        )
    return matches
|
||||
|
||||
|
||||
def hard_dedup_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, Any]]:
    """Merge exact duplicates and return the survivors with a report.

    Two passes run in order: items sharing a canonical URL, then items sharing
    a normalized title. Each group keeps one item (see ``_merge_group``).
    Remaining near-duplicates are only reported under "possible_duplicates".
    """
    pool = list(items)
    dropped: set[int] = set()  # id() of every item merged away so far
    merge_log: list[dict[str, Any]] = []
    passes = (
        ("canonical_url", "same_canonical_url"),
        ("title_norm", "same_title_norm"),
    )

    for attribute, reason in passes:
        survivors = [item for item in pool if id(item) not in dropped]
        for members in _group_by_key(survivors, attribute).values():
            live = [member for member in members if id(member) not in dropped]
            if len(live) >= 2:
                _, losers, entry = _merge_group(live, reason)
                dropped.update(map(id, losers))
                merge_log.append(entry)

    deduped = [item for item in pool if id(item) not in dropped]
    report = {
        "input_count": len(items),
        "output_count": len(deduped),
        "removed_count": len(dropped),
        "groups": merge_log,
        "possible_duplicates": _possible_duplicates(deduped),
    }
    return deduped, report
|
||||
143
ai_daily_report/env.py
Normal file
143
ai_daily_report/env.py
Normal file
@@ -0,0 +1,143 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Absolute path of the directory that contains this module's package
# directory (two levels above this file); load_env reads ".env" from here.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
def read_env_file(env_path: Path) -> dict[str, str]:
    """Parse a dotenv-style file into a dict.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped. A
    leading ``export`` keyword on a line is dropped, and surrounding double or
    single quotes are stripped from values.

    Args:
        env_path: File to read; a missing file yields an empty dict.

    Returns:
        Mapping of variable name to value, later lines overriding earlier ones.
    """
    env: dict[str, str] = {}
    if not env_path.exists():
        return env
    # utf-8-sig transparently drops a leading BOM, which would otherwise become
    # part of the first key.
    text = env_path.read_text(encoding="utf-8-sig", errors="ignore")
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        # Accept shell-style "export KEY=value" lines.
        words = key.split(None, 1)
        if len(words) == 2 and words[0] == "export":
            key = words[1].strip()
        if not key:
            continue
        env[key] = value.strip().strip('"').strip("'")
    return env
|
||||
|
||||
|
||||
def load_env() -> dict[str, str]:
    """Merge environment settings from files and the process environment.

    Precedence, lowest to highest: the project ``.env``, ``~/.hermes/.env``,
    then non-empty variables of ``os.environ``.
    """
    merged: dict[str, str] = {}
    for source in (PROJECT_ROOT / ".env", Path.home() / ".hermes" / ".env"):
        merged.update(read_env_file(source))
    for name, value in os.environ.items():
        # Empty process variables do not mask values read from files.
        if value:
            merged[name] = value
    return merged
|
||||
|
||||
|
||||
def first_env(env: dict[str, str], *names: str) -> str:
    """Return the first non-blank (stripped) value among *names* in *env*, else ``""``."""
    stripped_values = ((env.get(name) or "").strip() for name in names)
    return next((value for value in stripped_values if value), "")
|
||||
|
||||
|
||||
def _strip_yaml_comment(value: str) -> str:
    """Remove a trailing ``# comment`` from an already-stripped scalar."""
    if value.startswith("#"):
        # The whole value is a comment, e.g. "key: # note".
        return ""
    if value[:1] in ('"', "'"):
        # Quoted scalar: a '#' inside the quotes is data. Only cut a comment
        # that follows the closing quote.
        closing = value.find(value[0], 1)
        if closing != -1 and value[closing + 1 :].lstrip().startswith("#"):
            return value[: closing + 1]
        return value
    for index, char in enumerate(value):
        # In a plain scalar a '#' starts a comment only after whitespace.
        if char == "#" and index > 0 and value[index - 1] in " \t":
            return value[:index].rstrip()
    return value


def _load_simple_yaml(path: Path) -> dict[str, object]:
    """Parse a small subset of YAML: nested ``key: value`` mappings by indentation.

    Only space-indented mappings with scalar string values are understood.
    Blank lines, comment lines and lines without ``:`` are ignored; inline
    ``# comments`` are removed; surrounding quotes are stripped from values.

    Args:
        path: File to read; a missing file yields an empty dict.

    Returns:
        Nested dicts whose leaves are strings.
    """
    if not path.exists():
        return {}
    root: dict[str, object] = {}
    # Stack of (indent, mapping); the root uses -1 so it is never popped.
    stack: list[tuple[int, dict[str, object]]] = [(-1, root)]
    for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#") or ":" not in stripped:
            continue
        indent = len(raw_line) - len(raw_line.lstrip(" "))
        key, value = stripped.split(":", 1)
        key = key.strip()
        value = _strip_yaml_comment(value.strip()).strip('"').strip("'")
        # Close every mapping that is at the same or a deeper indent.
        while stack and indent <= stack[-1][0]:
            stack.pop()
        current = stack[-1][1]
        if value:
            current[key] = value
        else:
            # A key without a value opens a nested mapping.
            child: dict[str, object] = {}
            current[key] = child
            stack.append((indent, child))
    return root
|
||||
|
||||
|
||||
def _env_with_hermes(env: dict[str, str], hermes_dir: Path) -> dict[str, str]:
    """Return a new dict of *env* layered over the values in ``hermes_dir/.env``."""
    # Later unpacking wins, so entries of *env* override the file's values.
    return {**read_env_file(hermes_dir / ".env"), **env}
|
||||
|
||||
|
||||
def _provider_env_names(provider: str) -> tuple[str, str, str]:
    """Return the ``(API_KEY, BASE_URL, MODEL)`` variable names for *provider*.

    The prefix is the provider name uppercased with hyphens turned into
    underscores.
    """
    stem = provider.replace("-", "_").upper()
    api_key, base_url, model = (f"{stem}_{suffix}" for suffix in ("API_KEY", "BASE_URL", "MODEL"))
    return api_key, base_url, model
|
||||
|
||||
|
||||
def _auth_json_key(env: dict[str, str], hermes_dir: Path, provider: str) -> str:
    """Look up an API key for *provider* in ``hermes_dir/auth.json``.

    Only the first credential listed under ``credential_pool[provider]`` (or
    the provider name with hyphens replaced by underscores) is consulted. A
    ``source`` of the form ``env:NAME`` is resolved through *env* first, then
    the credential's ``access_token`` is used.

    Returns:
        The key, or ``""`` when the file is missing, unreadable, not shaped as
        expected, or holds no usable credential.
    """
    auth_path = hermes_dir / "auth.json"
    if not provider or not auth_path.exists():
        return ""
    try:
        auth = json.loads(auth_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Unreadable or malformed file (JSON and decode errors are ValueErrors).
        return ""
    if not isinstance(auth, dict):
        return ""
    pool = auth.get("credential_pool") or {}
    if not isinstance(pool, dict):
        return ""

    # dict.fromkeys drops the duplicate when the provider has no hyphen.
    for key in dict.fromkeys((provider, provider.replace("-", "_"))):
        creds = pool.get(key) or []
        if not isinstance(creds, list) or not creds:
            continue
        cred = creds[0]
        if not isinstance(cred, dict):
            continue
        source = str(cred.get("source") or "")
        if source.startswith("env:"):
            resolved = first_env(env, source[4:])
            if resolved:
                return resolved
        token = str(cred.get("access_token") or "").strip()
        if token:
            return token
    return ""
|
||||
|
||||
|
||||
def resolve_llm_config(env: dict[str, str], *, hermes_dir: Path | None = None) -> dict[str, str]:
    """Resolve the LLM api key, base URL and model name.

    Each setting is resolved independently, in this order: the generic
    ``LLM_*`` variable; the variables (and, where applicable, ``auth.json`` /
    ``config.yaml`` values) of the provider named in the hermes config; then
    the hard-coded provider fallbacks.

    Raises:
        ValueError: ``missing_llm_config: <names>`` listing every unresolved setting.
    """
    hermes_dir = hermes_dir or Path.home() / ".hermes"
    env = _env_with_hermes(env, hermes_dir)

    raw_model_section = _load_simple_yaml(hermes_dir / "config.yaml").get("model")
    model_config = raw_model_section if isinstance(raw_model_section, dict) else {}
    provider = str(model_config.get("provider") or "").strip()
    if provider:
        provider_key, provider_base_url, provider_model = _provider_env_names(provider)
    else:
        provider_key = provider_base_url = provider_model = ""

    api_key = first_env(env, "LLM_API_KEY")
    base_url = first_env(env, "LLM_BASE_URL")
    model = first_env(env, "LLM_MODEL")

    if provider:
        if not api_key:
            api_key = first_env(env, provider_key) or _auth_json_key(env, hermes_dir, provider)
        if not base_url:
            base_url = first_env(env, provider_base_url) or str(model_config.get("base_url") or "").strip()
        if not model:
            model = first_env(env, provider_model) or str(model_config.get("default") or "").strip()

    # Last resort: the first known provider variable that is set, per setting.
    api_key = api_key or first_env(env, "SUB2API_API_KEY", "XIAOMI_API_KEY", "OPENROUTER_API_KEY")
    base_url = base_url or first_env(env, "SUB2API_BASE_URL", "XIAOMI_BASE_URL", "OPENROUTER_BASE_URL")
    model = model or first_env(env, "SUB2API_MODEL", "XIAOMI_MODEL")

    resolved = {"api_key": api_key, "base_url": base_url, "model": model}
    labels = {"api_key": "LLM_API_KEY", "base_url": "LLM_BASE_URL", "model": "LLM_MODEL"}
    missing = [labels[field] for field, value in resolved.items() if not value]
    if missing:
        raise ValueError("missing_llm_config: " + ",".join(missing))
    return resolved
|
||||
|
||||
|
||||
def resolve_blog_token(env: dict[str, str]) -> str:
    """Return the blog service token from *env*, or ``""`` when none is set."""
    # Checked in this order; the first non-blank value wins.
    token_names = ("BLOG_SERVICE_TOKEN", "EPHRON_SERVICE_TOKEN")
    return first_env(env, *token_names)
|
||||
113
ai_daily_report/guide.py
Normal file
113
ai_daily_report/guide.py
Normal file
@@ -0,0 +1,113 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Callable
|
||||
|
||||
from .llm import parse_json_object
|
||||
from .models import NewsItem
|
||||
|
||||
|
||||
# Callable used by generate_guide: it receives the prompt string and returns
# the model's raw text reply.
GuideLlmCall = Callable[[str], str]
|
||||
|
||||
|
||||
def _clean_text(text: str, limit: int | None = None) -> str:
    """Remove a leading ``>`` marker and citation markers, then collapse whitespace.

    When *limit* is truthy and the result is longer, it is cut to *limit*
    characters and right-stripped.
    """
    unquoted = re.sub(r"^\s*>\s*", "", text or "").strip()
    without_refs = re.sub(r"\[\d+\]|\[N\]", "", unquoted)
    collapsed = re.sub(r"\s+", " ", without_refs).strip()
    if not limit or len(collapsed) <= limit:
        return collapsed
    return collapsed[:limit].rstrip()
|
||||
|
||||
|
||||
def _build_prompt(items: list[NewsItem]) -> str:
    """Serialize the guide-generation request for *items* as a JSON string.

    The payload carries the task instructions, one compact record per item,
    and the expected output schema. Non-ASCII text is emitted unescaped.
    """
    task = (
        "Generate a concise AI daily report guide. Return JSON only. Do not use 强信号/中信号/待验证. "
        "Use a short theme and 2-4 daily threads. Every thread must reference existing item_ids."
    )
    records = []
    for item in items:
        records.append(
            {
                "id": item.id,
                "title": item.title or item.title_raw,
                "summary": item.summary or item.summary_raw,
                "section": item.section,
                "source": item.source_label,
            }
        )
    thread_schema = {
        "title": "thread title",
        "text": "one or two sentences",
        "item_ids": ["existing item id"],
        "kind": "thread|uncertain",
    }
    output_schema = {
        "theme": "one sentence under 120 Chinese characters",
        "threads": [thread_schema],
    }
    payload = {"task": task, "items": records, "output_schema": output_schema}
    return json.dumps(payload, ensure_ascii=False)
|
||||
|
||||
|
||||
def generate_guide(
    items: list[NewsItem],
    *,
    llm_call: GuideLlmCall,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Ask the LLM for a report guide and sanitize its answer.

    Args:
        items: Items the guide may reference; with no items the LLM is not called.
        llm_call: Callable that takes the prompt and returns the raw reply.

    Returns:
        ``(guide, report)``. *guide* has ``theme`` (at most 120 characters) and
        ``threads``. Each kept thread references at least one existing item id
        and has a non-empty title and text. Threads that fail this, or that are
        not JSON objects, are counted in ``dropped_thread_count``. If the call
        or the parsing fails, an empty guide is returned with
        ``fallback_used=True`` and the error text in ``errors``.
    """

    def make_report(
        *,
        theme_present: bool = False,
        thread_count: int = 0,
        dropped: int = 0,
        fallback_used: bool = False,
        errors: list[str] | None = None,
    ) -> dict[str, Any]:
        # Single place that defines the report shape for every exit path.
        return {
            "input_count": len(items),
            "theme_present": theme_present,
            "thread_count": thread_count,
            "dropped_thread_count": dropped,
            "fallback_used": fallback_used,
            "errors": errors or [],
        }

    if not items:
        return {"theme": "", "threads": []}, make_report()

    try:
        obj = parse_json_object(llm_call(_build_prompt(items)))
        if not isinstance(obj, dict):
            raise ValueError("LLM guide output is not a JSON object")
    except Exception as exc:
        # Deliberate catch-all: the guide is optional, so any LLM or parsing
        # failure degrades to an empty guide instead of failing the run.
        return {"theme": "", "threads": []}, make_report(
            fallback_used=True,
            errors=[f"{type(exc).__name__}: {exc}"],
        )

    valid_ids = {item.id for item in items}
    raw_threads = obj.get("threads")
    if not isinstance(raw_threads, list):
        raw_threads = []

    threads: list[dict[str, Any]] = []
    dropped = 0
    for thread in raw_threads:
        # The structure comes from the model and is untrusted: validate every
        # level before calling dict/list methods on it.
        if not isinstance(thread, dict):
            dropped += 1
            continue
        raw_ids = thread.get("item_ids")
        if not isinstance(raw_ids, list):
            raw_ids = []
        item_ids = [item_id for item_id in raw_ids if isinstance(item_id, str) and item_id in valid_ids]
        if not item_ids:
            dropped += 1
            continue
        title = _clean_text(str(thread.get("title") or ""), limit=80)
        text = _clean_text(str(thread.get("text") or ""), limit=220)
        if not title or not text:
            dropped += 1
            continue
        kind = thread.get("kind") if thread.get("kind") in ("thread", "uncertain") else "thread"
        threads.append({"title": title, "text": text, "item_ids": item_ids, "kind": kind})

    theme = _clean_text(str(obj.get("theme") or ""), limit=120)
    guide = {"theme": theme, "threads": threads}
    report = make_report(
        theme_present=bool(theme),
        thread_count=len(threads),
        dropped=dropped,
    )
    return guide, report
|
||||
18
ai_daily_report/llm.py
Normal file
18
ai_daily_report/llm.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Callable
|
||||
|
||||
|
||||
# Type alias for an LLM invocation: takes a prompt string, returns the reply text.
LlmCall = Callable[[str], str]
|
||||
|
||||
|
||||
def parse_json_object(text: str) -> dict[str, Any]:
    """Extract the first JSON object found in an LLM reply.

    A code fence wrapping the whole reply is removed first. The text is then
    scanned for the first ``{`` from which a complete JSON object decodes, so
    prose before or after the object (and stray braces in that prose) is
    tolerated.

    Raises:
        ValueError: If no JSON object can be decoded from *text*.
    """
    text = re.sub(r"^```(?:json)?\s*\n?", "", text.strip())
    text = re.sub(r"\n?```\s*$", "", text)

    decoder = json.JSONDecoder()
    last_error: ValueError | None = None
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one value starting at *start* and ignores
            # whatever follows it.
            candidate, _ = decoder.raw_decode(text, start)
        except ValueError as exc:
            last_error = exc
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find("{", start + 1)
    raise ValueError("LLM output does not contain a JSON object") from last_error
|
||||
|
||||
53
ai_daily_report/models.py
Normal file
53
ai_daily_report/models.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SourceConfig:
    """Immutable description of one news source."""

    # Identity
    name: str
    type: str
    role: str = "supplement"
    priority: int = 100

    # Switches
    required: bool = False
    enabled: bool = True

    # Fetch tuning
    timeout_seconds: int = 25
    retries: int = 0
    min_items: int = 0
    url: str = ""
|
||||
|
||||
|
||||
@dataclass
|
||||
class SourceResult:
|
||||
source: str
|
||||
role: str
|
||||
ok: bool
|
||||
status: str
|
||||
items: list[dict[str, Any]] = field(default_factory=list)
|
||||
error: str | None = None
|
||||
elapsed_ms: int = 0
|
||||
retry_count: int = 0
|
||||
fetched_at: str = ""
|
||||
|
||||
|
||||
@dataclass
|
||||
class NewsItem:
|
||||
id: str
|
||||
source_group: str
|
||||
source_label: str
|
||||
source_role: str
|
||||
source_priority: int
|
||||
title_raw: str
|
||||
title_norm: str
|
||||
summary_raw: str
|
||||
url: str
|
||||
canonical_url: str
|
||||
published_at: str | None = None
|
||||
collected_at: str = ""
|
||||
origin_type: str = ""
|
||||
section_hint: str = ""
|
||||
language_hint: str = ""
|
||||
title: str | None = None
|
||||
summary: str | None = None
|
||||
section: str | None = None
|
||||
quality_flags: list[str] = field(default_factory=list)
|
||||
duplicate_sources: list[dict[str, Any]] = field(default_factory=list)
|
||||
132
ai_daily_report/normalize.py
Normal file
132
ai_daily_report/normalize.py
Normal file
@@ -0,0 +1,132 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import html
|
||||
import re
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
|
||||
|
||||
from .models import NewsItem, SourceResult
|
||||
|
||||
|
||||
# canonicalize_url drops query parameters whose lowercased name starts with
# one of these prefixes ...
TRACKING_QUERY_PREFIXES = ("utm_",)

# ... or equals one of these names.
TRACKING_QUERY_KEYS = {
    "fbclid",
    "gclid",
    "spm",
    "from",
    "ref",
}
|
||||
|
||||
|
||||
def clean_text(value: str) -> str:
    """Turn an HTML-ish snippet into single-spaced plain text.

    Entities are unescaped first, then anything that looks like a tag is
    replaced by a space, then whitespace runs are collapsed and trimmed.
    """
    unescaped = html.unescape(value or "")
    without_tags = re.sub(r"<[^>]+>", " ", unescaped)
    collapsed = re.sub(r"\s+", " ", without_tags)
    return collapsed.strip()
|
||||
|
||||
|
||||
def canonicalize_url(url: str) -> str:
    """Return a normalized form of *url* suitable for duplicate detection.

    Normalization: lowercase scheme (``https`` when absent) and host, drop a
    leading ``www.``, map ``twitter.com`` to ``x.com``, remove tracking query
    parameters, strip trailing slashes from non-root paths, and drop the
    fragment. A URL written without a scheme (``example.com/a``) is treated as
    host plus path rather than as a bare path.

    Returns:
        The canonical URL, or ``""`` for empty or whitespace-only input.
    """
    raw = (url or "").strip()
    if not raw:
        return ""
    parsed = urlparse(raw)
    if not parsed.scheme and not parsed.netloc and not raw.startswith("/"):
        # urlparse only recognizes a host after "//". Without this, the host of
        # "example.com/a" would land in the path and yield "https:///example.com/a".
        parsed = urlparse("//" + raw)

    scheme = (parsed.scheme or "https").lower()
    host = (parsed.netloc or "").lower()
    if host.startswith("www."):
        host = host[4:]
    if host == "twitter.com":
        host = "x.com"

    prefixes = tuple(TRACKING_QUERY_PREFIXES)
    query = [
        (key, value)
        for key, value in parse_qsl(parsed.query, keep_blank_values=True)
        if key.lower() not in TRACKING_QUERY_KEYS and not key.lower().startswith(prefixes)
    ]

    path = parsed.path or ""
    if len(path) > 1:
        # Keep a lone "/" as-is; otherwise trailing slashes are insignificant.
        path = path.rstrip("/")

    return urlunparse((scheme, host, path, "", urlencode(query), ""))
|
||||
|
||||
|
||||
def normalize_title(title: str) -> str:
    """Reduce a title to a comparison key.

    The title is NFKC-normalized and lowercased, then every run of characters
    that is neither a word character nor in U+4E00-U+9FFF is removed.
    """
    folded = unicodedata.normalize("NFKC", title or "").lower()
    non_word_runs = re.compile(r"[^\w\u4e00-\u9fff]+")
    return non_word_runs.sub("", folded)
|
||||
|
||||
|
||||
def _item_id(canonical_url: str, source_group: str, title_norm: str, published_at: str | None) -> str:
    """Derive a stable item id of the form ``item_<16 hex chars>``.

    The id is the truncated SHA-1 of the canonical URL. When there is no URL,
    the seed is ``source_group|title_norm|published_at`` instead.

    Args:
        published_at: May be ``None``. A non-string value (for example a
            numeric timestamp taken straight from a feed) is converted with
            ``str`` instead of raising.
    """
    if canonical_url:
        seed = canonical_url
    else:
        # str() keeps the join from failing on non-string field values.
        seed = "|".join(str(part or "") for part in (source_group, title_norm, published_at))
    digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()[:16]
    return f"item_{digest}"
|
||||
|
||||
|
||||
def _quality_flags(title: str, summary: str, url: str) -> list[str]:
    """Return the quality problems of an item, in a fixed order.

    Possible flags: ``missing_url``, ``missing_summary`` and ``short_title``
    (normalized title shorter than three characters).
    """
    checks = (
        ("missing_url", not url),
        ("missing_summary", not summary),
        ("short_title", len(normalize_title(title)) < 3),
    )
    return [flag for flag, failed in checks if failed]
|
||||
|
||||
|
||||
def normalize_items(
    source_results: list[SourceResult],
    *,
    run_date: str,
    source_priorities: dict[str, int] | None = None,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Turn raw per-source item dicts into NewsItem objects.

    For every raw item the title and summary are cleaned (``title_raw`` /
    ``summary_raw`` are preferred over ``title`` / ``summary``), the URL is
    canonicalised, quality flags are computed and an id is derived. When the
    same base id occurs again, later occurrences get a ``_<n>`` suffix
    (``_2``, ``_3``, ...).

    Args:
        source_results: Results whose ``items`` are the raw dicts.
        run_date: Copied into the report unchanged.
        source_priorities: Priority per source name; sources not listed get 100.

    Returns:
        ``(items, report)`` where the report holds ``run_date``,
        ``input_count``, ``output_count`` and ``quality_flag_counts``.
    """
    priorities = source_priorities or {}
    # One timestamp for the whole run, so all items share the same value.
    collected_at = datetime.now(timezone.utc).isoformat()
    normalized: list[NewsItem] = []
    flag_totals: Counter[str] = Counter()
    id_uses: Counter[str] = Counter()
    raw_total = 0

    for result in source_results:
        for raw in result.items:
            raw_total += 1
            title = clean_text(str(raw.get("title_raw") or raw.get("title") or ""))
            summary = clean_text(str(raw.get("summary_raw") or raw.get("summary") or ""))
            url = str(raw.get("url") or "").strip()
            canonical_url = canonicalize_url(url)
            title_norm = normalize_title(title)
            flags = _quality_flags(title, summary, canonical_url)
            flag_totals.update(flags)
            source_label = clean_text(str(raw.get("source_label") or result.source))
            published_at = raw.get("published_at")

            base_id = _item_id(canonical_url, result.source, title_norm, published_at)
            id_uses[base_id] += 1
            occurrence = id_uses[base_id]
            item_id = base_id if occurrence == 1 else f"{base_id}_{occurrence}"

            normalized.append(
                NewsItem(
                    id=item_id,
                    source_group=result.source,
                    source_label=source_label,
                    source_role=result.role,
                    source_priority=priorities.get(result.source, 100),
                    title_raw=title,
                    title_norm=title_norm,
                    summary_raw=summary,
                    url=url,
                    canonical_url=canonical_url,
                    published_at=published_at,
                    collected_at=collected_at,
                    origin_type=str(raw.get("origin_type") or ""),
                    section_hint=str(raw.get("section_hint") or ""),
                    language_hint=str(raw.get("language_hint") or ""),
                    quality_flags=flags,
                )
            )

    return normalized, {
        "run_date": run_date,
        "input_count": raw_total,
        "output_count": len(normalized),
        "quality_flag_counts": dict(flag_totals),
    }
|
||||
219
ai_daily_report/pipeline.py
Normal file
219
ai_daily_report/pipeline.py
Normal file
@@ -0,0 +1,219 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from .assemble import assemble_markdown
|
||||
from .classify import classify_and_order_items
|
||||
from .collect import Fetcher, collect_sources
|
||||
from .dedupe import hard_dedup_items
|
||||
from .guide import GuideLlmCall, generate_guide
|
||||
from .models import SourceConfig
|
||||
from .normalize import normalize_items
|
||||
from .publish import BlogClient, publish_markdown
|
||||
from .rewrite import RewriteLlmCall, rewrite_items
|
||||
from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
|
||||
|
||||
|
||||
def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
    """Build a SourceConfig from a plain mapping.

    ``name`` and ``type`` are required (KeyError when absent); every other
    field falls back to the default given here and is coerced to its
    int/bool type.
    """
    optional = value.get
    fields = {
        "name": value["name"],
        "type": value["type"],
        "role": optional("role", "supplement"),
        "priority": int(optional("priority", 100)),
        "required": bool(optional("required", False)),
        "enabled": bool(optional("enabled", True)),
        "timeout_seconds": int(optional("timeout_seconds", 25)),
        "retries": int(optional("retries", 0)),
        "min_items": int(optional("min_items", 0)),
        "url": optional("url", ""),
    }
    return SourceConfig(**fields)
|
||||
|
||||
|
||||
def run_stage0_to_stage2(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
) -> dict[str, Any]:
    """Collect sources (stage 0), normalise (stage 1) and hard-dedupe (stage 2).

    Entries of *source_configs* that are plain dicts are converted to
    SourceConfig first.

    Returns:
        A dict with ``source_results``, the deduplicated ``items`` and
        ``reports`` keyed ``stage0`` .. ``stage2``.
    """
    configs: list[SourceConfig] = []
    for entry in source_configs:
        if isinstance(entry, SourceConfig):
            configs.append(entry)
        else:
            configs.append(_source_config_from_dict(entry))

    source_results, collect_report = collect_sources(configs, run_date, fetcher=fetcher)

    priorities: dict[str, int] = {}
    for cfg in configs:
        priorities[cfg.name] = cfg.priority

    normalized_items, normalize_report = normalize_items(
        source_results,
        run_date=run_date,
        source_priorities=priorities,
    )
    deduped_items, dedupe_report = hard_dedup_items(normalized_items)

    reports = {
        "stage0": collect_report,
        "stage1": normalize_report,
        "stage2": dedupe_report,
    }
    return {"source_results": source_results, "items": deduped_items, "reports": reports}
|
||||
|
||||
|
||||
def run_stage0_to_stage4(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
) -> dict[str, Any]:
    """Run stages 0-2, then semantic de-duplication (3) and rewriting (4).

    The semantic stage receives the ``possible_duplicates`` entry of the
    stage-2 report as its candidates (an empty list when absent).

    Returns:
        A dict with ``source_results``, the rewritten ``items`` and
        ``reports`` extended by ``stage3`` and ``stage4``.
    """
    upstream = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
    upstream_reports = upstream["reports"]

    semantic_items, stage3_report = semantic_dedup_items(
        upstream["items"],
        upstream_reports["stage2"].get("possible_duplicates", []),
        llm_call=semantic_llm_call,
    )
    rewritten_items, stage4_report = rewrite_items(semantic_items, llm_call=rewrite_llm_call)

    return {
        "source_results": upstream["source_results"],
        "items": rewritten_items,
        # New dict: the stage-2 result's own reports mapping is left untouched.
        "reports": {**upstream_reports, "stage3": stage3_report, "stage4": stage4_report},
    }
|
||||
|
||||
|
||||
def run_stage0_to_stage5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
) -> dict[str, Any]:
    """Run stages 0-4, then classify and order the items (stage 5).

    Returns:
        A dict with ``source_results``, the classified ``items`` and
        ``reports`` extended by ``stage5``.
    """
    upstream = run_stage0_to_stage4(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
    )
    classified_items, stage5_report = classify_and_order_items(upstream["items"])
    return {
        "source_results": upstream["source_results"],
        "items": classified_items,
        "reports": {**upstream["reports"], "stage5": stage5_report},
    }
|
||||
|
||||
|
||||
def run_stage0_to_stage6(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
) -> dict[str, Any]:
    """Run stages 0-5, then generate the guide (stage 6).

    Returns:
        A dict with ``source_results``, the stage-5 ``items`` (unchanged),
        the generated ``guide`` and ``reports`` extended by ``stage6``.
    """
    upstream = run_stage0_to_stage5(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
    )
    items = upstream["items"]
    guide, stage6_report = generate_guide(items, llm_call=guide_llm_call)
    return {
        "source_results": upstream["source_results"],
        "items": items,
        "guide": guide,
        "reports": {**upstream["reports"], "stage6": stage6_report},
    }
|
||||
|
||||
|
||||
def run_stage0_to_stage7(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
) -> dict[str, Any]:
    """Run stages 0-6, then assemble the Markdown document (stage 7).

    Returns:
        A dict with ``source_results``, ``items``, ``guide``, the assembled
        ``markdown`` and ``reports`` extended by ``stage7``.
    """
    upstream = run_stage0_to_stage6(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
    )
    items = upstream["items"]
    guide = upstream["guide"]
    markdown, stage7_report = assemble_markdown(items, guide)
    return {
        "source_results": upstream["source_results"],
        "items": items,
        "guide": guide,
        "markdown": markdown,
        "reports": {**upstream["reports"], "stage7": stage7_report},
    }
|
||||
|
||||
|
||||
def run_stage0_to_stage8(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    mode: str,
    base_url: str,
    client: BlogClient | None,
) -> dict[str, Any]:
    """Run stages 0-7, then hand the Markdown to the publisher (stage 8).

    The post slug is ``ai-<run_date>``; the stage-7 report is passed to the
    publisher as its ``markdown_report``.

    Returns:
        A dict with ``source_results``, ``items``, ``guide``, ``markdown``,
        the PublishResult under ``publish`` and ``reports`` extended by a
        ``stage8`` summary of that result.
    """
    upstream = run_stage0_to_stage7(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
    )
    upstream_reports = upstream["reports"]
    markdown = upstream["markdown"]

    publish_result = publish_markdown(
        title=f"AI日报 · {run_date}",
        markdown=markdown,
        tags=["AI日报", "AI资讯", "人工智能"],
        slug=f"ai-{run_date}",
        base_url=base_url,
        mode=mode,
        markdown_report=upstream_reports["stage7"],
        client=client,
    )

    # The report stores plain values rather than the PublishResult object.
    summary_fields = ("mode", "status", "slug", "blog_url", "public_ok", "error")
    stage8_report = {name: getattr(publish_result, name) for name in summary_fields}

    return {
        "source_results": upstream["source_results"],
        "items": upstream["items"],
        "guide": upstream["guide"],
        "markdown": markdown,
        "publish": publish_result,
        "reports": {**upstream_reports, "stage8": stage8_report},
    }
|
||||
90
ai_daily_report/publish.py
Normal file
90
ai_daily_report/publish.py
Normal file
@@ -0,0 +1,90 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Protocol
|
||||
|
||||
|
||||
@dataclass
class PublishResult:
    """Outcome of one publishing attempt, as produced by publish_markdown()."""

    # Mode the attempt ran in ("dry-run" for dry_run_publish(), otherwise the
    # caller's mode value).
    mode: str
    # "ok", "blocked" or "failed" in the results built in this module.
    status: str
    # Slug of the post the result refers to.
    slug: str
    # "<base_url>/posts/<slug>".
    blog_url: str
    # Set by the code in this module to True for dry runs and for successful
    # "publish" runs.
    public_ok: bool = False
    # Error description for blocked/failed results; None otherwise.
    error: str | None = None
|
||||
|
||||
|
||||
class BlogClient(Protocol):
    """Structural interface publish_markdown() expects from a blog backend."""

    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Create a post from *payload* and return the backend's response mapping."""
        ...

    def publish_post(self, slug: str) -> None:
        """Publish the post identified by *slug*."""
        ...
|
||||
|
||||
|
||||
def dry_run_publish(slug: str, base_url: str) -> PublishResult:
    """Return the result of a dry run: nothing is sent anywhere.

    The result has mode ``dry-run``, status ``ok``, ``public_ok=True`` and
    the URL the post would have under *base_url*.
    """
    site_root = base_url.rstrip("/")
    return PublishResult(
        mode="dry-run",
        status="ok",
        slug=slug,
        blog_url=f"{site_root}/posts/{slug}",
        public_ok=True,
    )
|
||||
|
||||
|
||||
def publish_markdown(
    *,
    title: str,
    markdown: str,
    tags: list[str],
    slug: str,
    base_url: str,
    mode: str,
    markdown_report: dict[str, Any],
    client: BlogClient | None,
) -> PublishResult:
    """Create (and optionally publish) a blog post from assembled Markdown.

    Decision order:
      1. If *markdown_report* lists ``blocking_errors`` the post is not sent
         and the result has status ``blocked``.
      2. ``mode == "dry-run"`` returns dry_run_publish() without using the
         client.
      3. Without a client the result is ``failed`` / ``missing_blog_client``.
      4. Otherwise the post is created; it is additionally published only
         when ``mode == "publish"``.

    Backend errors never propagate: any exception from the client is turned
    into a ``failed`` result whose ``error`` is ``"<ExcType>: <message>"``.
    If the failure happens after the post was created, the result carries
    the slug and URL the backend actually assigned, so the stored post can
    still be located.

    Returns:
        A PublishResult; ``public_ok`` is True only for dry runs and for
        successful ``publish`` runs.
    """
    site_root = base_url.rstrip("/")
    blog_url = f"{site_root}/posts/{slug}"

    blocking_errors = markdown_report.get("blocking_errors", []) or []
    if blocking_errors:
        return PublishResult(
            mode=mode,
            status="blocked",
            slug=slug,
            blog_url=blog_url,
            public_ok=False,
            # str() keeps a non-string entry from raising inside join().
            error=";".join(str(entry) for entry in blocking_errors),
        )
    if mode == "dry-run":
        return dry_run_publish(slug, base_url)
    if client is None:
        return PublishResult(
            mode=mode,
            status="failed",
            slug=slug,
            blog_url=blog_url,
            public_ok=False,
            error="missing_blog_client",
        )

    payload = {"title": title, "content": markdown, "tags": tags, "slug": slug}
    # Tracks the slug the backend stored. It starts as the requested slug and
    # is updated as soon as create_post() answers, so a later publish failure
    # still reports the post that really exists.
    effective_slug = slug
    try:
        create_resp = client.create_post(payload)
        effective_slug = create_resp.get("slug") or slug
        if mode == "publish":
            client.publish_post(effective_slug)
    except Exception as exc:
        # Deliberate catch-all: the client is an external boundary and the
        # pipeline records the failure instead of crashing.
        return PublishResult(
            mode=mode,
            status="failed",
            slug=effective_slug,
            blog_url=f"{site_root}/posts/{effective_slug}",
            public_ok=False,
            error=f"{type(exc).__name__}: {exc}",
        )
    return PublishResult(
        mode=mode,
        status="ok",
        slug=effective_slug,
        blog_url=f"{site_root}/posts/{effective_slug}",
        public_ok=mode == "publish",
    )
|
||||
103
ai_daily_report/rewrite.py
Normal file
103
ai_daily_report/rewrite.py
Normal file
@@ -0,0 +1,103 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Callable
|
||||
|
||||
from .llm import parse_json_object
|
||||
from .models import NewsItem
|
||||
|
||||
|
||||
# Signature of the `llm_call` hook used for rewriting: it receives the prompt
# string and returns the response text, which is then parsed with
# parse_json_object().
RewriteLlmCall = Callable[[str], str]
|
||||
|
||||
|
||||
def _chunks(items: list[NewsItem], size: int) -> list[list[NewsItem]]:
    """Split *items* into consecutive slices of at most *size* elements."""
    batches: list[list[NewsItem]] = []
    for start in range(0, len(items), size):
        batches.append(items[start : start + size])
    return batches
|
||||
|
||||
|
||||
def _build_prompt(batch: list[NewsItem]) -> str:
    """Serialise the rewrite request for *batch* as a JSON string.

    The payload has three keys: ``task`` (the instruction text), ``items``
    (id, raw title/summary, source label and language hint of each item)
    and ``output_schema`` (an example of the expected ``rewrites`` list).
    Non-ASCII characters are written unescaped.
    """
    task = (
        "Rewrite AI news titles and summaries into concise Chinese. Preserve brand/model/API names "
        "such as GPT-5, Codex, Gemini, Claude, API, MCP. Do not add facts."
    )
    entries = []
    for news in batch:
        entries.append(
            {
                "id": news.id,
                "title_raw": news.title_raw,
                "summary_raw": news.summary_raw,
                "source": news.source_label,
                "language_hint": news.language_hint,
            }
        )
    schema = {
        "rewrites": [
            {
                "id": "item id",
                "title": "display title",
                "summary": "display summary",
                "flags": [],
            }
        ]
    }
    return json.dumps({"task": task, "items": entries, "output_schema": schema}, ensure_ascii=False)
|
||||
|
||||
|
||||
def _fallback(item: NewsItem) -> None:
    """Fill the display fields of *item* from its raw fields, in place.

    The title is copied from ``title_raw``; the summary is ``summary_raw``
    or, when that is empty, a fixed placeholder sentence.
    """
    item.title = item.title_raw
    item.summary = item.summary_raw if item.summary_raw else "该条目暂无摘要。"
|
||||
|
||||
|
||||
def _apply_rewrite_batch(batch: list[NewsItem], llm_call: RewriteLlmCall) -> int:
    """Ask the LLM to rewrite *batch* and write the results onto the items.

    An entry of the response is applied only when its id belongs to the
    batch and both its title and summary are non-empty after stripping.

    Returns:
        The number of distinct items that received a rewrite.

    Raises:
        ValueError: if ``rewrites`` is not a list, or if any item of the
            batch got no usable rewrite. Items handled before the error
            keep the title/summary already written to them.
    """
    response = parse_json_object(llm_call(_build_prompt(batch)))
    rewrites = response.get("rewrites", [])
    if not isinstance(rewrites, list):
        raise ValueError("rewrites is not a list")

    targets = {item.id: item for item in batch}
    applied: set[str] = set()
    for entry in rewrites:
        item_id = entry.get("id")
        new_title = str(entry.get("title") or "").strip()
        new_summary = str(entry.get("summary") or "").strip()
        if item_id not in targets or not new_title or not new_summary:
            continue
        target = targets[item_id]
        target.title = new_title
        target.summary = new_summary
        applied.add(item_id)

    for item in batch:
        if item.id in applied:
            continue
        raise ValueError(f"missing_rewrite_for_item: {item.id}")
    return len(applied)
|
||||
|
||||
|
||||
def rewrite_items(
    items: list[NewsItem],
    *,
    llm_call: RewriteLlmCall,
    batch_size: int = 10,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Rewrite titles and summaries of *items* in place, batch by batch.

    Each batch (at least one item per batch) is sent to the LLM once. If the
    batch call fails for any reason, every item of that batch is retried on
    its own; an item whose individual retry also fails gets its raw
    title/summary via _fallback(). No exception from the LLM path escapes.

    Returns:
        ``(items, report)``: the same list object, plus a report with
        ``input_count``, ``rewritten_count``, ``fallback_count``,
        ``batch_count`` and the collected ``errors``.
    """
    batches = _chunks(items, max(1, batch_size))
    rewritten = 0
    fallbacks = 0
    errors: list[str] = []

    for batch in batches:
        try:
            rewritten += _apply_rewrite_batch(batch, llm_call)
            continue
        except Exception as exc:
            errors.append(f"batch:{type(exc).__name__}: {exc}")

        # Batch failed: fall back to one request per item.
        for item in batch:
            try:
                rewritten += _apply_rewrite_batch([item], llm_call)
            except Exception as item_exc:
                errors.append(f"item:{item.id}:{type(item_exc).__name__}: {item_exc}")
                _fallback(item)
                fallbacks += 1

    return items, {
        "input_count": len(items),
        "rewritten_count": rewritten,
        "fallback_count": fallbacks,
        "batch_count": len(batches),
        "errors": errors,
    }
|
||||
156
ai_daily_report/runner.py
Normal file
156
ai_daily_report/runner.py
Normal file
@@ -0,0 +1,156 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import asdict, is_dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text
|
||||
from .config import load_source_configs
|
||||
from .env import load_env, resolve_blog_token, resolve_llm_config
|
||||
from .models import SourceConfig
|
||||
from .pipeline import run_stage0_to_stage8
|
||||
from .sources.registry import get_source_fetcher
|
||||
|
||||
|
||||
def _json_default(value: Any):
    """``default=`` hook for json.dumps: serialise dataclass instances.

    Returns:
        ``dataclasses.asdict(value)`` for a dataclass instance.

    Raises:
        TypeError: for every other value, including dataclass *types*.
    """
    # is_dataclass() is also true for the dataclass type itself, which
    # asdict() rejects; only instances are converted.
    if is_dataclass(value) and not isinstance(value, type):
        return asdict(value)
    raise TypeError(f"Object is not JSON serializable: {type(value).__name__}")
|
||||
|
||||
|
||||
def _mock_source_configs() -> list[SourceConfig]:
    """Return the single source configuration used in mock source mode."""
    mock_source = SourceConfig(name="Mock AI HOT", type="mock", role="primary", priority=10)
    return [mock_source]
|
||||
|
||||
|
||||
def _mock_fetcher(config: SourceConfig, run_date: str) -> list[dict[str, Any]]:
    """Return one fixed raw item; both arguments are ignored."""
    sample_item = {
        "title_raw": "GPT-5 API 发布",
        "summary_raw": "OpenAI 发布 GPT-5 API,用于本地 mock 测试。",
        "url": "https://example.com/gpt5",
        "source_label": "OpenAI:Blog",
        "section_hint": "模型发布/更新",
        "origin_type": "mock",
        "language_hint": "zh",
    }
    return [sample_item]
|
||||
|
||||
|
||||
def _mock_semantic_llm(prompt: str) -> str:
    """Return a JSON reply reporting no duplicates; *prompt* is ignored."""
    empty_verdict = {"duplicate_groups": [], "not_duplicates": [], "uncertain": []}
    return json.dumps(empty_verdict, ensure_ascii=False)
|
||||
|
||||
|
||||
def _mock_rewrite_llm(prompt: str) -> str:
    """Echo every item of the JSON *prompt* back as its own rewrite.

    The reply contains one ``rewrites`` entry per prompt item, using the raw
    title and summary unchanged and an empty ``flags`` list.
    """
    request = json.loads(prompt)
    rewrites = []
    for entry in request["items"]:
        rewrites.append(
            {
                "id": entry["id"],
                "title": entry["title_raw"],
                "summary": entry["summary_raw"],
                "flags": [],
            }
        )
    return json.dumps({"rewrites": rewrites}, ensure_ascii=False)
|
||||
|
||||
|
||||
def _mock_guide_llm(prompt: str) -> str:
    """Return a fixed guide that references the first three prompt items.

    *prompt* must be JSON with an ``items`` list whose entries have an ``id``.
    """
    request = json.loads(prompt)
    leading_ids = []
    for entry in request["items"][:3]:
        leading_ids.append(entry["id"])
    thread = {
        "title": "本地链路验证",
        "text": "采集、改写、分类、导览、Markdown 和发布报告都已通过 mock 数据串联。",
        "item_ids": leading_ids,
        "kind": "thread",
    }
    guide = {
        "theme": "本地 mock 模式已生成 AI 日报,用于验证流水线。",
        "threads": [thread],
    }
    return json.dumps(guide, ensure_ascii=False)
|
||||
|
||||
|
||||
def run_daily_report(
    *,
    run_date: str,
    mode: str,
    source_mode: str,
    llm_mode: str,
    out_dir: Path,
    base_url: str,
    sources_path: Path | None = None,
    fetch_text=None,
    env: dict[str, str] | None = None,
    llm_client_factory=OpenAICompatibleClient,
    blog_client_factory=BlogApiClient,
) -> dict[str, Any]:
    """Run the whole pipeline once and write its artefacts to disk.

    Args:
        run_date: Date string handed to every stage; also the name of the
            output sub-directory.
        mode: Publishing mode. A blog client is created only for ``draft``
            and ``publish``.
        source_mode: ``mock`` (built-in sample source) or ``live`` (sources
            loaded from *sources_path*, default ``config/sources.json``).
        llm_mode: ``mock`` (canned replies) or ``live`` (client built by
            *llm_client_factory* from the environment).
        out_dir: Parent directory of the per-run output directory.
        base_url: Blog base URL.
        fetch_text: Text fetcher for live sources; defaults to the module's
            ``default_fetch_text``.
        env: Environment mapping; loaded with load_env() when None.
        llm_client_factory, blog_client_factory: Constructors, replaceable
            by callers.

    Returns:
        A dict with ``run_dir``, ``markdown``, ``reports`` and ``publish``.

    Raises:
        ValueError: for an unknown *source_mode* or *llm_mode*, or when a
            blog token is needed but missing.
    """
    fetch_text = fetch_text or default_fetch_text
    if env is None:
        env = load_env()

    # --- sources -----------------------------------------------------------
    if source_mode not in ("mock", "live"):
        raise ValueError("source_mode must be 'mock' or 'live'")
    if source_mode == "mock":
        source_configs = _mock_source_configs()
        fetcher = _mock_fetcher
    else:
        if sources_path is None:
            sources_path = Path("config") / "sources.json"
        source_configs = load_source_configs(sources_path)

        def fetcher(config: SourceConfig, current_date: str) -> list[dict[str, Any]]:
            # Dispatch on the source type; every adapter gets the same fetch_text.
            adapter = get_source_fetcher(config.type)
            return adapter(config, current_date, fetch_text)

    # --- LLM hooks -----------------------------------------------------------
    if llm_mode not in ("mock", "live"):
        raise ValueError("llm_mode must be 'mock' or 'live'")
    if llm_mode == "mock":
        semantic_llm_call = _mock_semantic_llm
        rewrite_llm_call = _mock_rewrite_llm
        guide_llm_call = _mock_guide_llm
    else:
        llm_client = llm_client_factory(**resolve_llm_config(env))
        semantic_llm_call = llm_client.chat
        rewrite_llm_call = llm_client.chat
        guide_llm_call = llm_client.chat

    # --- blog client (only for modes that talk to the blog) ------------------
    blog_client = None
    if mode == "draft" or mode == "publish":
        token = resolve_blog_token(env)
        if not token:
            raise ValueError("missing_blog_token: set BLOG_SERVICE_TOKEN or EPHRON_SERVICE_TOKEN")
        blog_client = blog_client_factory(base_url=base_url, token=token)

    result = run_stage0_to_stage8(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
        mode=mode,
        base_url=base_url,
        client=blog_client,
    )

    # --- artefacts -----------------------------------------------------------
    run_dir = out_dir / run_date
    run_dir.mkdir(parents=True, exist_ok=True)
    markdown_path = run_dir / "blog_markdown.md"
    markdown_path.write_text(result["markdown"], encoding="utf-8")
    report_json = json.dumps(result["reports"], ensure_ascii=False, indent=2, default=_json_default)
    report_path = run_dir / "run_report.json"
    report_path.write_text(report_json, encoding="utf-8")

    return {
        "run_dir": str(run_dir),
        "markdown": result["markdown"],
        "reports": result["reports"],
        "publish": result["publish"],
    }
|
||||
167
ai_daily_report/semantic_dedupe.py
Normal file
167
ai_daily_report/semantic_dedupe.py
Normal file
@@ -0,0 +1,167 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Callable
|
||||
|
||||
from .llm import parse_json_object
|
||||
from .models import NewsItem
|
||||
|
||||
|
||||
# Signature of the `llm_call` hook used for semantic de-duplication: it
# receives the prompt string and returns the response text, which is then
# parsed with parse_json_object().
SemanticLlmCall = Callable[[str], str]
|
||||
|
||||
|
||||
def _build_prompt(items: list[NewsItem], candidates: list[dict[str, Any]]) -> str:
    """Serialise the semantic de-duplication request as a JSON string.

    The payload has four keys: ``task``, ``items`` (id, title, summary,
    source label and section hint; display title/summary are used when set,
    otherwise the raw ones), ``candidates`` (passed through unchanged) and
    ``output_schema`` (an example of the expected reply). Non-ASCII
    characters are written unescaped.
    """
    described = []
    for news in items:
        described.append(
            {
                "id": news.id,
                "title": news.title or news.title_raw,
                "summary": news.summary or news.summary_raw,
                "source": news.source_label,
                "section_hint": news.section_hint,
            }
        )
    example_group = {
        "keep_id": "item id",
        "remove_ids": ["item id"],
        "confidence": "high|medium|low",
        "reason": "same concrete event reason",
    }
    request = {
        "task": "Identify only high-confidence semantic duplicates. Do not curate or remove by importance.",
        "items": described,
        "candidates": candidates,
        "output_schema": {
            "duplicate_groups": [example_group],
            "not_duplicates": [],
            "uncertain": [],
        },
    }
    return json.dumps(request, ensure_ascii=False)
|
||||
|
||||
|
||||
def _score(item: NewsItem) -> int:
    """Rate how preferable *item* is as the survivor of a duplicate group.

    Components: ``200 - source_priority`` (never below 0), +10 for a
    ``primary`` source role, up to +40 for the raw summary length, +20 for
    having a canonical URL, and -10 per quality flag.
    """
    total = max(0, 200 - item.source_priority)
    total += 10 if item.source_role == "primary" else 0
    if item.summary_raw:
        total += min(40, len(item.summary_raw))
    total += 20 if item.canonical_url else 0
    return total - 10 * len(item.quality_flags)
|
||||
|
||||
|
||||
def _choose_keep(group_items: list[NewsItem], suggested_keep_id: str) -> NewsItem:
    """Pick the item of a duplicate group that survives.

    The suggested item wins when it is part of the group and scores at most
    10 points below the best-scoring item; otherwise the best-scoring item
    (first one on ties) is returned.
    """
    top = max(group_items, key=_score)
    preferred = next(
        (candidate for candidate in group_items if candidate.id == suggested_keep_id),
        None,
    )
    if preferred is not None and _score(preferred) >= _score(top) - 10:
        return preferred
    return top
|
||||
|
||||
|
||||
def semantic_dedup_items(
    items: list[NewsItem],
    candidates: list[dict[str, Any]],
    *,
    llm_call: SemanticLlmCall,
    max_deletion_ratio: float = 0.5,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Remove high-confidence semantic duplicates reported by the LLM.

    A group from the LLM reply is accepted only when
      * its confidence is ``"high"``,
      * all of its ids are strings that refer to *items*,
      * the whole group lies inside one candidate's ``item_ids``, and
      * it shares no id with a group accepted earlier (overlapping or
        repeated groups would otherwise attach provenance to an item that
        is itself removed, or record it twice).
    Rejected groups are described in the report's ``errors``; malformed
    entries in the reply are reported the same way instead of raising.

    For every accepted group _choose_keep() picks the survivor; each removed
    item is recorded in the survivor's ``duplicate_sources``. If the
    accepted removals exceed *max_deletion_ratio* of the input, nothing is
    removed and ``skipped_for_deletion_ratio`` is True.

    Args:
        items: Items to de-duplicate. Survivors are mutated in place.
        candidates: Candidate groups, each with an ``item_ids`` list.
        llm_call: Prompt -> response-text callable.
        max_deletion_ratio: Upper bound for removed / input items.

    Returns:
        ``(remaining_items, report)``. With no items or no candidates, or
        when the LLM call / parsing fails, *items* is returned unchanged.
    """

    def build_report(removed_count, duplicate_groups, uncertain, errors, skipped):
        # Single place that fixes the report's keys and their order.
        return {
            "input_count": len(items),
            "candidate_group_count": len(candidates),
            "removed_count": removed_count,
            "duplicate_groups": duplicate_groups,
            "uncertain": uncertain,
            "errors": errors,
            "skipped_for_deletion_ratio": skipped,
        }

    if not items or not candidates:
        return items, build_report(0, [], [], [], False)

    try:
        obj = parse_json_object(llm_call(_build_prompt(items, candidates)))
    except Exception as exc:
        # Deliberate catch-all: a failing LLM must not abort the pipeline.
        return items, build_report(0, [], [], [f"{type(exc).__name__}: {exc}"], False)
    if not isinstance(obj, dict):
        message = f"invalid_llm_response: expected object, got {type(obj).__name__}"
        return items, build_report(0, [], [], [message], False)

    errors: list[str] = []
    uncertain = obj.get("uncertain", []) or []
    by_id = {item.id: item for item in items}
    candidate_sets = {
        frozenset(item_id for item_id in candidate.get("item_ids", []) if isinstance(item_id, str))
        for candidate in candidates
    }

    raw_groups = obj.get("duplicate_groups", []) or []
    if not isinstance(raw_groups, list):
        errors.append(f"invalid_duplicate_groups: {raw_groups}")
        raw_groups = []

    claimed_ids: set[str] = set()
    candidate_removals: set[str] = set()
    valid_groups: list[dict[str, Any]] = []

    for group in raw_groups:
        if not isinstance(group, dict):
            errors.append(f"invalid_group: {group}")
            continue
        if group.get("confidence") != "high":
            continue
        remove_ids = group.get("remove_ids") or []
        if not isinstance(remove_ids, list):
            errors.append(f"invalid_ids_in_group: {group}")
            continue
        ids = [group.get("keep_id"), *remove_ids]
        if any(not isinstance(item_id, str) or item_id not in by_id for item_id in ids):
            errors.append(f"invalid_ids_in_group: {group}")
            continue
        # Drop repeated ids while keeping their order, so an item is never
        # recorded twice within one group.
        ids = list(dict.fromkeys(ids))
        group_set = frozenset(ids)
        if not any(group_set.issubset(candidate_set) for candidate_set in candidate_sets):
            errors.append(f"group_outside_candidates: {group}")
            continue
        if claimed_ids & group_set:
            errors.append(f"overlapping_group: {group}")
            continue
        claimed_ids |= group_set

        group_items = [by_id[item_id] for item_id in ids]
        keep = _choose_keep(group_items, str(group.get("keep_id")))
        remove_items = [item for item in group_items if item is not keep]
        candidate_removals.update(item.id for item in remove_items)
        valid_groups.append(
            {
                "keep_id": keep.id,
                "remove_ids": [item.id for item in remove_items],
                "confidence": "high",
                "reason": str(group.get("reason") or "semantic_duplicate"),
            }
        )

    # Safety valve: an implausibly large removal is reported, not applied.
    if len(candidate_removals) / len(items) > max_deletion_ratio:
        return items, build_report(0, valid_groups, uncertain, errors, True)

    removed_ids: set[str] = set()
    for group in valid_groups:
        keep = by_id[group["keep_id"]]
        for remove_id in group["remove_ids"]:
            removed = by_id[remove_id]
            keep.duplicate_sources.append(
                {
                    "id": removed.id,
                    "source_group": removed.source_group,
                    "source_label": removed.source_label,
                    "url": removed.url,
                    "reason": group["reason"],
                }
            )
            removed_ids.add(remove_id)

    deduped = [item for item in items if item.id not in removed_ids]
    return deduped, build_report(len(removed_ids), valid_groups, uncertain, errors, False)
|
||||
2
ai_daily_report/sources/__init__.py
Normal file
2
ai_daily_report/sources/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
"""Source adapters for the AI daily report pipeline."""
|
||||
|
||||
32
ai_daily_report/sources/aihot.py
Normal file
32
ai_daily_report/sources/aihot.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Callable
|
||||
|
||||
from ai_daily_report.models import SourceConfig
|
||||
|
||||
|
||||
# Text fetcher injected into the adapter; fetch_aihot() calls it as
# fetch_text(url, timeout_seconds) and parses the returned string as JSON.
FetchText = Callable[[str, int], str]
|
||||
|
||||
|
||||
def fetch_aihot(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Fetch the AI HOT daily JSON for *run_date* and flatten it to raw items.

    Every item of every section becomes one dict. The section's ``label`` is
    used as ``section_hint`` and the document's ``generatedAt`` as
    ``published_at`` for all items; missing fields become empty strings, and
    a missing ``sourceName`` falls back to ``config.name``.
    """
    endpoint = f"https://aihot.virxact.com/api/public/daily/{run_date}"
    payload = json.loads(fetch_text(endpoint, config.timeout_seconds))
    generated_at = payload.get("generatedAt")

    collected: list[dict[str, Any]] = []
    for section in payload.get("sections", []) or []:
        section_hint = section.get("label") or ""
        collected.extend(
            {
                "source_group": config.name,
                "source_label": entry.get("sourceName") or config.name,
                "title_raw": entry.get("title") or "",
                "summary_raw": entry.get("summary") or "",
                "url": entry.get("sourceUrl") or "",
                "published_at": generated_at,
                "origin_type": "aihot_json",
                "section_hint": section_hint,
                "language_hint": "zh",
            }
            for entry in section.get("items", []) or []
        )
    return collected
|
||||
|
||||
58
ai_daily_report/sources/juya.py
Normal file
58
ai_daily_report/sources/juya.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import Any, Callable
|
||||
|
||||
from ai_daily_report.models import SourceConfig
|
||||
from ai_daily_report.normalize import clean_text
|
||||
from ai_daily_report.sources.labels import source_label_from_url
|
||||
|
||||
|
||||
# Text fetcher injected into the adapter; fetch_juya() calls it as
# fetch_text(config.url, config.timeout_seconds) and parses the returned
# string as RSS XML.
FetchText = Callable[[str, int], str]
|
||||
|
||||
|
||||
def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]:
    """Extract the news entries of the Juya issue dated *run_date*.

    The feed item whose ``<title>`` equals *run_date* is located and its
    ``content:encoded`` HTML is split into blocks. A block starts at an
    ``<h2>`` heading that ends with ``<code>#<number></code>`` (the heading
    text may or may not be wrapped in a link) and runs up to the next
    ``<hr>``-separated heading, the closing hint paragraph, or the end of
    the document.

    For each block the item URL is the first link in the block body, or the
    heading link when the body has none; ``&amp;`` in it is decoded. The
    summary is the body text, truncated to 500 characters. Blocks without a
    title are skipped.

    Returns:
        The raw item dicts, or an empty list when the feed has no issue for
        *run_date* or that issue has no content.

    Raises:
        xml.etree.ElementTree.ParseError: if *xml_text* is not well-formed.
    """
    # NOTE(review): the Python docs state that xml.etree.ElementTree is not
    # secure against maliciously constructed data; confirm the feed is trusted.
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    raw_items = channel.findall("item") if channel is not None else []

    article_html = ""
    for raw in raw_items:
        if (raw.findtext("title") or "").strip() != run_date:
            continue
        content_el = raw.find("{http://purl.org/rss/1.0/modules/content/}encoded")
        article_html = content_el.text if content_el is not None and content_el.text else ""
        break
    if not article_html:
        return []

    # "(?:</a>)?" makes the whole closing tag optional. The former "</a>?"
    # only made the ">" optional, so headings without a link never matched.
    block_pattern = re.compile(
        r'<h2[^>]*>\s*(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?(?P<title_html>[^<]*?)(?:</a>)?\s*<code>#(?P<num>\d+)</code>\s*</h2>(?P<body>.*?)(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)',
        re.S | re.I,
    )
    items: list[dict[str, Any]] = []
    for match in block_pattern.finditer(article_html):
        title = clean_text(match.group("title_html") or "")
        body_html = match.group("body") or ""
        links = re.findall(r'<a[^>]*href="([^"]+)"[^>]*>', body_html, re.I)
        raw_url = links[0] if links else (match.group("title_url") or "")
        # href values come out of HTML source, where "&" is written "&amp;".
        url = raw_url.replace("&amp;", "&").strip()
        summary = clean_text(re.sub(r"<[^>]+>", " ", body_html))
        if not title:
            continue
        items.append(
            {
                "source_group": config.name,
                "source_label": source_label_from_url(url, fallback=config.name),
                "title_raw": title,
                "summary_raw": summary[:500],
                "url": url,
                "published_at": None,
                "origin_type": "juya_issue",
                "section_hint": "",
                "language_hint": "zh",
            }
        )
    return items
|
||||
|
||||
|
||||
def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the feed at ``config.url`` and parse the issue for *run_date*."""
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_juya_rss(config, feed_xml, run_date)
|
||||
78
ai_daily_report/sources/labels.py
Normal file
78
ai_daily_report/sources/labels.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
# Display label per domain. _domain_label() applies an entry to the domain
# itself and to its subdomains; the first matching entry in this order wins.
DOMAIN_LABELS = {
    "anthropic.com": "Anthropic",
    "arxiv.org": "arXiv",
    "bloomberg.com": "Bloomberg",
    "deepseek.com": "DeepSeek",
    "github.blog": "GitHub Blog",
    "github.com": "GitHub",
    "huggingface.co": "Hugging Face",
    "infoq.com": "InfoQ",
    "mp.weixin.qq.com": "微信公众号",
    "openai.com": "OpenAI",
    "platform.minimaxi.com": "MiniMax:Docs",
    "qbitai.com": "量子位",
    "techcrunch.com": "TechCrunch",
    "technologyreview.com": "MIT科技评论AI",
    "theverge.com": "The Verge",
    "x.com": "X",
    "twitter.com": "X",
}
|
||||
|
||||
# Display name per X handle. source_label_from_url() looks the handle up
# exactly as it appears in the URL path (case-sensitive) and shows handles
# that are not listed here unchanged.
X_DISPLAY_NAMES = {
    "MiniMax_AI": "MiniMax",
    "OpenAIDevs": "OpenAI Developers",
    "openai": "OpenAI",
    "openclaw": "OpenClaw",
    "xai": "xAI",
    "krea_ai": "Krea AI",
    "nvidia": "NVIDIA",
    "NVIDIAAI": "NVIDIA AI",
    "alibaba_cloud": "阿里云 / Alibaba Cloud",
    "cb_doge": "cb_doge",
}
|
||||
|
||||
|
||||
def _host(url: str) -> str:
    """Return the lower-cased network location of *url* without a leading ``www.``."""
    netloc = (urlparse(url).netloc or "").lower()
    if netloc.startswith("www."):
        return netloc[4:]
    return netloc
|
||||
|
||||
|
||||
def _domain_label(host: str) -> str:
    """Return the DOMAIN_LABELS label for *host*, or *host* itself.

    An entry matches when *host* equals its domain or is one of its
    subdomains; the first matching entry wins.
    """
    matches = (
        label
        for domain, label in DOMAIN_LABELS.items()
        if host == domain or host.endswith("." + domain)
    )
    return next(matches, host)
|
||||
|
||||
|
||||
def _x_handle(url: str) -> str:
|
||||
parts = [part for part in urlparse(url).path.split("/") if part]
|
||||
if not parts:
|
||||
return ""
|
||||
handle = parts[0]
|
||||
if handle in {"i", "search", "explore", "settings", "notifications", "home", "compose"}:
|
||||
return ""
|
||||
return handle
|
||||
|
||||
|
||||
def source_label_from_url(url: str, *, fallback: str = "来源") -> str:
    """Derive a human-readable source label from ``url``.

    * Empty ``url``: returns ``fallback``.
    * ``x.com`` / ``twitter.com`` URLs: returns ``"X:<display> (@<handle>)"``,
      or plain ``"X"`` when the path yields no account handle.
    * Anything else: returns the domain label, suffixed with ``":Blog"`` when
      the host contains ``blog`` or the path contains ``/blog`` or
      ``/research``.

    Args:
        url: Link to label.
        fallback: Label used when nothing can be derived from ``url``.

    Returns:
        The derived label, or ``fallback``.
    """
    if not url:
        return fallback

    host = _host(url)
    if host in {"x.com", "twitter.com"}:
        handle = _x_handle(url)
        if not handle:
            return "X"
        display = X_DISPLAY_NAMES.get(handle)
        if display is None:
            # X handles are case-insensitive, so "/openaidevs" has to resolve
            # to the same display name as "/OpenAIDevs".
            folded = handle.casefold()
            display = next(
                (name for key, name in X_DISPLAY_NAMES.items() if key.casefold() == folded),
                handle,
            )
        return f"X:{display} (@{handle})"

    label = _domain_label(host)
    path = (urlparse(url).path or "").lower()
    looks_like_blog = "blog" in host or "/blog" in path or "/research" in path
    # Do not append the suffix to a label that already ends in "Blog"
    # (github.blog -> "GitHub Blog"); it would render as "GitHub Blog:Blog".
    if label and looks_like_blog and not label.lower().endswith("blog"):
        return f"{label}:Blog"
    return label or fallback
|
||||
24
ai_daily_report/sources/registry.py
Normal file
24
ai_daily_report/sources/registry.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Callable
|
||||
|
||||
from ai_daily_report.models import SourceConfig
|
||||
from ai_daily_report.sources.aihot import fetch_aihot
|
||||
from ai_daily_report.sources.juya import fetch_juya
|
||||
from ai_daily_report.sources.rss import fetch_rss
|
||||
|
||||
|
||||
# Signature of the callables stored in SOURCE_FETCHERS: they take a
# SourceConfig, a str and a (str, int) -> str callable, and return a list of dicts.
SourceFetcher = Callable[[SourceConfig, str, Callable[[str, int], str]], list[dict]]
|
||||
|
||||
# Registry mapping a source type string to the function that fetches it.
SOURCE_FETCHERS: dict[str, SourceFetcher] = dict(
    aihot=fetch_aihot,
    rss=fetch_rss,
    juya_rss=fetch_juya,
)
|
||||
|
||||
|
||||
def get_source_fetcher(source_type: str) -> SourceFetcher:
    """Look up the fetcher registered for ``source_type``.

    Raises:
        KeyError: If ``source_type`` is not a key of ``SOURCE_FETCHERS``.
    """
    fetcher = SOURCE_FETCHERS.get(source_type)
    if fetcher is None:
        raise KeyError(f"Unknown source type: {source_type}")
    return fetcher
|
||||
|
||||
51
ai_daily_report/sources/rss.py
Normal file
51
ai_daily_report/sources/rss.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
from email.utils import parsedate_to_datetime
|
||||
from typing import Any, Callable
|
||||
|
||||
from ai_daily_report.models import SourceConfig
|
||||
from ai_daily_report.normalize import clean_text
|
||||
|
||||
|
||||
# Text fetcher injected into fetch_rss(); it is invoked as
# fetch_text(config.url, config.timeout_seconds) and its str result is parsed as XML.
FetchText = Callable[[str, int], str]
|
||||
|
||||
|
||||
def _parse_pubdate(value: str) -> str | None:
|
||||
if not value:
|
||||
return None
|
||||
try:
|
||||
return parsedate_to_datetime(value).isoformat()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _has_han_characters(text: str) -> bool:
    """Return True when ``text`` contains at least one CJK ideograph."""
    return any(
        "\u3400" <= ch <= "\u4dbf" or "\u4e00" <= ch <= "\u9fff" or "\uf900" <= ch <= "\ufaff"
        for ch in text
    )


def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]:
    """Parse an RSS 2.0 document into raw item dicts.

    Only ``<item>`` elements directly under the root's ``<channel>`` are read;
    a document without a ``<channel>`` yields an empty list. Items whose
    cleaned title is empty are skipped.

    Args:
        config: Source configuration; ``config.name`` is used for both
            ``source_group`` and ``source_label``.
        xml_text: Feed document text.
        limit: Maximum number of ``<item>`` elements inspected. Skipped items
            still count towards this limit.

    Returns:
        One dict per kept item with the keys ``source_group``,
        ``source_label``, ``title_raw``, ``summary_raw``, ``url``,
        ``published_at``, ``origin_type``, ``section_hint`` and
        ``language_hint``.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed.
    """
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    raw_items = channel.findall("item") if channel is not None else []
    items: list[dict[str, Any]] = []
    for raw in raw_items[:limit]:
        title = clean_text(raw.findtext("title") or "")
        if not title:
            continue
        summary = clean_text(raw.findtext("description") or "")
        items.append(
            {
                "source_group": config.name,
                "source_label": config.name,
                "title_raw": title,
                "summary_raw": summary,
                "url": (raw.findtext("link") or "").strip(),
                "published_at": _parse_pubdate(raw.findtext("pubDate") or ""),
                "origin_type": "rss",
                "section_hint": "",
                # Decide on Han characters rather than "any non-ASCII", so an
                # English title with typographic quotes or dashes stays "en".
                "language_hint": "zh" if _has_han_characters(title) else "en",
            }
        )
    return items
|
||||
|
||||
|
||||
def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the RSS feed described by ``config`` and parse its items.

    ``run_date`` is accepted but not used by this function.
    """
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_rss_items(config, feed_xml)
|
||||
|
||||
46
ai_daily_report/validate.py
Normal file
46
ai_daily_report/validate.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from .classify import SECTION_ORDER
|
||||
from .models import NewsItem
|
||||
|
||||
|
||||
def validate_report_markdown(markdown: str, items: list[NewsItem]) -> dict[str, Any]:
    """Validate a report; delegates unchanged to ``validate_markdown``."""
    result = validate_markdown(markdown, items)
    return result
|
||||
|
||||
|
||||
def validate_markdown(markdown: str, items: list[NewsItem]) -> dict[str, Any]:
    """Run sanity checks on report markdown and the items behind it.

    Args:
        markdown: Report text. ``None`` is treated as an empty string in every
            check.
        items: Items the report was built from. Each item's ``url``, ``id``
            and ``section`` attributes are read.

    Returns:
        A dict with ``item_count``, ``section_count`` (distinct non-empty
        sections), ``markdown_length`` and three finding lists:
        ``auto_fixes``, ``warnings`` and ``blocking_errors``.
    """
    # Normalise once so no individual check can fail on a None markdown.
    text = markdown or ""
    blocking_errors: list[str] = []
    auto_fixes: list[str] = []
    warnings: list[dict[str, str]] = []

    if not items:
        blocking_errors.append("no_items")
    if len(text.strip()) < 80:
        blocking_errors.append("markdown_too_short")
    if items and "## " not in text:
        blocking_errors.append("no_sections")
    if re.search(r"\{[^{}]*\}", text):
        blocking_errors.append("json_fragment_detected")
    if "> >" in text:
        auto_fixes.append("double_blockquote_detected")
    if re.search(r"\[\d+\]|\[N\]", text):
        auto_fixes.append("reference_marker_detected")

    # Visit every item so missing-URL warnings are complete; the section
    # error is still reported only once.
    has_invalid_section = False
    for item in items:
        if not item.url:
            warnings.append({"type": "missing_url", "item_id": item.id})
        if item.section not in SECTION_ORDER:
            has_invalid_section = True
    if has_invalid_section:
        blocking_errors.append("invalid_section")

    return {
        "item_count": len(items),
        "section_count": len({item.section for item in items if item.section}),
        "markdown_length": len(text),
        "auto_fixes": auto_fixes,
        "warnings": warnings,
        "blocking_errors": blocking_errors,
    }
|
||||
Reference in New Issue
Block a user