Files
ai-daily-report/ai_daily_report/assemble.py

96 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import re
from typing import Any
from .classify import SECTION_ORDER
from .models import NewsItem
from .validate import validate_markdown
# Characters accepted as a valid sentence ending (checked by _ensure_sentence
# against the last character of a text). Contains the CJK full stop, the
# full-width forms of "!", "?" and ";" (U+FF01, U+FF1F, U+FF1B), and the ASCII
# ".", "!", "?" and ";". The full-width forms are written as escapes because
# they are visually confusable with their ASCII counterparts and are easily
# lost when the file is copied or normalized.
END_PUNCTUATION = "。\uff01\uff1f\uff1b.!?;"
def _clean_text(text: str) -> str:
    """Normalize a text fragment into a single clean line.

    The following are removed, in order: a wrapping Markdown code fence
    (opening fence with optional language tag, closing fence), one leading
    blockquote marker, citation placeholders such as ``[3]`` or ``[N]``, and
    the label "主线判断" followed by a colon. All whitespace runs (including
    newlines) are then collapsed to single spaces.

    Args:
        text: Raw text; ``None`` or an empty string is treated as empty.

    Returns:
        The cleaned text, or ``""`` when nothing remains.
    """
    value = (text or "").strip()
    # Strip an opening fence (with optional language tag) at the very start
    # and a closing fence at the very end of the text.
    value = re.sub(r"^```(?:\w+)?\s*\n?", "", value)
    value = re.sub(r"\n?```\s*$", "", value)
    # Only a blockquote marker at the start of the whole text is removed;
    # the pattern is not applied per line.
    value = re.sub(r"^\s*>\s*", "", value)
    value = re.sub(r"\[\d+\]|\[N\]", "", value)
    # Accept both the ASCII colon and the full-width colon (U+FF1A); the
    # latter is what Chinese-language text normally uses after a label.
    value = re.sub(r"主线判断[:\uff1a]\s*", "", value)
    return re.sub(r"\s+", " ", value).strip()
def _ensure_sentence(text: str) -> str:
    """Return cleaned *text* that ends with sentence punctuation.

    The text is first normalized with ``_clean_text``. When the result is
    non-empty and its last character is not one of ``END_PUNCTUATION``, a CJK
    full stop is appended.

    Args:
        text: Raw text to normalize.

    Returns:
        The cleaned text ending in punctuation, or ``""`` when the cleaned
        text is empty (an empty result is left empty so callers can detect
        it and substitute a fallback).
    """
    value = _clean_text(text)
    if value and value[-1] not in END_PUNCTUATION:
        # Previously an empty string was appended here, which made the
        # check above a no-op; close the sentence with a full stop instead.
        value += "。"
    return value
def _source_link(item: NewsItem) -> str:
    """Render the source attribution for *item*.

    The label is the first truthy value among ``item.source_label``,
    ``item.source_group`` and the literal "来源". When ``item.url`` is truthy
    the label is wrapped in a Markdown link to that URL; otherwise the bare
    label is returned.
    """
    label = item.source_label or item.source_group or "来源"
    return f"[{label} ↗]({item.url})" if item.url else label
def _fallback_intro(items: list[NewsItem]) -> str:
    """Build the default intro sentence, stating how many items were aggregated."""
    return f"今天共聚合 {len(items)} 条 AI 动态,覆盖模型能力、产品应用、基础设施、资本与治理等方向。"
def _fallback_conclusion(items: list[NewsItem]) -> str:
    """Build the default conclusion sentence from the sections present.

    Sections are taken from ``SECTION_ORDER`` (in that order) and kept only
    if at least one item has a matching ``section``. At most the first four
    are named in the sentence.

    Args:
        items: The news items included in the report.

    Returns:
        A sentence naming the leading sections, or a generic sentence when
        no item belongs to any section in ``SECTION_ORDER``.
    """
    present = [
        section
        for section in SECTION_ORDER
        if any(item.section == section for item in items)
    ]
    if not present:
        return "总体看,今日 AI 动态仍在持续演进,后续需要关注产品落地和生态变化。"
    # Separate the section names with the enumeration comma; joining them
    # with an empty string ran the names together into one unreadable word.
    named = "、".join(present[:4])
    return "总体看,今日 AI 动态主要集中在" + named + "等方向,后续仍需持续观察落地进展。"
def assemble_markdown(items: list[NewsItem], guide: dict[str, Any] | None = None) -> tuple[str, dict[str, Any]]:
    """Assemble the report Markdown from *items* and *guide*, then validate it.

    The document is built in this order: an intro section, an optional guide
    section (only when ``guide["theme"]`` is non-empty after cleaning), one
    section per entry of ``SECTION_ORDER`` that has at least one item, an
    optional threads section, and a conclusion section. Items are numbered
    consecutively across all sections, starting at 1.

    Args:
        items: News items to render. Items whose ``section`` is not listed in
            ``SECTION_ORDER`` are not rendered.
        guide: Optional mapping read via the keys ``"intro"``, ``"theme"``,
            ``"threads"`` and ``"conclusion"``. Each thread is read via its
            ``"title"`` and ``"text"`` keys. Missing or empty intro and
            conclusion values are replaced by generated fallback sentences.

    Returns:
        A ``(markdown, report)`` tuple, where ``report`` is the value
        returned by ``validate_markdown(markdown, items)``.
    """
    guide = guide or {"intro": "", "theme": "", "threads": [], "conclusion": ""}
    lines: list[str] = []

    # Intro: an empty cleaned intro falls through to the generated sentence.
    intro = _ensure_sentence(str(guide.get("intro") or "")) or _fallback_intro(items)
    lines.extend(["## 引言", "", f"> {intro}", ""])

    theme = _clean_text(str(guide.get("theme") or ""))
    if theme:
        lines.extend(["## 导览", "", f"> {_ensure_sentence(theme)}", ""])

    # Item sections, in SECTION_ORDER; numbering runs across sections.
    item_number = 1
    for section in SECTION_ORDER:
        section_items = [item for item in items if item.section == section]
        if not section_items:
            continue
        lines.extend([f"## {section}", ""])
        for item in section_items:
            title = _clean_text(item.title or item.title_raw)
            summary = _ensure_sentence(item.summary or item.summary_raw or "该条目暂无摘要。")
            lines.extend(
                [
                    f"**{item_number}. {title}**",
                    "",
                    f"> {summary}{_source_link(item)}",
                    "",
                ]
            )
            item_number += 1

    # Threads: render the bullets first so that the heading is emitted only
    # when at least one thread has both a title and a text. Otherwise a list
    # made up solely of incomplete threads would leave an empty section.
    thread_lines: list[str] = []
    for thread in guide.get("threads") or []:
        title = _clean_text(str(thread.get("title") or ""))
        text = _ensure_sentence(str(thread.get("text") or ""))
        if title and text:
            thread_lines.extend([f"- **{title}**", f" {text}", ""])
    if thread_lines:
        lines.extend(["## 今日脉络", "", *thread_lines])

    conclusion = _ensure_sentence(str(guide.get("conclusion") or "")) or _fallback_conclusion(items)
    lines.extend(["## 总结", "", f"> {conclusion}", ""])

    markdown = "\n".join(lines).strip()
    report = validate_markdown(markdown, items)
    return markdown, report