Reduce LLM rewrite calls and add report intro and conclusion
This commit is contained in:
@@ -35,13 +35,28 @@ def _source_link(item: NewsItem) -> str:
|
|||||||
return source
|
return source
|
||||||
|
|
||||||
|
|
||||||
|
def _fallback_intro(items: list[NewsItem]) -> str:
|
||||||
|
count = len(items)
|
||||||
|
return f"今天共聚合 {count} 条 AI 动态,覆盖模型能力、产品应用、基础设施、资本与治理等方向。"
|
||||||
|
|
||||||
|
|
||||||
|
def _fallback_conclusion(items: list[NewsItem]) -> str:
|
||||||
|
sections = [section for section in SECTION_ORDER if any(item.section == section for item in items)]
|
||||||
|
if sections:
|
||||||
|
return "总体看,今日 AI 动态主要集中在" + "、".join(sections[:4]) + "等方向,后续仍需持续观察落地进展。"
|
||||||
|
return "总体看,今日 AI 动态仍在持续演进,后续需要关注产品落地和生态变化。"
|
||||||
|
|
||||||
|
|
||||||
def assemble_markdown(items: list[NewsItem], guide: dict[str, Any] | None = None) -> tuple[str, dict[str, Any]]:
|
def assemble_markdown(items: list[NewsItem], guide: dict[str, Any] | None = None) -> tuple[str, dict[str, Any]]:
|
||||||
guide = guide or {"theme": "", "threads": []}
|
guide = guide or {"intro": "", "theme": "", "threads": [], "conclusion": ""}
|
||||||
lines: list[str] = []
|
lines: list[str] = []
|
||||||
|
|
||||||
|
intro = _ensure_sentence(str(guide.get("intro") or "")) or _fallback_intro(items)
|
||||||
|
lines.extend(["## 引言", "", f"> {intro}", ""])
|
||||||
|
|
||||||
theme = _clean_text(str(guide.get("theme") or ""))
|
theme = _clean_text(str(guide.get("theme") or ""))
|
||||||
if theme:
|
if theme:
|
||||||
lines.extend(["## 导览", "", f"> {theme}", ""])
|
lines.extend(["## 导览", "", f"> {_ensure_sentence(theme)}", ""])
|
||||||
|
|
||||||
item_number = 1
|
item_number = 1
|
||||||
for section in SECTION_ORDER:
|
for section in SECTION_ORDER:
|
||||||
@@ -72,6 +87,9 @@ def assemble_markdown(items: list[NewsItem], guide: dict[str, Any] | None = None
|
|||||||
continue
|
continue
|
||||||
lines.extend([f"- **{title}**", f" {text}", ""])
|
lines.extend([f"- **{title}**", f" {text}", ""])
|
||||||
|
|
||||||
|
conclusion = _ensure_sentence(str(guide.get("conclusion") or "")) or _fallback_conclusion(items)
|
||||||
|
lines.extend(["## 总结", "", f"> {conclusion}", ""])
|
||||||
|
|
||||||
markdown = "\n".join(lines).strip()
|
markdown = "\n".join(lines).strip()
|
||||||
report = validate_markdown(markdown, items)
|
report = validate_markdown(markdown, items)
|
||||||
return markdown, report
|
return markdown, report
|
||||||
|
|||||||
@@ -23,8 +23,10 @@ def _clean_text(text: str, limit: int | None = None) -> str:
|
|||||||
def _build_prompt(items: list[NewsItem]) -> str:
|
def _build_prompt(items: list[NewsItem]) -> str:
|
||||||
payload = {
|
payload = {
|
||||||
"task": (
|
"task": (
|
||||||
"Generate a concise AI daily report guide. Return JSON only. Do not use 强信号/中信号/待验证. "
|
"Generate a concise Chinese AI daily report guide. Return JSON only. "
|
||||||
"Use a short theme and 2-4 daily threads. Every thread must reference existing item_ids."
|
"Do not use 强信号/中信号/待验证. Do not add facts. "
|
||||||
|
"Write one opening intro, a short theme, 2-4 daily threads, and one closing conclusion. "
|
||||||
|
"Every thread must reference existing item_ids."
|
||||||
),
|
),
|
||||||
"items": [
|
"items": [
|
||||||
{
|
{
|
||||||
@@ -37,6 +39,7 @@ def _build_prompt(items: list[NewsItem]) -> str:
|
|||||||
for item in items
|
for item in items
|
||||||
],
|
],
|
||||||
"output_schema": {
|
"output_schema": {
|
||||||
|
"intro": "one opening paragraph under 160 Chinese characters",
|
||||||
"theme": "one sentence under 120 Chinese characters",
|
"theme": "one sentence under 120 Chinese characters",
|
||||||
"threads": [
|
"threads": [
|
||||||
{
|
{
|
||||||
@@ -46,23 +49,27 @@ def _build_prompt(items: list[NewsItem]) -> str:
|
|||||||
"kind": "thread|uncertain",
|
"kind": "thread|uncertain",
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
"conclusion": "one closing paragraph under 180 Chinese characters",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
return json.dumps(payload, ensure_ascii=False)
|
return json.dumps(payload, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
|
def _empty_guide() -> dict[str, Any]:
|
||||||
|
return {"intro": "", "theme": "", "threads": [], "conclusion": ""}
|
||||||
|
|
||||||
|
|
||||||
def generate_guide(
|
def generate_guide(
|
||||||
items: list[NewsItem],
|
items: list[NewsItem],
|
||||||
*,
|
*,
|
||||||
llm_call: GuideLlmCall,
|
llm_call: GuideLlmCall,
|
||||||
) -> tuple[dict[str, Any], dict[str, Any]]:
|
) -> tuple[dict[str, Any], dict[str, Any]]:
|
||||||
if not items:
|
if not items:
|
||||||
return {
|
return _empty_guide(), {
|
||||||
"theme": "",
|
|
||||||
"threads": [],
|
|
||||||
}, {
|
|
||||||
"input_count": 0,
|
"input_count": 0,
|
||||||
|
"intro_present": False,
|
||||||
"theme_present": False,
|
"theme_present": False,
|
||||||
|
"conclusion_present": False,
|
||||||
"thread_count": 0,
|
"thread_count": 0,
|
||||||
"dropped_thread_count": 0,
|
"dropped_thread_count": 0,
|
||||||
"fallback_used": False,
|
"fallback_used": False,
|
||||||
@@ -72,12 +79,11 @@ def generate_guide(
|
|||||||
try:
|
try:
|
||||||
obj = parse_json_object(llm_call(_build_prompt(items)))
|
obj = parse_json_object(llm_call(_build_prompt(items)))
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
return {
|
return _empty_guide(), {
|
||||||
"theme": "",
|
|
||||||
"threads": [],
|
|
||||||
}, {
|
|
||||||
"input_count": len(items),
|
"input_count": len(items),
|
||||||
|
"intro_present": False,
|
||||||
"theme_present": False,
|
"theme_present": False,
|
||||||
|
"conclusion_present": False,
|
||||||
"thread_count": 0,
|
"thread_count": 0,
|
||||||
"dropped_thread_count": 0,
|
"dropped_thread_count": 0,
|
||||||
"fallback_used": True,
|
"fallback_used": True,
|
||||||
@@ -100,11 +106,15 @@ def generate_guide(
|
|||||||
kind = thread.get("kind") if thread.get("kind") in ("thread", "uncertain") else "thread"
|
kind = thread.get("kind") if thread.get("kind") in ("thread", "uncertain") else "thread"
|
||||||
threads.append({"title": title, "text": text, "item_ids": item_ids, "kind": kind})
|
threads.append({"title": title, "text": text, "item_ids": item_ids, "kind": kind})
|
||||||
|
|
||||||
|
intro = _clean_text(str(obj.get("intro") or ""), limit=160)
|
||||||
theme = _clean_text(str(obj.get("theme") or ""), limit=120)
|
theme = _clean_text(str(obj.get("theme") or ""), limit=120)
|
||||||
guide = {"theme": theme, "threads": threads}
|
conclusion = _clean_text(str(obj.get("conclusion") or ""), limit=180)
|
||||||
|
guide = {"intro": intro, "theme": theme, "threads": threads, "conclusion": conclusion}
|
||||||
report = {
|
report = {
|
||||||
"input_count": len(items),
|
"input_count": len(items),
|
||||||
|
"intro_present": bool(intro),
|
||||||
"theme_present": bool(theme),
|
"theme_present": bool(theme),
|
||||||
|
"conclusion_present": bool(conclusion),
|
||||||
"thread_count": len(threads),
|
"thread_count": len(threads),
|
||||||
"dropped_thread_count": dropped,
|
"dropped_thread_count": dropped,
|
||||||
"fallback_used": False,
|
"fallback_used": False,
|
||||||
|
|||||||
@@ -83,8 +83,9 @@ def rewrite_items(
|
|||||||
items: list[NewsItem],
|
items: list[NewsItem],
|
||||||
*,
|
*,
|
||||||
llm_call: RewriteLlmCall,
|
llm_call: RewriteLlmCall,
|
||||||
batch_size: int = 10,
|
batch_size: int = 30,
|
||||||
max_fallback_ratio: float = 0.2,
|
max_fallback_ratio: float = 0.2,
|
||||||
|
retry_single_items: bool = False,
|
||||||
) -> tuple[list[NewsItem], dict[str, Any]]:
|
) -> tuple[list[NewsItem], dict[str, Any]]:
|
||||||
rewritten_count = 0
|
rewritten_count = 0
|
||||||
fallback_count = 0
|
fallback_count = 0
|
||||||
@@ -100,6 +101,11 @@ def rewrite_items(
|
|||||||
_fallback(item)
|
_fallback(item)
|
||||||
fallback_count += 1
|
fallback_count += 1
|
||||||
continue
|
continue
|
||||||
|
if not retry_single_items:
|
||||||
|
for item in batch:
|
||||||
|
_fallback(item)
|
||||||
|
fallback_count += 1
|
||||||
|
continue
|
||||||
for item in batch:
|
for item in batch:
|
||||||
try:
|
try:
|
||||||
rewritten_count += _apply_rewrite_batch([item], llm_call)
|
rewritten_count += _apply_rewrite_batch([item], llm_call)
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ def _mock_guide_llm(prompt: str) -> str:
|
|||||||
item_ids = [item["id"] for item in payload["items"][:3]]
|
item_ids = [item["id"] for item in payload["items"][:3]]
|
||||||
return json.dumps(
|
return json.dumps(
|
||||||
{
|
{
|
||||||
|
"intro": "本地 mock 模式已生成 AI 日报,用于验证流水线。",
|
||||||
"theme": "本地 mock 模式已生成 AI 日报,用于验证流水线。",
|
"theme": "本地 mock 模式已生成 AI 日报,用于验证流水线。",
|
||||||
"threads": [
|
"threads": [
|
||||||
{
|
{
|
||||||
@@ -73,6 +74,7 @@ def _mock_guide_llm(prompt: str) -> str:
|
|||||||
"kind": "thread",
|
"kind": "thread",
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
"conclusion": "本地 mock 结果可用于确认定时任务入口和文件输出是否正常。",
|
||||||
},
|
},
|
||||||
ensure_ascii=False,
|
ensure_ascii=False,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ class Stage4RewriteTests(unittest.TestCase):
|
|||||||
self.assertEqual(report["fallback_count"], 1)
|
self.assertEqual(report["fallback_count"], 1)
|
||||||
self.assertIn("TimeoutError", report["errors"][0])
|
self.assertIn("TimeoutError", report["errors"][0])
|
||||||
|
|
||||||
def test_rewrite_items_retries_failed_batch_as_single_items(self):
|
def test_rewrite_items_can_retry_failed_batch_as_single_items_when_enabled(self):
|
||||||
items = [news_item("a"), news_item("b")]
|
items = [news_item("a"), news_item("b")]
|
||||||
calls = []
|
calls = []
|
||||||
|
|
||||||
@@ -85,13 +85,53 @@ class Stage4RewriteTests(unittest.TestCase):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=2)
|
rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=2, retry_single_items=True)
|
||||||
|
|
||||||
self.assertEqual([item.title for item in rewritten], ["title a", "title b"])
|
self.assertEqual([item.title for item in rewritten], ["title a", "title b"])
|
||||||
self.assertEqual(report["rewritten_count"], 2)
|
self.assertEqual(report["rewritten_count"], 2)
|
||||||
self.assertEqual(report["fallback_count"], 0)
|
self.assertEqual(report["fallback_count"], 0)
|
||||||
self.assertEqual(calls, [["a", "b"], ["a"], ["b"]])
|
self.assertEqual(calls, [["a", "b"], ["a"], ["b"]])
|
||||||
|
|
||||||
|
def test_rewrite_items_does_not_retry_single_items_by_default(self):
|
||||||
|
items = [news_item("a"), news_item("b")]
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
def llm_call(prompt):
|
||||||
|
payload = json.loads(prompt)
|
||||||
|
calls.append([item["id"] for item in payload["items"]])
|
||||||
|
return "not json"
|
||||||
|
|
||||||
|
rewritten, report = rewrite_items(items, llm_call=llm_call, batch_size=2)
|
||||||
|
|
||||||
|
self.assertEqual(calls, [["a", "b"]])
|
||||||
|
self.assertEqual([item.title for item in rewritten], ["OpenAI launches GPT-5 API", "OpenAI launches GPT-5 API"])
|
||||||
|
self.assertEqual(report["fallback_count"], 2)
|
||||||
|
|
||||||
|
def test_rewrite_items_defaults_to_large_batches_to_reduce_llm_requests(self):
|
||||||
|
items = [news_item(str(index)) for index in range(61)]
|
||||||
|
batch_sizes = []
|
||||||
|
|
||||||
|
def llm_call(prompt):
|
||||||
|
payload = json.loads(prompt)
|
||||||
|
batch_sizes.append(len(payload["items"]))
|
||||||
|
return json.dumps(
|
||||||
|
{
|
||||||
|
"rewrites": [
|
||||||
|
{
|
||||||
|
"id": entry["id"],
|
||||||
|
"title": entry["title_raw"],
|
||||||
|
"summary": entry["summary_raw"],
|
||||||
|
"flags": [],
|
||||||
|
}
|
||||||
|
for entry in payload["items"]
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
rewrite_items(items, llm_call=llm_call)
|
||||||
|
|
||||||
|
self.assertEqual(batch_sizes, [30, 30, 1])
|
||||||
|
|
||||||
def test_rewrite_items_does_not_retry_single_items_after_transient_http_error(self):
|
def test_rewrite_items_does_not_retry_single_items_after_transient_http_error(self):
|
||||||
items = [news_item("a"), news_item("b")]
|
items = [news_item("a"), news_item("b")]
|
||||||
calls = 0
|
calls = 0
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ def news_item(item_id, title, section="模型与能力"):
|
|||||||
|
|
||||||
|
|
||||||
class Stage6GuideTests(unittest.TestCase):
|
class Stage6GuideTests(unittest.TestCase):
|
||||||
def test_generate_guide_returns_theme_and_valid_threads(self):
|
def test_generate_guide_returns_intro_theme_threads_and_conclusion(self):
|
||||||
items = [
|
items = [
|
||||||
news_item("a", "GPT-5 API 发布"),
|
news_item("a", "GPT-5 API 发布"),
|
||||||
news_item("b", "Miso One 开源语音模型"),
|
news_item("b", "Miso One 开源语音模型"),
|
||||||
@@ -33,6 +33,7 @@ class Stage6GuideTests(unittest.TestCase):
|
|||||||
def llm_call(prompt):
|
def llm_call(prompt):
|
||||||
return json.dumps(
|
return json.dumps(
|
||||||
{
|
{
|
||||||
|
"intro": "今天的 AI 行业继续围绕模型能力、Agent 产品和基础设施演进展开。",
|
||||||
"theme": "模型能力继续向 API 和实时语音两端推进。",
|
"theme": "模型能力继续向 API 和实时语音两端推进。",
|
||||||
"threads": [
|
"threads": [
|
||||||
{
|
{
|
||||||
@@ -48,13 +49,16 @@ class Stage6GuideTests(unittest.TestCase):
|
|||||||
"kind": "thread",
|
"kind": "thread",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
"conclusion": "总体看,模型能力正在进入更多产品入口,生态竞争也在继续加速。",
|
||||||
},
|
},
|
||||||
ensure_ascii=False,
|
ensure_ascii=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
guide, report = generate_guide(items, llm_call=llm_call)
|
guide, report = generate_guide(items, llm_call=llm_call)
|
||||||
|
|
||||||
|
self.assertEqual(guide["intro"], "今天的 AI 行业继续围绕模型能力、Agent 产品和基础设施演进展开。")
|
||||||
self.assertEqual(guide["theme"], "模型能力继续向 API 和实时语音两端推进。")
|
self.assertEqual(guide["theme"], "模型能力继续向 API 和实时语音两端推进。")
|
||||||
|
self.assertEqual(guide["conclusion"], "总体看,模型能力正在进入更多产品入口,生态竞争也在继续加速。")
|
||||||
self.assertEqual(len(guide["threads"]), 1)
|
self.assertEqual(len(guide["threads"]), 1)
|
||||||
self.assertEqual(guide["threads"][0]["item_ids"], ["a", "b"])
|
self.assertEqual(guide["threads"][0]["item_ids"], ["a", "b"])
|
||||||
self.assertEqual(report["dropped_thread_count"], 1)
|
self.assertEqual(report["dropped_thread_count"], 1)
|
||||||
@@ -67,7 +71,9 @@ class Stage6GuideTests(unittest.TestCase):
|
|||||||
|
|
||||||
guide, report = generate_guide(items, llm_call=llm_call)
|
guide, report = generate_guide(items, llm_call=llm_call)
|
||||||
|
|
||||||
|
self.assertEqual(guide["intro"], "")
|
||||||
self.assertEqual(guide["theme"], "")
|
self.assertEqual(guide["theme"], "")
|
||||||
|
self.assertEqual(guide["conclusion"], "")
|
||||||
self.assertEqual(guide["threads"], [])
|
self.assertEqual(guide["threads"], [])
|
||||||
self.assertTrue(report["fallback_used"])
|
self.assertTrue(report["fallback_used"])
|
||||||
self.assertIn("TimeoutError", report["errors"][0])
|
self.assertIn("TimeoutError", report["errors"][0])
|
||||||
|
|||||||
@@ -23,12 +23,13 @@ def news_item(item_id, title, section):
|
|||||||
|
|
||||||
|
|
||||||
class Stage7AssembleTests(unittest.TestCase):
|
class Stage7AssembleTests(unittest.TestCase):
|
||||||
def test_assemble_markdown_renders_sections_and_daily_threads(self):
|
def test_assemble_markdown_renders_intro_sections_daily_threads_and_conclusion(self):
|
||||||
items = [
|
items = [
|
||||||
news_item("a", "GPT-5 API 发布", "模型与能力"),
|
news_item("a", "GPT-5 API 发布", "模型与能力"),
|
||||||
news_item("b", "Anthropic 提交 IPO 文件", "公司与资本"),
|
news_item("b", "Anthropic 提交 IPO 文件", "公司与资本"),
|
||||||
]
|
]
|
||||||
guide = {
|
guide = {
|
||||||
|
"intro": "今天的 AI 行业继续围绕模型、产品和资本展开。",
|
||||||
"theme": "> 模型和资本两条线都在推进。[1]",
|
"theme": "> 模型和资本两条线都在推进。[1]",
|
||||||
"threads": [
|
"threads": [
|
||||||
{
|
{
|
||||||
@@ -38,10 +39,12 @@ class Stage7AssembleTests(unittest.TestCase):
|
|||||||
"kind": "thread",
|
"kind": "thread",
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
"conclusion": "总体看,AI 竞争继续从单点模型能力转向产品、基础设施和资本协同。",
|
||||||
}
|
}
|
||||||
|
|
||||||
md, report = assemble_markdown(items, guide)
|
md, report = assemble_markdown(items, guide)
|
||||||
|
|
||||||
|
self.assertTrue(md.startswith("## 引言\n\n> 今天的 AI 行业继续围绕模型、产品和资本展开。"))
|
||||||
self.assertIn("## 导览", md)
|
self.assertIn("## 导览", md)
|
||||||
self.assertIn("> 模型和资本两条线都在推进。", md)
|
self.assertIn("> 模型和资本两条线都在推进。", md)
|
||||||
self.assertIn("## 模型与能力", md)
|
self.assertIn("## 模型与能力", md)
|
||||||
@@ -49,6 +52,7 @@ class Stage7AssembleTests(unittest.TestCase):
|
|||||||
self.assertIn("**2. Anthropic 提交 IPO 文件**", md)
|
self.assertIn("**2. Anthropic 提交 IPO 文件**", md)
|
||||||
self.assertIn("## 今日脉络", md)
|
self.assertIn("## 今日脉络", md)
|
||||||
self.assertIn("- **模型能力产品化**", md)
|
self.assertIn("- **模型能力产品化**", md)
|
||||||
|
self.assertTrue(md.endswith("## 总结\n\n> 总体看,AI 竞争继续从单点模型能力转向产品、基础设施和资本协同。"))
|
||||||
self.assertNotIn("> >", md)
|
self.assertNotIn("> >", md)
|
||||||
self.assertNotIn("[1]", md)
|
self.assertNotIn("[1]", md)
|
||||||
self.assertEqual(report["item_count"], 2)
|
self.assertEqual(report["item_count"], 2)
|
||||||
|
|||||||
Reference in New Issue
Block a user