"""Rewrite news items through an LLM call, with fallback to the raw text."""

from __future__ import annotations

import json
from typing import Any, Callable
from urllib.error import HTTPError

from .classify import SECTION_ORDER
from .llm import parse_json_object
from .models import NewsItem

# A callable that receives the prompt string and returns the raw LLM response text.
RewriteLlmCall = Callable[[str], str]


def _chunks(items: list[NewsItem], size: int) -> list[list[NewsItem]]:
    """Split *items* into consecutive slices of at most *size* elements.

    *size* must be positive; ``range`` rejects a zero step. The caller in this
    module clamps it with ``max(1, batch_size)``.
    """
    return [items[index : index + size] for index in range(0, len(items), size)]


def _build_prompt(batch: list[NewsItem]) -> str:
    """Serialize the rewrite/classify request for *batch* as a JSON string.

    The payload carries the task description, the allowed sections, per-section
    guidance, the raw fields of every item, and the expected output schema.
    Non-ASCII characters are emitted verbatim (``ensure_ascii=False``).
    """
    payload = {
        "task": (
            "For each AI news item, translate when needed, rewrite the title and summary into concise Chinese, "
            "and classify it into exactly one allowed section. Preserve brand/model/API names such as GPT-5, "
            "Codex, Gemini, Claude, API, MCP. Do not add facts."
        ),
        "allowed_sections": SECTION_ORDER,
        "section_guidance": {
            "模型与能力": "model releases, capability upgrades, modalities, context windows, inference, benchmarks tied to model ability",
            "产品与应用": "end-user products, apps, agents, workflows, product launches, practical business or consumer use cases",
            "开发与基础设施": "developer tools, APIs, SDKs, MCP, frameworks, deployment, chips, cloud, infra, open source engineering",
            "公司与资本": "company strategy, financing, IPO, acquisitions, partnerships, revenue, business competition",
            "政策与安全": "policy, regulation, safety, privacy, copyright, misuse, security incidents, governance",
            "论文与研究": "papers, academic research, arXiv, methods, experiments, datasets, evaluations",
            "观点与教程": "opinions, analysis, explainers, tutorials, guides, practices",
            "人物与动态": "people-focused interviews, speeches, career moves, public appearances",
        },
        "items": [
            {
                "id": item.id,
                "title_raw": item.title_raw,
                "summary_raw": item.summary_raw,
                "source": item.source_label,
                "language_hint": item.language_hint,
                "source_section_hint": item.section_hint,
            }
            for item in batch
        ],
        "output_schema": {
            "rewrites": [
                {
                    "id": "item id",
                    "title": "display title",
                    "summary": "display summary",
                    "section": "one allowed section",
                    "confidence": 0.0,
                    "flags": [],
                }
            ]
        },
    }
    return json.dumps(payload, ensure_ascii=False)


def _fallback(item: NewsItem) -> None:
    """Fill the display fields of *item* from its raw fields.

    An empty or missing raw summary is replaced by a fixed placeholder sentence.
    """
    item.title = item.title_raw
    item.summary = item.summary_raw or "该条目暂无摘要。"


def _is_transient_llm_error(exc: Exception) -> bool:
    """Return True for timeouts and for HTTP 429/500/502/503/504 responses."""
    if isinstance(exc, TimeoutError):
        return True
    if isinstance(exc, HTTPError):
        return exc.code in {429, 500, 502, 503, 504}
    return False


def _apply_rewrite_batch(batch: list[NewsItem], llm_call: RewriteLlmCall) -> tuple[int, int]:
    """Call the LLM for *batch* and write the returned rewrites onto the items.

    An entry is applied only when its ``id`` matches an item of the batch and
    both ``title`` and ``summary`` are non-empty after stripping. The section
    is overwritten only when the returned value is one of ``SECTION_ORDER``.

    Returns:
        ``(rewritten, sectioned)``: the number of distinct items that received
        a rewrite and the number of distinct items that received an LLM section.

    Raises:
        ValueError: if the ``rewrites`` value of the response is not a list.
        Exception: anything raised by ``llm_call`` or ``parse_json_object``, or
            by a malformed entry (for example an entry without ``.get``).
            Callers are expected to handle these.
    """
    obj = parse_json_object(llm_call(_build_prompt(batch)))
    rewrites = obj.get("rewrites", [])
    if not isinstance(rewrites, list):
        raise ValueError("rewrites is not a list")

    by_id = {item.id: item for item in batch}
    seen_ids: set[str] = set()
    # Tracked as a set so that duplicate entries for one id are counted once,
    # in line with how seen_ids de-duplicates the rewritten count.
    sectioned_ids: set[str] = set()
    for entry in rewrites:
        item_id = entry.get("id")
        title = str(entry.get("title") or "").strip()
        summary = str(entry.get("summary") or "").strip()
        if item_id not in by_id or not title or not summary:
            continue
        target = by_id[item_id]
        target.title = title
        target.summary = summary
        section = str(entry.get("section") or "").strip()
        if section in SECTION_ORDER:
            target.section = section
            sectioned_ids.add(item_id)
        seen_ids.add(item_id)
    return len(seen_ids), len(sectioned_ids)


def rewrite_items(
    items: list[NewsItem],
    *,
    llm_call: RewriteLlmCall,
    batch_size: int = 30,
    max_fallback_ratio: float = 0.2,
    retry_single_items: bool = False,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Rewrite *items* in place through the LLM and report the outcome.

    Items are processed in batches of ``batch_size`` (clamped to at least 1).
    When a batch call fails, every item of the batch falls back to its raw
    text, unless ``retry_single_items`` is set and the error is not transient;
    in that case each item is retried on its own. Items for which the LLM
    returned no usable rewrite also fall back and are counted as missing.

    Args:
        items: the news items to rewrite; they are mutated in place.
        llm_call: callable that maps a prompt string to the LLM response text.
        batch_size: maximum number of items per LLM call.
        max_fallback_ratio: share of fallback items above which the quality
            gate fails.
        retry_single_items: retry item by item after a non-transient batch
            failure.

    Returns:
        The same ``items`` list and a report dict with counters, collected
        error strings, and the ``quality_gate_failed`` flag.
    """
    rewritten_count = 0
    llm_section_count = 0
    fallback_count = 0
    missing_rewrite_count = 0
    errors: list[str] = []

    # Computed once; also used for the batch_count entry of the report.
    batches = _chunks(items, max(1, batch_size))
    for batch in batches:
        try:
            batch_rewritten_count, batch_section_count = _apply_rewrite_batch(batch, llm_call)
            rewritten_count += batch_rewritten_count
            llm_section_count += batch_section_count
            for item in batch:
                if item.title is None or item.summary is None:
                    errors.append(f"missing_rewrite_for_item: {item.id}")
                    _fallback(item)
                    fallback_count += 1
                    missing_rewrite_count += 1
        except Exception as exc:
            errors.append(f"batch:{type(exc).__name__}: {exc}")
            # Transient errors are not retried per item: the same failure
            # would most likely repeat for every single call.
            if _is_transient_llm_error(exc) or not retry_single_items:
                for item in batch:
                    _fallback(item)
                    fallback_count += 1
                continue
            for item in batch:
                try:
                    item_rewritten_count, item_section_count = _apply_rewrite_batch([item], llm_call)
                except Exception as item_exc:
                    errors.append(f"item:{item.id}:{type(item_exc).__name__}: {item_exc}")
                    _fallback(item)
                    fallback_count += 1
                    continue
                rewritten_count += item_rewritten_count
                llm_section_count += item_section_count
                if not item_rewritten_count:
                    # The call succeeded but carried no usable rewrite for
                    # this item; treat it like a missing rewrite in the batch
                    # path so the item gets display text and is counted.
                    errors.append(f"missing_rewrite_for_item: {item.id}")
                    _fallback(item)
                    fallback_count += 1
                    missing_rewrite_count += 1

    fallback_ratio = fallback_count / len(items) if items else 0
    blocking_errors: list[str] = []
    if fallback_ratio > max_fallback_ratio:
        blocking_errors.append("rewrite_fallback_ratio_exceeded")

    report = {
        "input_count": len(items),
        "rewritten_count": rewritten_count,
        "llm_section_count": llm_section_count,
        "fallback_count": fallback_count,
        "missing_rewrite_count": missing_rewrite_count,
        "fallback_ratio": round(fallback_ratio, 4),
        "batch_count": len(batches),
        "errors": errors,
        "blocking_errors": blocking_errors,
        "quality_gate_failed": bool(blocking_errors),
    }
    return items, report