Add Stage 2.8 recall, quality gate, retries, and publish idempotency
This commit is contained in:
162
ai_daily_report/candidate_recall.py
Normal file
162
ai_daily_report/candidate_recall.py
Normal file
@@ -0,0 +1,162 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
|
||||
from .dedupe import _jaccard_similarity, _title_tokens
|
||||
from .models import NewsItem
|
||||
|
||||
|
||||
DEFAULT_CONFIG = {
|
||||
"enabled": True,
|
||||
"max_pairs": 80,
|
||||
"max_pairs_per_item": 5,
|
||||
"title_similarity_threshold": 0.45,
|
||||
"title_jaccard_threshold": 0.25,
|
||||
"summary_jaccard_threshold": 0.18,
|
||||
"strong_entity_overlap_threshold": 2,
|
||||
}
|
||||
|
||||
STOP_ENTITIES = {
|
||||
"AI",
|
||||
"API",
|
||||
"CLI",
|
||||
"LLM",
|
||||
"Open Source",
|
||||
"GitHub",
|
||||
"Google",
|
||||
"OpenAI",
|
||||
"Anthropic",
|
||||
"Microsoft",
|
||||
"Meta",
|
||||
"Amazon",
|
||||
"NVIDIA",
|
||||
}
|
||||
|
||||
|
||||
def _config_value(config: dict[str, Any], name: str):
|
||||
return (config or {}).get(name, DEFAULT_CONFIG[name])
|
||||
|
||||
|
||||
def _text_tokens(value: str) -> set[str]:
|
||||
return _title_tokens(value)
|
||||
|
||||
|
||||
def _entity_tokens(value: str) -> set[str]:
|
||||
text = value or ""
|
||||
entities = set(re.findall(r"\b[A-Z][A-Za-z0-9]*(?:[- ][A-Z0-9][A-Za-z0-9]*)*\b", text))
|
||||
entities.update(re.findall(r"[\u4e00-\u9fffA-Za-z0-9]*[A-Za-z]+[0-9]+[A-Za-z0-9-]*", text))
|
||||
cleaned = {entity.strip() for entity in entities if len(entity.strip()) >= 3}
|
||||
return {entity for entity in cleaned if entity not in STOP_ENTITIES}
|
||||
|
||||
|
||||
def _pair_key(item_ids: list[str]) -> frozenset[str]:
|
||||
return frozenset(item_ids)
|
||||
|
||||
|
||||
def _candidate_score(left: NewsItem, right: NewsItem, config: dict[str, Any]) -> tuple[float, str, dict[str, Any]] | None:
|
||||
title_ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
|
||||
title_jaccard = _jaccard_similarity(_text_tokens(left.title_norm), _text_tokens(right.title_norm))
|
||||
summary_jaccard = _jaccard_similarity(_text_tokens(left.summary_raw), _text_tokens(right.summary_raw))
|
||||
left_entities = _entity_tokens(f"{left.title_raw} {left.summary_raw}")
|
||||
right_entities = _entity_tokens(f"{right.title_raw} {right.summary_raw}")
|
||||
shared_entities = sorted(left_entities & right_entities)
|
||||
strong_entity_threshold = int(_config_value(config, "strong_entity_overlap_threshold"))
|
||||
|
||||
if len(shared_entities) >= strong_entity_threshold and summary_jaccard > 0:
|
||||
score = min(1.0, 0.55 + len(shared_entities) * 0.1 + summary_jaccard * 0.35)
|
||||
return score, "strong_entity_overlap", {
|
||||
"shared_entities": shared_entities,
|
||||
"title_similarity": round(title_ratio, 3),
|
||||
"title_jaccard": round(title_jaccard, 3),
|
||||
"summary_jaccard": round(summary_jaccard, 3),
|
||||
}
|
||||
|
||||
if title_ratio >= float(_config_value(config, "title_similarity_threshold")) and (
|
||||
title_jaccard >= float(_config_value(config, "title_jaccard_threshold"))
|
||||
or summary_jaccard >= float(_config_value(config, "summary_jaccard_threshold")) * 2
|
||||
or shared_entities
|
||||
):
|
||||
return title_ratio, "title_similarity", {
|
||||
"title_similarity": round(title_ratio, 3),
|
||||
"title_jaccard": round(title_jaccard, 3),
|
||||
"summary_jaccard": round(summary_jaccard, 3),
|
||||
}
|
||||
|
||||
if (
|
||||
title_jaccard >= float(_config_value(config, "title_jaccard_threshold"))
|
||||
and summary_jaccard >= float(_config_value(config, "summary_jaccard_threshold"))
|
||||
):
|
||||
score = (title_jaccard + summary_jaccard) / 2
|
||||
return score, "title_summary_jaccard", {
|
||||
"title_similarity": round(title_ratio, 3),
|
||||
"title_jaccard": round(title_jaccard, 3),
|
||||
"summary_jaccard": round(summary_jaccard, 3),
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def recall_semantic_candidates(
|
||||
items: list[NewsItem],
|
||||
*,
|
||||
existing_candidates: list[dict[str, Any]] | None = None,
|
||||
config: dict[str, Any] | None = None,
|
||||
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
|
||||
config = {**DEFAULT_CONFIG, **(config or {})}
|
||||
existing_candidates = list(existing_candidates or [])
|
||||
if not bool(config.get("enabled", True)):
|
||||
return existing_candidates, {
|
||||
"enabled": False,
|
||||
"input_count": len(items),
|
||||
"existing_candidate_group_count": len(existing_candidates),
|
||||
"added_candidate_group_count": 0,
|
||||
"candidate_group_count": len(existing_candidates),
|
||||
"candidates": existing_candidates,
|
||||
}
|
||||
|
||||
existing_keys = {_pair_key(list(candidate.get("item_ids", []) or [])) for candidate in existing_candidates}
|
||||
pair_counts: defaultdict[str, int] = defaultdict(int)
|
||||
recalled: list[dict[str, Any]] = []
|
||||
|
||||
for index, left in enumerate(items):
|
||||
for right in items[index + 1 :]:
|
||||
if pair_counts[left.id] >= int(config["max_pairs_per_item"]):
|
||||
continue
|
||||
if pair_counts[right.id] >= int(config["max_pairs_per_item"]):
|
||||
continue
|
||||
key = frozenset({left.id, right.id})
|
||||
if key in existing_keys:
|
||||
continue
|
||||
scored = _candidate_score(left, right, config)
|
||||
if scored is None:
|
||||
continue
|
||||
score, reason, evidence = scored
|
||||
recalled.append(
|
||||
{
|
||||
"item_ids": [left.id, right.id],
|
||||
"reason": reason,
|
||||
"score": round(score, 3),
|
||||
"confidence": "medium",
|
||||
**evidence,
|
||||
}
|
||||
)
|
||||
pair_counts[left.id] += 1
|
||||
pair_counts[right.id] += 1
|
||||
if len(recalled) >= int(config["max_pairs"]):
|
||||
break
|
||||
if len(recalled) >= int(config["max_pairs"]):
|
||||
break
|
||||
|
||||
candidates = existing_candidates + recalled
|
||||
report = {
|
||||
"enabled": True,
|
||||
"input_count": len(items),
|
||||
"existing_candidate_group_count": len(existing_candidates),
|
||||
"added_candidate_group_count": len(recalled),
|
||||
"candidate_group_count": len(candidates),
|
||||
"candidates": candidates,
|
||||
}
|
||||
return candidates, report
|
||||
@@ -1,6 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import socket
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from urllib.error import HTTPError, URLError
|
||||
import urllib.request
|
||||
from typing import Any
|
||||
|
||||
@@ -8,10 +12,61 @@ from typing import Any
|
||||
UA = "Mozilla/5.0 (compatible; ai-daily-report/1.0)"
|
||||
|
||||
|
||||
def fetch_text(url: str, timeout_seconds: int) -> str:
|
||||
@dataclass
|
||||
class FetchTextError(Exception):
|
||||
error_type: str
|
||||
message: str
|
||||
http_status: int | None = None
|
||||
attempts: int = 1
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.message
|
||||
|
||||
|
||||
def _classify_fetch_exception(exc: Exception) -> tuple[str, int | None, bool]:
|
||||
if isinstance(exc, HTTPError):
|
||||
if exc.code == 404:
|
||||
return "http_404", exc.code, False
|
||||
if exc.code in {429, 500, 502, 503, 504}:
|
||||
return f"http_{exc.code}", exc.code, True
|
||||
return f"http_{exc.code}", exc.code, False
|
||||
if isinstance(exc, TimeoutError | socket.timeout):
|
||||
return "timeout", None, True
|
||||
if isinstance(exc, URLError):
|
||||
reason = exc.reason
|
||||
if isinstance(reason, TimeoutError | socket.timeout):
|
||||
return "timeout", None, True
|
||||
return "network_error", None, True
|
||||
return "fetch_error", None, False
|
||||
|
||||
|
||||
def fetch_text(
|
||||
url: str,
|
||||
timeout_seconds: int,
|
||||
*,
|
||||
retries: int = 0,
|
||||
backoff_seconds: float = 0.5,
|
||||
) -> str:
|
||||
req = urllib.request.Request(url, headers={"User-Agent": UA})
|
||||
with urllib.request.urlopen(req, timeout=timeout_seconds) as response:
|
||||
return response.read().decode("utf-8", "ignore")
|
||||
attempts = max(1, retries + 1)
|
||||
last_error: FetchTextError | None = None
|
||||
for attempt in range(1, attempts + 1):
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=timeout_seconds) as response:
|
||||
return response.read().decode("utf-8", "ignore")
|
||||
except Exception as exc:
|
||||
error_type, http_status, retryable = _classify_fetch_exception(exc)
|
||||
last_error = FetchTextError(
|
||||
error_type=error_type,
|
||||
message=f"{type(exc).__name__}: {exc}",
|
||||
http_status=http_status,
|
||||
attempts=attempt,
|
||||
)
|
||||
if not retryable or attempt >= attempts:
|
||||
raise last_error from exc
|
||||
if backoff_seconds > 0:
|
||||
time.sleep(backoff_seconds * (2 ** (attempt - 1)))
|
||||
raise last_error or FetchTextError("fetch_error", "unknown fetch error", attempts=attempts)
|
||||
|
||||
|
||||
class OpenAICompatibleClient:
|
||||
@@ -60,5 +115,17 @@ class BlogApiClient:
|
||||
def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
|
||||
return self._request("POST", "/api/service/posts", payload)
|
||||
|
||||
def get_post_by_slug(self, slug: str) -> dict[str, Any] | None:
|
||||
try:
|
||||
return self._request("GET", f"/api/service/posts/{slug}")
|
||||
except HTTPError as exc:
|
||||
if exc.code == 404:
|
||||
return None
|
||||
raise
|
||||
except FetchTextError as exc:
|
||||
if exc.error_type == "http_404":
|
||||
return None
|
||||
raise
|
||||
|
||||
def publish_post(self, slug: str) -> None:
|
||||
self._request("POST", f"/api/service/posts/{slug}/publish")
|
||||
|
||||
@@ -5,6 +5,7 @@ from datetime import datetime, timezone
|
||||
from time import perf_counter
|
||||
from typing import Callable, Iterable, Any
|
||||
|
||||
from .clients import FetchTextError
|
||||
from .models import SourceConfig, SourceResult
|
||||
|
||||
|
||||
@@ -12,11 +13,19 @@ Fetcher = Callable[[SourceConfig, str], list[dict[str, Any]]]
|
||||
|
||||
|
||||
def _status_from_exception(exc: Exception) -> str:
|
||||
if isinstance(exc, FetchTextError):
|
||||
return exc.error_type
|
||||
if isinstance(exc, TimeoutError):
|
||||
return "timeout"
|
||||
return "error"
|
||||
|
||||
|
||||
def _retry_count_from_exception(exc: Exception) -> int:
|
||||
if isinstance(exc, FetchTextError):
|
||||
return max(0, exc.attempts - 1)
|
||||
return 0
|
||||
|
||||
|
||||
def _collect_one(config: SourceConfig, run_date: str, fetcher: Fetcher) -> SourceResult:
|
||||
fetched_at = datetime.now(timezone.utc).isoformat()
|
||||
if not config.enabled:
|
||||
@@ -51,6 +60,7 @@ def _collect_one(config: SourceConfig, run_date: str, fetcher: Fetcher) -> Sourc
|
||||
status=_status_from_exception(exc),
|
||||
error=f"{type(exc).__name__}: {exc}",
|
||||
elapsed_ms=elapsed_ms,
|
||||
retry_count=_retry_count_from_exception(exc),
|
||||
fetched_at=fetched_at,
|
||||
)
|
||||
|
||||
@@ -91,5 +101,10 @@ def collect_sources(
|
||||
"raw_item_count": sum(len(result.items) for result in results),
|
||||
"source_counts": {result.source: len(result.items) for result in results},
|
||||
"statuses": {result.source: result.status for result in results},
|
||||
"error_types": {
|
||||
result.source: result.status
|
||||
for result in results
|
||||
if not result.ok and result.status != "disabled"
|
||||
},
|
||||
}
|
||||
return results, report
|
||||
|
||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
from typing import Any
|
||||
|
||||
from .assemble import assemble_markdown
|
||||
from .candidate_recall import recall_semantic_candidates
|
||||
from .classify import classify_and_order_items
|
||||
from .collect import Fetcher, collect_sources
|
||||
from .dedupe import cross_day_dedup_items, hard_dedup_items
|
||||
@@ -10,6 +11,7 @@ from .guide import GuideLlmCall, generate_guide
|
||||
from .models import PublishedUrls, SourceConfig
|
||||
from .normalize import normalize_items
|
||||
from .publish import BlogClient, publish_markdown
|
||||
from .quality_gate import evaluate_quality_gate
|
||||
from .rewrite import RewriteLlmCall, rewrite_items
|
||||
from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
|
||||
|
||||
@@ -49,6 +51,11 @@ def run_stage0_to_stage2(
|
||||
source_priorities=source_priorities,
|
||||
)
|
||||
deduped_items, stage2_report = hard_dedup_items(normalized_items)
|
||||
artifacts = {
|
||||
"stage0_sources": source_results,
|
||||
"stage1_items": normalized_items,
|
||||
"stage2_items": deduped_items,
|
||||
}
|
||||
return {
|
||||
"source_results": source_results,
|
||||
"items": deduped_items,
|
||||
@@ -57,6 +64,7 @@ def run_stage0_to_stage2(
|
||||
"stage1": stage1_report,
|
||||
"stage2": stage2_report,
|
||||
},
|
||||
"artifacts": artifacts,
|
||||
}
|
||||
|
||||
|
||||
@@ -90,10 +98,13 @@ def run_stage0_to_stage2_5(
|
||||
reports = dict(stage2_result["reports"])
|
||||
stage2_5_report.setdefault("enabled", cross_day_dedup_enabled)
|
||||
reports["stage2_5"] = stage2_5_report
|
||||
artifacts = dict(stage2_result.get("artifacts", {}))
|
||||
artifacts["stage2_5_items"] = items
|
||||
return {
|
||||
"source_results": stage2_result["source_results"],
|
||||
"items": items,
|
||||
"reports": reports,
|
||||
"artifacts": artifacts,
|
||||
}
|
||||
|
||||
|
||||
@@ -107,6 +118,10 @@ def run_stage0_to_stage4(
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
semantic_dedup_max_deletion_ratio: float = 0.5,
|
||||
rewrite_batch_size: int = 30,
|
||||
semantic_candidate_recall_config: dict[str, Any] | None = None,
|
||||
quality_gate_config: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
stage2_5_result = run_stage0_to_stage2_5(
|
||||
source_configs,
|
||||
@@ -123,22 +138,35 @@ def run_stage0_to_stage4(
|
||||
for candidate in stage2_5_result["reports"]["stage2"].get("possible_duplicates", [])
|
||||
if set(candidate.get("item_ids", [])).issubset(remaining_ids)
|
||||
]
|
||||
candidates, stage2_8_report = recall_semantic_candidates(
|
||||
items,
|
||||
existing_candidates=candidates,
|
||||
config=semantic_candidate_recall_config,
|
||||
)
|
||||
semantic_items, stage3_report = semantic_dedup_items(
|
||||
items,
|
||||
candidates,
|
||||
llm_call=semantic_llm_call,
|
||||
max_deletion_ratio=semantic_dedup_max_deletion_ratio,
|
||||
)
|
||||
rewritten_items, stage4_report = rewrite_items(
|
||||
semantic_items,
|
||||
llm_call=rewrite_llm_call,
|
||||
batch_size=rewrite_batch_size,
|
||||
)
|
||||
reports = dict(stage2_5_result["reports"])
|
||||
reports["stage2_8"] = stage2_8_report
|
||||
reports["stage3"] = stage3_report
|
||||
reports["stage4"] = stage4_report
|
||||
artifacts = dict(stage2_5_result.get("artifacts", {}))
|
||||
artifacts["stage2_8_candidates"] = candidates
|
||||
artifacts["stage3_items"] = semantic_items
|
||||
artifacts["stage4_items"] = rewritten_items
|
||||
return {
|
||||
"source_results": stage2_5_result["source_results"],
|
||||
"items": rewritten_items,
|
||||
"reports": reports,
|
||||
"artifacts": artifacts,
|
||||
}
|
||||
|
||||
|
||||
@@ -152,6 +180,10 @@ def run_stage0_to_stage5(
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
semantic_dedup_max_deletion_ratio: float = 0.5,
|
||||
rewrite_batch_size: int = 30,
|
||||
semantic_candidate_recall_config: dict[str, Any] | None = None,
|
||||
quality_gate_config: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
stage4_result = run_stage0_to_stage4(
|
||||
source_configs,
|
||||
@@ -162,6 +194,9 @@ def run_stage0_to_stage5(
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
|
||||
rewrite_batch_size=rewrite_batch_size,
|
||||
semantic_candidate_recall_config=semantic_candidate_recall_config,
|
||||
)
|
||||
classified_items, stage5_report = classify_and_order_items(stage4_result["items"])
|
||||
reports = dict(stage4_result["reports"])
|
||||
@@ -170,6 +205,7 @@ def run_stage0_to_stage5(
|
||||
"source_results": stage4_result["source_results"],
|
||||
"items": classified_items,
|
||||
"reports": reports,
|
||||
"artifacts": stage4_result.get("artifacts", {}),
|
||||
}
|
||||
|
||||
|
||||
@@ -184,6 +220,9 @@ def run_stage0_to_stage6(
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
semantic_dedup_max_deletion_ratio: float = 0.5,
|
||||
rewrite_batch_size: int = 30,
|
||||
semantic_candidate_recall_config: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
stage5_result = run_stage0_to_stage5(
|
||||
source_configs,
|
||||
@@ -194,6 +233,9 @@ def run_stage0_to_stage6(
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
|
||||
rewrite_batch_size=rewrite_batch_size,
|
||||
semantic_candidate_recall_config=semantic_candidate_recall_config,
|
||||
)
|
||||
guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call)
|
||||
reports = dict(stage5_result["reports"])
|
||||
@@ -203,6 +245,7 @@ def run_stage0_to_stage6(
|
||||
"items": stage5_result["items"],
|
||||
"guide": guide,
|
||||
"reports": reports,
|
||||
"artifacts": stage5_result.get("artifacts", {}),
|
||||
}
|
||||
|
||||
|
||||
@@ -217,6 +260,10 @@ def run_stage0_to_stage7(
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
semantic_dedup_max_deletion_ratio: float = 0.5,
|
||||
rewrite_batch_size: int = 30,
|
||||
semantic_candidate_recall_config: dict[str, Any] | None = None,
|
||||
quality_gate_config: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
stage6_result = run_stage0_to_stage6(
|
||||
source_configs,
|
||||
@@ -228,6 +275,9 @@ def run_stage0_to_stage7(
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
|
||||
rewrite_batch_size=rewrite_batch_size,
|
||||
semantic_candidate_recall_config=semantic_candidate_recall_config,
|
||||
)
|
||||
markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"])
|
||||
upstream_blocking_errors: list[str] = []
|
||||
@@ -238,13 +288,26 @@ def run_stage0_to_stage7(
|
||||
existing_errors = list(stage7_report.get("blocking_errors", []) or [])
|
||||
stage7_report["blocking_errors"] = existing_errors + upstream_blocking_errors
|
||||
reports = dict(stage6_result["reports"])
|
||||
quality_gate_report = evaluate_quality_gate(
|
||||
stage6_result["items"],
|
||||
source_results=stage6_result["source_results"],
|
||||
reports=reports,
|
||||
config=quality_gate_config,
|
||||
)
|
||||
if quality_gate_report.get("blocking_errors"):
|
||||
existing_errors = list(stage7_report.get("blocking_errors", []) or [])
|
||||
stage7_report["blocking_errors"] = existing_errors + list(quality_gate_report["blocking_errors"])
|
||||
reports["quality_gate"] = quality_gate_report
|
||||
reports["stage7"] = stage7_report
|
||||
artifacts = dict(stage6_result.get("artifacts", {}))
|
||||
artifacts["quality_gate"] = quality_gate_report
|
||||
return {
|
||||
"source_results": stage6_result["source_results"],
|
||||
"items": stage6_result["items"],
|
||||
"guide": stage6_result["guide"],
|
||||
"markdown": markdown,
|
||||
"reports": reports,
|
||||
"artifacts": artifacts,
|
||||
}
|
||||
|
||||
|
||||
@@ -262,6 +325,11 @@ def run_stage0_to_stage8(
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
semantic_dedup_max_deletion_ratio: float = 0.5,
|
||||
rewrite_batch_size: int = 30,
|
||||
semantic_candidate_recall_config: dict[str, Any] | None = None,
|
||||
quality_gate_config: dict[str, Any] | None = None,
|
||||
publish_idempotency_config: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
stage7_result = run_stage0_to_stage7(
|
||||
source_configs,
|
||||
@@ -273,6 +341,10 @@ def run_stage0_to_stage8(
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
|
||||
rewrite_batch_size=rewrite_batch_size,
|
||||
semantic_candidate_recall_config=semantic_candidate_recall_config,
|
||||
quality_gate_config=quality_gate_config,
|
||||
)
|
||||
slug = f"ai-{run_date}"
|
||||
publish_result = publish_markdown(
|
||||
@@ -284,6 +356,7 @@ def run_stage0_to_stage8(
|
||||
mode=mode,
|
||||
markdown_report=stage7_result["reports"]["stage7"],
|
||||
client=client,
|
||||
idempotency_config=publish_idempotency_config,
|
||||
)
|
||||
reports = dict(stage7_result["reports"])
|
||||
reports["stage8"] = {
|
||||
@@ -301,4 +374,5 @@ def run_stage0_to_stage8(
|
||||
"markdown": stage7_result["markdown"],
|
||||
"publish": publish_result,
|
||||
"reports": reports,
|
||||
"artifacts": stage7_result.get("artifacts", {}),
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import hashlib
|
||||
from dataclasses import dataclass
|
||||
from datetime import date, datetime, timezone
|
||||
from pathlib import Path
|
||||
@@ -20,6 +21,9 @@ class PublishResult:
|
||||
|
||||
|
||||
class BlogClient(Protocol):
|
||||
def get_post_by_slug(self, slug: str) -> dict[str, Any] | None:
|
||||
...
|
||||
|
||||
def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
|
||||
...
|
||||
|
||||
@@ -153,6 +157,18 @@ def dry_run_publish(slug: str, base_url: str) -> PublishResult:
|
||||
)
|
||||
|
||||
|
||||
def _content_hash(value: str) -> str:
|
||||
return hashlib.sha256((value or "").encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def _get_existing_post(client: BlogClient, slug: str) -> dict[str, Any] | None:
|
||||
getter = getattr(client, "get_post_by_slug", None)
|
||||
if getter is None:
|
||||
return None
|
||||
existing = getter(slug)
|
||||
return existing if isinstance(existing, dict) else None
|
||||
|
||||
|
||||
def publish_markdown(
|
||||
*,
|
||||
title: str,
|
||||
@@ -163,6 +179,7 @@ def publish_markdown(
|
||||
mode: str,
|
||||
markdown_report: dict[str, Any],
|
||||
client: BlogClient | None,
|
||||
idempotency_config: dict[str, Any] | None = None,
|
||||
) -> PublishResult:
|
||||
blocking_errors = markdown_report.get("blocking_errors", []) or []
|
||||
blog_url = f"{base_url.rstrip('/')}/posts/{slug}"
|
||||
@@ -187,6 +204,39 @@ def publish_markdown(
|
||||
error="missing_blog_client",
|
||||
)
|
||||
|
||||
idempotency_config = idempotency_config or {}
|
||||
if bool(idempotency_config.get("enabled", False)):
|
||||
try:
|
||||
existing_post = _get_existing_post(client, slug)
|
||||
except Exception as exc:
|
||||
return PublishResult(
|
||||
mode=mode,
|
||||
status="failed",
|
||||
slug=slug,
|
||||
blog_url=blog_url,
|
||||
public_ok=False,
|
||||
error=f"idempotency_check_failed:{type(exc).__name__}: {exc}",
|
||||
)
|
||||
if existing_post is not None:
|
||||
existing_content = str(existing_post.get("content") or existing_post.get("markdown") or "")
|
||||
if _content_hash(existing_content) == _content_hash(markdown):
|
||||
return PublishResult(
|
||||
mode=mode,
|
||||
status="already_published",
|
||||
slug=slug,
|
||||
blog_url=blog_url,
|
||||
public_ok=True,
|
||||
)
|
||||
if not bool(idempotency_config.get("allow_republish", False)):
|
||||
return PublishResult(
|
||||
mode=mode,
|
||||
status="blocked",
|
||||
slug=slug,
|
||||
blog_url=blog_url,
|
||||
public_ok=False,
|
||||
error="slug_already_exists",
|
||||
)
|
||||
|
||||
payload = {"title": title, "content": markdown, "tags": tags, "slug": slug}
|
||||
try:
|
||||
create_resp = client.create_post(payload)
|
||||
|
||||
91
ai_daily_report/quality_gate.py
Normal file
91
ai_daily_report/quality_gate.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
from typing import Any
|
||||
|
||||
from .dedupe import _title_tokens
|
||||
from .models import NewsItem, SourceResult
|
||||
|
||||
|
||||
DEFAULT_CONFIG = {
|
||||
"block_on_required_source_failure": True,
|
||||
"warn_on_enabled_source_failure": True,
|
||||
"warn_when_stage3_candidates_zero_min_items": 30,
|
||||
"warn_on_final_title_similarity": 0.55,
|
||||
"warn_on_entity_frequency": 3,
|
||||
"required_sources": [],
|
||||
}
|
||||
|
||||
|
||||
def _config(config: dict[str, Any] | None) -> dict[str, Any]:
|
||||
return {**DEFAULT_CONFIG, **(config or {})}
|
||||
|
||||
|
||||
def _source_failures(source_results: list[SourceResult]) -> list[dict[str, Any]]:
|
||||
failures: list[dict[str, Any]] = []
|
||||
for result in source_results:
|
||||
if result.ok or result.status == "disabled":
|
||||
continue
|
||||
failures.append(
|
||||
{
|
||||
"source": result.source,
|
||||
"role": result.role,
|
||||
"status": result.status,
|
||||
"error": result.error,
|
||||
}
|
||||
)
|
||||
return failures
|
||||
|
||||
|
||||
def _similar_title_warnings(items: list[NewsItem], threshold: float) -> list[str]:
|
||||
warnings: list[str] = []
|
||||
for index, left in enumerate(items):
|
||||
left_title = left.title or left.title_raw
|
||||
for right in items[index + 1 :]:
|
||||
right_title = right.title or right.title_raw
|
||||
if len(_title_tokens(left_title)) < 2 or len(_title_tokens(right_title)) < 2:
|
||||
continue
|
||||
ratio = difflib.SequenceMatcher(None, left_title.lower(), right_title.lower()).ratio()
|
||||
if ratio >= threshold:
|
||||
warnings.append(f"final_title_similarity:{left.id}:{right.id}:{ratio:.3f}")
|
||||
return warnings
|
||||
|
||||
|
||||
def evaluate_quality_gate(
|
||||
items: list[NewsItem],
|
||||
*,
|
||||
source_results: list[SourceResult],
|
||||
reports: dict[str, Any],
|
||||
config: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
config = _config(config)
|
||||
warnings: list[str] = []
|
||||
blocking_errors: list[str] = []
|
||||
|
||||
stage3_report = reports.get("stage3", {}) or {}
|
||||
min_items = int(config["warn_when_stage3_candidates_zero_min_items"])
|
||||
if len(items) > min_items and int(stage3_report.get("candidate_group_count", 0)) == 0:
|
||||
warnings.append("stage3_candidates_zero")
|
||||
|
||||
failures = _source_failures(source_results)
|
||||
if bool(config["warn_on_enabled_source_failure"]):
|
||||
for failure in failures:
|
||||
warnings.append(f"enabled_source_failed:{failure['source']}:{failure['status']}")
|
||||
|
||||
required_sources = set(config.get("required_sources") or [])
|
||||
if bool(config["block_on_required_source_failure"]):
|
||||
for failure in failures:
|
||||
if failure["source"] in required_sources:
|
||||
blocking_errors.append(f"required_source_failed:{failure['source']}:{failure['status']}")
|
||||
|
||||
title_threshold = float(config["warn_on_final_title_similarity"])
|
||||
if title_threshold > 0:
|
||||
warnings.extend(_similar_title_warnings(items, title_threshold))
|
||||
|
||||
return {
|
||||
"input_count": len(items),
|
||||
"warnings": warnings,
|
||||
"blocking_errors": blocking_errors,
|
||||
"source_failures": failures,
|
||||
"quality_gate_failed": bool(blocking_errors),
|
||||
}
|
||||
@@ -104,6 +104,11 @@ def run_daily_report(
|
||||
cross_day_config = pipeline_config.get("cross_day_dedup", {}) or {}
|
||||
cross_day_enabled = bool(cross_day_config.get("enabled", True))
|
||||
cross_day_max_age_days = int(cross_day_config.get("max_age_days", 7))
|
||||
semantic_dedup_max_deletion_ratio = float(pipeline_config.get("semantic_dedup_max_deletion_ratio", 0.5))
|
||||
rewrite_batch_size = int(pipeline_config.get("rewrite_batch_size", 30))
|
||||
semantic_candidate_recall_config = pipeline_config.get("semantic_candidate_recall", {}) or {}
|
||||
quality_gate_config = pipeline_config.get("quality_gate", {}) or {}
|
||||
publish_idempotency_config = pipeline_config.get("publish_idempotency", {}) or {}
|
||||
configured_history_path = history_path or Path(
|
||||
str(cross_day_config.get("history_path") or "~/.hermes/scripts/ai_morning_out/published_urls.json")
|
||||
).expanduser()
|
||||
@@ -119,7 +124,13 @@ def run_daily_report(
|
||||
|
||||
def fetcher(config: SourceConfig, current_date: str) -> list[dict[str, Any]]:
|
||||
source_fetcher = get_source_fetcher(config.type)
|
||||
return source_fetcher(config, current_date, fetch_text)
|
||||
def configured_fetch_text(url: str, timeout_seconds: int) -> str:
|
||||
try:
|
||||
return fetch_text(url, timeout_seconds, retries=config.retries)
|
||||
except TypeError:
|
||||
return fetch_text(url, timeout_seconds)
|
||||
|
||||
return source_fetcher(config, current_date, configured_fetch_text)
|
||||
|
||||
else:
|
||||
raise ValueError("source_mode must be 'mock' or 'live'")
|
||||
@@ -156,6 +167,11 @@ def run_daily_report(
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_max_age_days,
|
||||
semantic_dedup_max_deletion_ratio=semantic_dedup_max_deletion_ratio,
|
||||
rewrite_batch_size=rewrite_batch_size,
|
||||
semantic_candidate_recall_config=semantic_candidate_recall_config,
|
||||
quality_gate_config=quality_gate_config,
|
||||
publish_idempotency_config=publish_idempotency_config,
|
||||
)
|
||||
|
||||
if cross_day_enabled and result["publish"].mode == "publish" and result["publish"].status == "ok":
|
||||
@@ -173,9 +189,15 @@ def run_daily_report(
|
||||
json.dumps(result["reports"], ensure_ascii=False, indent=2, default=_json_default),
|
||||
encoding="utf-8",
|
||||
)
|
||||
for artifact_name, artifact_value in result.get("artifacts", {}).items():
|
||||
(run_dir / f"{artifact_name}.json").write_text(
|
||||
json.dumps(artifact_value, ensure_ascii=False, indent=2, default=_json_default),
|
||||
encoding="utf-8",
|
||||
)
|
||||
return {
|
||||
"run_dir": str(run_dir),
|
||||
"markdown": result["markdown"],
|
||||
"reports": result["reports"],
|
||||
"publish": result["publish"],
|
||||
"artifacts": result.get("artifacts", {}),
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user