Files
ai-daily-report/ai_daily_report/publish.py

262 lines
7.8 KiB
Python

from __future__ import annotations
import json
import hashlib
from dataclasses import dataclass
from datetime import date, datetime, timezone
from pathlib import Path
from typing import Any, Protocol
from .models import NewsItem, PublishedUrlEntry, PublishedUrls
@dataclass
class PublishResult:
    """Outcome of a single publish attempt, successful or not."""

    # Mode the attempt ran under; this module special-cases "dry-run" and "publish".
    mode: str
    # Result label; this module emits "ok", "blocked", "failed" and "already_published".
    status: str
    # Slug the result refers to (the client-reported slug after a successful create).
    slug: str
    # URL built as "<base_url>/posts/<slug>".
    blog_url: str
    # True when the module considers the post publicly reachable.
    public_ok: bool = False
    # Failure or blocking reason; None when there is nothing to report.
    error: str | None = None
class BlogClient(Protocol):
    """Structural interface of the blog backend used by ``publish_markdown``."""

    def get_post_by_slug(self, slug: str) -> dict[str, Any] | None:
        """Return the post stored under *slug*, or ``None`` when there is none."""

    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Create a post from *payload* and return the backend's response."""

    def publish_post(self, slug: str) -> None:
        """Publish the post identified by *slug*."""
def _parse_date(value: str | None) -> date | None:
    """Parse *value* into a ``date``, returning ``None`` when it cannot be read.

    Two readings are attempted in order: the first ten characters as an ISO
    calendar date, then the whole (stripped) string as an ISO datetime.
    """
    if not value:
        return None

    text = value.strip()
    # Ordered from cheapest/most common to the full-datetime fallback.
    attempts = (
        lambda: date.fromisoformat(text[:10]),
        lambda: datetime.fromisoformat(text).date(),
    )
    for attempt in attempts:
        try:
            return attempt()
        except ValueError:
            continue
    return None
def _published_entry_from_dict(value: Any) -> PublishedUrlEntry | None:
    """Build a ``PublishedUrlEntry`` from one decoded history record.

    Args:
        value: The decoded JSON value stored for a URL; anything other than a
            dict is rejected.

    Returns:
        The entry, or ``None`` when *value* is not a dict or carries neither
        a ``first_seen`` nor a ``last_published`` value. A missing
        ``last_published`` falls back to ``first_seen`` and vice versa.
    """
    if not isinstance(value, dict):
        return None

    first_seen = str(value.get("first_seen") or "")
    last_published = str(value.get("last_published") or first_seen)
    if not first_seen and not last_published:
        return None

    # A hand-edited or corrupted file may hold a non-list here. Iterating an
    # int would raise and iterating a string would yield single characters,
    # so anything that is not a list/tuple is treated as "no titles".
    raw_titles = value.get("titles")
    if not isinstance(raw_titles, (list, tuple)):
        raw_titles = []
    # Skip JSON nulls explicitly; str(None) would otherwise become the title "None".
    titles = [
        str(title)
        for title in raw_titles
        if title is not None and str(title)
    ]

    return PublishedUrlEntry(
        first_seen=first_seen or last_published,
        last_published=last_published or first_seen,
        titles=titles,
    )
def load_published_urls(path: Path) -> PublishedUrls:
    """Load the published-URL history stored as JSON at *path*.

    Loading is best-effort: a missing, unreadable or malformed file yields an
    empty ``PublishedUrls()`` instead of raising, and individual records that
    cannot be converted are skipped.

    Args:
        path: Location of the JSON history file.

    Returns:
        The decoded history, or an empty one when nothing usable was found.
    """
    if not path.exists():
        return PublishedUrls()
    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        # Deliberately broad: any read/decode problem means "start fresh".
        return PublishedUrls()
    if not isinstance(raw, dict):
        return PublishedUrls()

    # "urls" must be a mapping; a list or scalar has no .items() and is
    # treated like an empty history rather than crashing the run.
    raw_urls = raw.get("urls")
    if not isinstance(raw_urls, dict):
        raw_urls = {}

    urls: dict[str, PublishedUrlEntry] = {}
    for canonical_url, value in raw_urls.items():
        if not canonical_url:
            continue
        entry = _published_entry_from_dict(value)
        if entry is not None:
            urls[str(canonical_url)] = entry

    # A non-numeric "version" falls back to 1, the same default used when the
    # key is absent.
    try:
        version = int(raw.get("version") or 1)
    except (TypeError, ValueError, OverflowError):
        version = 1

    return PublishedUrls(
        version=version,
        urls=urls,
        updated_at=str(raw.get("updated_at") or ""),
    )
def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
    """Tell whether *entry* is recent enough, relative to *run_date*, to keep.

    A negative *max_age_days* disables the age check. When either the run
    date or the entry's date cannot be parsed the entry is kept.
    """
    if max_age_days < 0:
        return True

    today = _parse_date(run_date)
    # Prefer the most recent publication date; fall back to the first sighting.
    last_seen = _parse_date(entry.last_published) or _parse_date(entry.first_seen)

    if today is not None and last_seen is not None:
        age = today - last_seen
        return age.days <= max_age_days
    return True
def _published_urls_to_dict(history: PublishedUrls) -> dict[str, Any]:
    """Convert *history* into a plain dict ready for JSON serialization.

    URLs are emitted in sorted order.
    """
    serialized_urls: dict[str, Any] = {}
    for canonical_url, entry in sorted(history.urls.items()):
        serialized_urls[canonical_url] = {
            "first_seen": entry.first_seen,
            "last_published": entry.last_published,
            "titles": entry.titles,
        }

    return {
        "version": history.version,
        "urls": serialized_urls,
        "updated_at": history.updated_at,
    }
def update_published_urls(
    path: Path,
    items: list[NewsItem],
    *,
    run_date: str,
    max_age_days: int = 7,
) -> PublishedUrls:
    """Record *items* as published on *run_date* and rewrite the history file.

    Entries that fall outside the *max_age_days* window are dropped first,
    then every item with a canonical URL is added or refreshed. The result is
    written to *path* as JSON (parent directories are created as needed).

    Returns:
        The updated history that was written to disk.
    """
    history = load_published_urls(path)

    # Prune stale entries before recording the new run.
    retained: dict[str, PublishedUrlEntry] = {}
    for known_url, known_entry in history.urls.items():
        if _entry_within_window(known_entry, run_date=run_date, max_age_days=max_age_days):
            retained[known_url] = known_entry
    history.urls = retained

    for item in items:
        if not item.canonical_url:
            continue
        title = item.title or item.title_raw
        entry = history.urls.get(item.canonical_url)
        if entry is None:
            entry = PublishedUrlEntry(
                first_seen=run_date,
                last_published=run_date,
                titles=[],
            )
            history.urls[item.canonical_url] = entry
        entry.last_published = run_date
        # Keep each distinct title once, in the order it was first seen.
        if title and title not in entry.titles:
            entry.titles.append(title)

    history.updated_at = datetime.now(timezone.utc).isoformat()

    serialized = json.dumps(
        _published_urls_to_dict(history),
        ensure_ascii=False,
        indent=2,
    )
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(serialized, encoding="utf-8")
    return history
def dry_run_publish(slug: str, base_url: str) -> PublishResult:
    """Return the successful result a dry run reports, without contacting any backend."""
    site_root = base_url.rstrip("/")
    return PublishResult(
        mode="dry-run",
        status="ok",
        slug=slug,
        blog_url=f"{site_root}/posts/{slug}",
        public_ok=True,
    )
def _content_hash(value: str) -> str:
    """Return the SHA-256 hex digest of *value* encoded as UTF-8.

    A falsy *value* (``None`` or ``""``) hashes as the empty string.
    """
    text = value if value else ""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
def _get_existing_post(client: BlogClient, slug: str) -> dict[str, Any] | None:
    """Look up the post stored under *slug*, tolerating clients without a lookup method.

    Returns the client's answer only when it is a dict; a client lacking
    ``get_post_by_slug`` or returning anything else yields ``None``.
    """
    lookup = getattr(client, "get_post_by_slug", None)
    if lookup is None:
        return None

    post = lookup(slug)
    if isinstance(post, dict):
        return post
    return None
def publish_markdown(
    *,
    title: str,
    markdown: str,
    tags: list[str],
    slug: str,
    base_url: str,
    mode: str,
    markdown_report: dict[str, Any],
    client: BlogClient | None,
    idempotency_config: dict[str, Any] | None = None,
) -> PublishResult:
    """Publish a markdown post through *client* and describe what happened.

    The checks run in this order; the first one that settles the outcome returns:

    1. ``markdown_report["blocking_errors"]`` is non-empty -> ``blocked``.
    2. ``mode == "dry-run"`` -> the result of ``dry_run_publish``.
    3. *client* is ``None`` -> ``failed`` (``missing_blog_client``).
    4. If ``idempotency_config["enabled"]`` is truthy, an existing post with
       the same slug yields ``already_published`` when its content hash
       matches, or ``blocked`` (``slug_already_exists``) unless
       ``allow_republish`` is truthy.
    5. The post is created; ``publish_post`` is called only when
       ``mode == "publish"``.

    Exceptions raised by the client are caught and reported as a ``failed``
    result rather than propagated.
    """
    blockers = markdown_report.get("blocking_errors", []) or []
    blog_url = f"{base_url.rstrip('/')}/posts/{slug}"

    def outcome(status: str, *, public_ok: bool = False, error: str | None = None) -> PublishResult:
        # Every non-success result refers to the requested slug and its URL.
        return PublishResult(
            mode=mode,
            status=status,
            slug=slug,
            blog_url=blog_url,
            public_ok=public_ok,
            error=error,
        )

    if blockers:
        return outcome("blocked", error=";".join(blockers))
    if mode == "dry-run":
        return dry_run_publish(slug, base_url)
    if client is None:
        return outcome("failed", error="missing_blog_client")

    settings = idempotency_config or {}
    if settings.get("enabled", False):
        try:
            existing = _get_existing_post(client, slug)
        except Exception as exc:
            return outcome(
                "failed",
                error=f"idempotency_check_failed:{type(exc).__name__}: {exc}",
            )
        if existing is not None:
            stored_body = str(existing.get("content") or existing.get("markdown") or "")
            # Identical content means this exact post is already there.
            if _content_hash(stored_body) == _content_hash(markdown):
                return outcome("already_published", public_ok=True)
            if not settings.get("allow_republish", False):
                return outcome("blocked", error="slug_already_exists")

    try:
        response = client.create_post(
            {"title": title, "content": markdown, "tags": tags, "slug": slug}
        )
        # The backend may assign a different slug than the one requested.
        final_slug = response.get("slug") or slug
        goes_live = mode == "publish"
        if goes_live:
            client.publish_post(final_slug)
        return PublishResult(
            mode=mode,
            status="ok",
            slug=final_slug,
            blog_url=f"{base_url.rstrip('/')}/posts/{final_slug}",
            public_ok=goes_live,
        )
    except Exception as exc:
        return outcome("failed", error=f"{type(exc).__name__}: {exc}")