262 lines
7.8 KiB
Python
262 lines
7.8 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import hashlib
|
|
from dataclasses import dataclass
|
|
from datetime import date, datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Protocol
|
|
|
|
from .models import NewsItem, PublishedUrlEntry, PublishedUrls
|
|
|
|
|
|
@dataclass
class PublishResult:
    """Outcome record returned by the publishing helpers in this module.

    Only ``mode``, ``status``, ``slug`` and ``blog_url`` are required; the
    remaining fields default to "not public, no error".
    """

    # Publishing mode the result belongs to (this module compares it against
    # "dry-run" and "publish").
    mode: str
    # Outcome label; this module emits "ok", "blocked", "failed" and
    # "already_published".
    status: str
    # Slug of the post the result refers to.
    slug: str
    # URL of the post, built by this module as "<base_url>/posts/<slug>".
    blog_url: str
    # Whether the post is reported as publicly available.
    public_ok: bool = False
    # Failure or blocking reason; None when there is nothing to report.
    error: str | None = None
|
|
|
|
|
|
class BlogClient(Protocol):
    """Structural interface of the blog backend used for publishing."""

    def get_post_by_slug(self, slug: str) -> dict[str, Any] | None:
        """Return the post stored under *slug* as a dict, or None if absent."""
        ...

    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Create a post from *payload* and return the backend's response."""
        ...

    def publish_post(self, slug: str) -> None:
        """Publish the post identified by *slug*."""
        ...
|
|
|
|
|
|
def _parse_date(value: str | None) -> date | None:
    """Extract a calendar date from an ISO-formatted string.

    The first ten characters of the stripped text are tried as a plain date
    first; if that fails, the whole stripped text is parsed as an ISO
    datetime and its date part is kept.

    Returns:
        The parsed ``date``, or None when *value* is empty/None or neither
        parse succeeds.
    """
    if not value:
        return None

    text = value.strip()

    # Attempt 1: leading "YYYY-MM-DD" prefix.
    try:
        return date.fromisoformat(text[:10])
    except ValueError:
        pass

    # Attempt 2: the full string as an ISO datetime.
    try:
        return datetime.fromisoformat(text).date()
    except ValueError:
        return None
|
|
|
|
|
|
def _published_entry_from_dict(value: Any) -> PublishedUrlEntry | None:
    """Build a ``PublishedUrlEntry`` from one raw (JSON-decoded) history record.

    Args:
        value: The decoded record. Anything that is not a dict is rejected.

    Returns:
        The entry, or None when *value* is not a dict or carries neither a
        ``first_seen`` nor a ``last_published`` value. When only one of the
        two dates is present it is used for both fields.
    """
    if not isinstance(value, dict):
        return None

    first_seen = str(value.get("first_seen") or "")
    last_published = str(value.get("last_published") or first_seen)
    # Decide whether the record is usable before touching "titles", so a
    # record that is going to be discarded can never fail on a bad title list.
    if not first_seen and not last_published:
        return None

    raw_titles = value.get("titles")
    if isinstance(raw_titles, str):
        # A bare string is one title, not a sequence of characters.
        raw_titles = [raw_titles]
    elif not isinstance(raw_titles, (list, tuple)):
        # Missing, null, or malformed (number, object, ...): no titles.
        raw_titles = []
    # Drop null and empty titles; str(None) would otherwise store "None".
    titles = [str(title) for title in raw_titles if title is not None and str(title)]

    return PublishedUrlEntry(
        first_seen=first_seen or last_published,
        last_published=last_published or first_seen,
        titles=titles,
    )
|
|
|
|
|
|
def load_published_urls(path: Path) -> PublishedUrls:
    """Load the published-URL history stored as JSON at *path*.

    Loading is best-effort: a missing, unreadable or malformed file yields an
    empty ``PublishedUrls()`` instead of raising, and malformed parts of an
    otherwise valid document fall back to defaults.

    Args:
        path: Location of the JSON history file.

    Returns:
        The parsed history. Records with an empty URL key, or that
        ``_published_entry_from_dict`` rejects, are skipped.
    """
    if not path.exists():
        return PublishedUrls()
    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        # Deliberately broad: any read/decode/parse failure means "no usable
        # history" rather than a crash.
        return PublishedUrls()
    if not isinstance(raw, dict):
        return PublishedUrls()

    # A truthy non-dict "urls" (e.g. a list) has no .items(); treat as empty.
    raw_urls = raw.get("urls")
    if not isinstance(raw_urls, dict):
        raw_urls = {}

    urls: dict[str, PublishedUrlEntry] = {}
    for canonical_url, value in raw_urls.items():
        if not canonical_url:
            continue
        entry = _published_entry_from_dict(value)
        if entry is not None:
            urls[str(canonical_url)] = entry

    # A non-numeric "version" must not abort the load; fall back to 1.
    try:
        version = int(raw.get("version") or 1)
    except (TypeError, ValueError, OverflowError):
        version = 1

    return PublishedUrls(
        version=version,
        urls=urls,
        updated_at=str(raw.get("updated_at") or ""),
    )
|
|
|
|
|
|
def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
    """Tell whether *entry* should be kept for the run on *run_date*.

    Args:
        entry: History entry; its ``last_published`` is used, falling back
            to ``first_seen`` when that does not parse.
        run_date: Date of the current run, as text.
        max_age_days: Maximum allowed age in days; a negative value disables
            the age check.

    Returns:
        True when the age check is disabled, when either date cannot be
        parsed, or when the entry is at most *max_age_days* days older than
        *run_date*; False otherwise.
    """
    if max_age_days < 0:
        return True

    today = _parse_date(run_date)
    last_activity = _parse_date(entry.last_published) or _parse_date(entry.first_seen)

    if today is not None and last_activity is not None:
        age = today - last_activity
        return age.days <= max_age_days

    # Unparseable dates: keep the entry rather than guess.
    return True
|
|
|
|
|
|
def _published_urls_to_dict(history: PublishedUrls) -> dict[str, Any]:
    """Convert *history* into a plain dict suitable for ``json.dumps``.

    The result has the keys ``version``, ``urls`` and ``updated_at``; the
    ``urls`` mapping is emitted in sorted URL order and each record carries
    ``first_seen``, ``last_published`` and ``titles``.
    """
    document: dict[str, Any] = {"version": history.version}

    records: dict[str, Any] = {}
    for url in sorted(history.urls):
        record = history.urls[url]
        records[url] = {
            "first_seen": record.first_seen,
            "last_published": record.last_published,
            "titles": record.titles,
        }

    document["urls"] = records
    document["updated_at"] = history.updated_at
    return document
|
|
|
|
|
|
def update_published_urls(
    path: Path,
    items: list[NewsItem],
    *,
    run_date: str,
    max_age_days: int = 7,
) -> PublishedUrls:
    """Record *items* as published on *run_date* and persist the history.

    Steps: load the history at *path*, drop entries that fall outside the
    ``max_age_days`` window, add or refresh one entry per item, stamp
    ``updated_at`` with the current UTC time, and write the result back as
    JSON.

    Args:
        path: JSON history file; parent directories are created if needed.
        items: Items published in this run. Items without a
            ``canonical_url`` are ignored; ``title`` (or ``title_raw`` as a
            fallback) is appended to the entry's titles if not yet present.
        run_date: Date recorded as ``last_published`` (and as ``first_seen``
            for new entries).
        max_age_days: Retention window passed to ``_entry_within_window``.

    Returns:
        The updated history that was written to disk.

    Raises:
        OSError: If the directory or file cannot be written.
    """
    history = load_published_urls(path)
    history.urls = {
        canonical_url: entry
        for canonical_url, entry in history.urls.items()
        if _entry_within_window(entry, run_date=run_date, max_age_days=max_age_days)
    }

    for item in items:
        if not item.canonical_url:
            continue
        title = item.title or item.title_raw
        entry = history.urls.get(item.canonical_url)
        if entry is None:
            entry = PublishedUrlEntry(
                first_seen=run_date,
                last_published=run_date,
                titles=[],
            )
            history.urls[item.canonical_url] = entry
        entry.last_published = run_date
        if title and title not in entry.titles:
            entry.titles.append(title)

    history.updated_at = datetime.now(timezone.utc).isoformat()
    path.parent.mkdir(parents=True, exist_ok=True)

    serialized = json.dumps(_published_urls_to_dict(history), ensure_ascii=False, indent=2)
    # Write to a sibling temp file and rename it over the target. The loader
    # treats unparseable JSON as empty history, so a partially written file
    # would silently discard all dedup state; the rename keeps the previous
    # file intact until the new one is complete.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(serialized, encoding="utf-8")
    tmp_path.replace(path)
    return history
|
|
|
|
|
|
def dry_run_publish(slug: str, base_url: str) -> PublishResult:
    """Return the result of a simulated publish without contacting any backend.

    The result always has mode ``"dry-run"``, status ``"ok"`` and
    ``public_ok=True``; ``blog_url`` is ``<base_url>/posts/<slug>`` with any
    trailing slashes stripped from *base_url*.
    """
    site_root = base_url.rstrip("/")
    post_url = f"{site_root}/posts/{slug}"
    return PublishResult(
        mode="dry-run",
        status="ok",
        slug=slug,
        blog_url=post_url,
        public_ok=True,
    )
|
|
|
|
|
|
def _content_hash(value: str) -> str:
    """Return the hexadecimal SHA-256 digest of *value* encoded as UTF-8.

    A falsy *value* (such as None or "") is hashed as the empty string.
    """
    text = value if value else ""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
|
|
|
|
|
|
def _get_existing_post(client: BlogClient, slug: str) -> dict[str, Any] | None:
    """Look up the post stored under *slug*, tolerating limited clients.

    Returns None when *client* has no usable ``get_post_by_slug`` attribute
    or when the lookup yields anything other than a dict. Exceptions raised
    by the lookup itself propagate to the caller.
    """
    lookup = getattr(client, "get_post_by_slug", None)
    if lookup is not None:
        found = lookup(slug)
        if isinstance(found, dict):
            return found
    return None
|
|
|
|
|
|
def publish_markdown(
    *,
    title: str,
    markdown: str,
    tags: list[str],
    slug: str,
    base_url: str,
    mode: str,
    markdown_report: dict[str, Any],
    client: BlogClient | None,
    idempotency_config: dict[str, Any] | None = None,
) -> PublishResult:
    """Create (and optionally publish) a post, reporting the outcome.

    Failures are reported through the returned ``PublishResult`` rather than
    raised. The checks run in this order:

    1. Any ``blocking_errors`` in *markdown_report* -> status ``"blocked"``
       with the errors joined by ``";"``.
    2. ``mode == "dry-run"`` -> the result of ``dry_run_publish``.
    3. No *client* -> status ``"failed"``, error ``"missing_blog_client"``.
    4. If ``idempotency_config["enabled"]`` is truthy, an existing post with
       the same slug yields ``"already_published"`` when its content hash
       matches *markdown*, or ``"blocked"`` (``"slug_already_exists"``)
       unless ``allow_republish`` is truthy. A failing lookup yields
       ``"failed"``.
    5. Otherwise the post is created, and additionally published when
       ``mode == "publish"``; any exception yields ``"failed"``.

    Returns:
        A ``PublishResult`` describing what happened. On success the slug and
        URL reflect the slug returned by the backend, if it returned one.
    """
    blocking_errors = markdown_report.get("blocking_errors", []) or []
    blog_url = f"{base_url.rstrip('/')}/posts/{slug}"

    def _outcome(status: str, *, public_ok: bool = False, error: str | None = None) -> PublishResult:
        # Every early exit reports the requested slug and its URL.
        return PublishResult(
            mode=mode,
            status=status,
            slug=slug,
            blog_url=blog_url,
            public_ok=public_ok,
            error=error,
        )

    if blocking_errors:
        return _outcome("blocked", error=";".join(blocking_errors))
    if mode == "dry-run":
        return dry_run_publish(slug, base_url)
    if client is None:
        return _outcome("failed", error="missing_blog_client")

    idempotency_config = idempotency_config or {}
    if idempotency_config.get("enabled", False):
        try:
            existing_post = _get_existing_post(client, slug)
        except Exception as exc:
            return _outcome(
                "failed",
                error=f"idempotency_check_failed:{type(exc).__name__}: {exc}",
            )
        if existing_post is not None:
            # Backends may expose the body under either key.
            stored_body = str(existing_post.get("content") or existing_post.get("markdown") or "")
            if _content_hash(stored_body) == _content_hash(markdown):
                return _outcome("already_published", public_ok=True)
            if not idempotency_config.get("allow_republish", False):
                return _outcome("blocked", error="slug_already_exists")

    payload = dict(title=title, content=markdown, tags=tags, slug=slug)
    try:
        create_resp = client.create_post(payload)
        # The backend may hand back a different slug than the one requested.
        created_slug = create_resp.get("slug") or slug
        went_public = mode == "publish"
        if went_public:
            client.publish_post(created_slug)
        return PublishResult(
            mode=mode,
            status="ok",
            slug=created_slug,
            blog_url=f"{base_url.rstrip('/')}/posts/{created_slug}",
            public_ok=went_public,
        )
    except Exception as exc:
        return _outcome("failed", error=f"{type(exc).__name__}: {exc}")
|