"""Published-URL history bookkeeping and blog publishing helpers."""

from __future__ import annotations

import hashlib
import json
from dataclasses import dataclass
from datetime import date, datetime, timezone
from pathlib import Path
from typing import Any, Protocol

from .models import NewsItem, PublishedUrlEntry, PublishedUrls


@dataclass
class PublishResult:
    """Outcome of a single publish attempt."""

    # Mode the caller requested (this module special-cases "dry-run" and "publish").
    mode: str
    # One of the statuses set in this module: "ok", "blocked", "failed",
    # "already_published".
    status: str
    slug: str
    blog_url: str
    # True for dry runs, for identical already-published content, and after a
    # successful call in "publish" mode.
    public_ok: bool = False
    error: str | None = None


class BlogClient(Protocol):
    """Structural interface of the blog backend used by ``publish_markdown``."""

    def get_post_by_slug(self, slug: str) -> dict[str, Any] | None: ...

    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]: ...

    def publish_post(self, slug: str) -> None: ...


def _parse_date(value: str | None) -> date | None:
    """Parse an ISO date or datetime string into a ``date``.

    Returns ``None`` for empty input or text that neither
    ``date.fromisoformat`` (applied to the first ten characters) nor
    ``datetime.fromisoformat`` (applied to the whole text) accepts.
    """
    if not value:
        return None
    text = value.strip()
    try:
        return date.fromisoformat(text[:10])
    except ValueError:
        pass
    try:
        return datetime.fromisoformat(text).date()
    except ValueError:
        return None


def _coerce_titles(raw_titles: Any) -> list[str]:
    """Normalize a stored ``titles`` value into a list of non-empty strings."""
    if isinstance(raw_titles, str):
        # A bare string is one title, not a sequence of characters.
        return [raw_titles] if raw_titles else []
    if not isinstance(raw_titles, (list, tuple)):
        # Missing, null, or otherwise malformed values carry no titles.
        return []
    return [str(title) for title in raw_titles if str(title)]


def _published_entry_from_dict(value: Any) -> PublishedUrlEntry | None:
    """Build a ``PublishedUrlEntry`` from one decoded JSON history record.

    Returns ``None`` when ``value`` is not a dict or carries neither a
    ``first_seen`` nor a ``last_published`` value. When only one of the two
    dates is present it is used for both fields.
    """
    if not isinstance(value, dict):
        return None
    first_seen = str(value.get("first_seen") or "")
    last_published = str(value.get("last_published") or first_seen)
    if not first_seen and not last_published:
        return None
    return PublishedUrlEntry(
        first_seen=first_seen or last_published,
        last_published=last_published or first_seen,
        titles=_coerce_titles(value.get("titles")),
    )


def load_published_urls(path: Path) -> PublishedUrls:
    """Load the published-URL history stored as JSON at ``path``.

    Loading is best-effort: a missing, unreadable, or malformed file yields an
    empty ``PublishedUrls()``, and malformed individual fields fall back to
    defaults instead of raising.
    """
    if not path.exists():
        return PublishedUrls()
    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        # Deliberately broad: any read/decode/parse failure means "no history".
        return PublishedUrls()
    if not isinstance(raw, dict):
        return PublishedUrls()

    raw_urls = raw.get("urls")
    if not isinstance(raw_urls, dict):
        # A list/string/number here would otherwise fail on ``.items()``.
        raw_urls = {}

    urls: dict[str, PublishedUrlEntry] = {}
    for canonical_url, value in raw_urls.items():
        if not canonical_url:
            continue
        entry = _published_entry_from_dict(value)
        if entry is not None:
            urls[str(canonical_url)] = entry

    try:
        version = int(raw.get("version") or 1)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric version markers fall back to the default of 1.
        version = 1

    return PublishedUrls(
        version=version,
        urls=urls,
        updated_at=str(raw.get("updated_at") or ""),
    )


def _entry_within_window(
    entry: PublishedUrlEntry,
    *,
    run_date: str,
    max_age_days: int,
) -> bool:
    """Return whether ``entry`` should be kept for a run on ``run_date``.

    A negative ``max_age_days`` disables expiry. Entries are also kept when
    either date cannot be parsed, so unparseable data is never pruned.
    """
    if max_age_days < 0:
        return True
    current = _parse_date(run_date)
    previous = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
    if current is None or previous is None:
        return True
    return (current - previous).days <= max_age_days


def _published_urls_to_dict(history: PublishedUrls) -> dict[str, Any]:
    """Serialize ``history`` into a JSON-compatible dict with URLs sorted by key."""
    serialized_urls = {
        canonical_url: {
            "first_seen": entry.first_seen,
            "last_published": entry.last_published,
            "titles": entry.titles,
        }
        for canonical_url, entry in sorted(history.urls.items())
    }
    return {
        "version": history.version,
        "urls": serialized_urls,
        "updated_at": history.updated_at,
    }


def update_published_urls(
    path: Path,
    items: list[NewsItem],
    *,
    run_date: str,
    max_age_days: int = 7,
) -> PublishedUrls:
    """Record ``items`` as published on ``run_date`` and persist the history.

    Loads the existing history from ``path``, drops entries outside the
    ``max_age_days`` window, inserts or refreshes an entry for every item that
    has a ``canonical_url``, stamps ``updated_at`` with the current UTC time,
    and writes the result back to ``path`` (creating parent directories).

    Returns the updated ``PublishedUrls``.
    """
    history = load_published_urls(path)
    history.urls = {
        canonical_url: entry
        for canonical_url, entry in history.urls.items()
        if _entry_within_window(entry, run_date=run_date, max_age_days=max_age_days)
    }

    for item in items:
        if not item.canonical_url:
            continue
        title = item.title or item.title_raw
        entry = history.urls.get(item.canonical_url)
        if entry is None:
            entry = PublishedUrlEntry(
                first_seen=run_date,
                last_published=run_date,
                titles=[],
            )
            history.urls[item.canonical_url] = entry
        entry.last_published = run_date
        if title and title not in entry.titles:
            entry.titles.append(title)

    history.updated_at = datetime.now(timezone.utc).isoformat()
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(
        json.dumps(_published_urls_to_dict(history), ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    return history


def _post_url(base_url: str, slug: str) -> str:
    """Return the public post URL for ``slug`` under ``base_url``."""
    return f"{base_url.rstrip('/')}/posts/{slug}"


def dry_run_publish(slug: str, base_url: str) -> PublishResult:
    """Return the successful result a dry run reports without calling any client."""
    return PublishResult(
        mode="dry-run",
        status="ok",
        slug=slug,
        blog_url=_post_url(base_url, slug),
        public_ok=True,
    )


def _content_hash(value: str) -> str:
    """Return the SHA-256 hex digest of ``value`` (a falsy value hashes as "")."""
    return hashlib.sha256((value or "").encode("utf-8")).hexdigest()


def _get_existing_post(client: BlogClient, slug: str) -> dict[str, Any] | None:
    """Look up ``slug`` on ``client``.

    Returns ``None`` when the client has no ``get_post_by_slug`` attribute or
    when the lookup result is not a dict.
    """
    getter = getattr(client, "get_post_by_slug", None)
    if getter is None:
        return None
    existing = getter(slug)
    return existing if isinstance(existing, dict) else None


def publish_markdown(
    *,
    title: str,
    markdown: str,
    tags: list[str],
    slug: str,
    base_url: str,
    mode: str,
    markdown_report: dict[str, Any],
    client: BlogClient | None,
    idempotency_config: dict[str, Any] | None = None,
) -> PublishResult:
    """Create (and in "publish" mode, publish) a blog post from ``markdown``.

    Checks run in this order:

    1. Any ``blocking_errors`` in ``markdown_report`` -> status "blocked".
    2. ``mode == "dry-run"`` -> the ``dry_run_publish`` result, no client calls.
    3. ``client is None`` -> status "failed" with "missing_blog_client".
    4. If ``idempotency_config["enabled"]`` is truthy, look up the slug:
       identical content -> "already_published"; different content ->
       "blocked" with "slug_already_exists" unless ``allow_republish`` is
       truthy; a lookup exception -> "failed".
    5. Otherwise call ``create_post`` and, only when ``mode == "publish"``,
       ``publish_post``.

    Client exceptions are never propagated; they are reported through
    ``PublishResult.error``.
    """
    blog_url = _post_url(base_url, slug)

    raw_errors = markdown_report.get("blocking_errors", []) or []
    if isinstance(raw_errors, str):
        # A single message must not be joined character by character.
        raw_errors = [raw_errors]
    # Coerce entries so non-string error objects cannot break the join.
    blocking_errors = [str(err) for err in raw_errors]
    if blocking_errors:
        return PublishResult(
            mode=mode,
            status="blocked",
            slug=slug,
            blog_url=blog_url,
            public_ok=False,
            error=";".join(blocking_errors),
        )

    if mode == "dry-run":
        return dry_run_publish(slug, base_url)

    if client is None:
        return PublishResult(
            mode=mode,
            status="failed",
            slug=slug,
            blog_url=blog_url,
            public_ok=False,
            error="missing_blog_client",
        )

    idempotency_config = idempotency_config or {}
    if bool(idempotency_config.get("enabled", False)):
        try:
            existing_post = _get_existing_post(client, slug)
        except Exception as exc:
            return PublishResult(
                mode=mode,
                status="failed",
                slug=slug,
                blog_url=blog_url,
                public_ok=False,
                error=f"idempotency_check_failed:{type(exc).__name__}: {exc}",
            )
        if existing_post is not None:
            existing_content = str(
                existing_post.get("content") or existing_post.get("markdown") or ""
            )
            if _content_hash(existing_content) == _content_hash(markdown):
                return PublishResult(
                    mode=mode,
                    status="already_published",
                    slug=slug,
                    blog_url=blog_url,
                    public_ok=True,
                )
            if not bool(idempotency_config.get("allow_republish", False)):
                return PublishResult(
                    mode=mode,
                    status="blocked",
                    slug=slug,
                    blog_url=blog_url,
                    public_ok=False,
                    error="slug_already_exists",
                )

    payload = {"title": title, "content": markdown, "tags": tags, "slug": slug}
    try:
        create_resp = client.create_post(payload)
        # The backend may assign a different slug than the one requested.
        created_slug = create_resp.get("slug") or slug
        if mode == "publish":
            client.publish_post(created_slug)
        return PublishResult(
            mode=mode,
            status="ok",
            slug=created_slug,
            blog_url=_post_url(base_url, created_slug),
            public_ok=mode == "publish",
        )
    except Exception as exc:
        # NOTE(review): if publish_post raises after create_post succeeded, this
        # result still carries the requested slug rather than created_slug.
        return PublishResult(
            mode=mode,
            status="failed",
            slug=slug,
            blog_url=blog_url,
            public_ok=False,
            error=f"{type(exc).__name__}: {exc}",
        )