#!/usr/bin/env python3 """Render a Markdown research report as a comfortable standalone HTML file.""" from __future__ import annotations import argparse import base64 import html import mimetypes import re import sys from pathlib import Path try: import markdown except ImportError: # pragma: no cover - environment guidance print("Missing dependency: python package 'markdown'. Install it and rerun.", file=sys.stderr) raise MD_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)") LIST_ITEM_RE = re.compile(r"^\s*(?:[-*+]|\d+[.)])\s+") TOC_HEADING_RE = re.compile(r"^\s{0,3}#{2,6}\s+(?:目录|目錄|contents?|table of contents)\s*$", re.IGNORECASE) TOC_ITEM_RE = re.compile(r"^\s*(?:[-*+]|\d+[.)])\s+\[[^\]]+\]\(#[^)]+\)\s*$") HR_RE = re.compile(r"^\s{0,3}(?:-{3,}|\*{3,}|_{3,})\s*$") MERMAID_BLOCK_RE = re.compile( r'
(.*?)',
re.S,
)
# URL of the minified Mermaid script (major version 10) on the jsDelivr CDN.
MERMAID_CDN = "https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"
def is_external(src: str) -> bool:
    """Return True when *src* is not a plain relative/local path.

    A source counts as external when it is protocol-relative (starts with
    ``//``) or begins with a URI scheme such as ``https:``, ``data:`` or
    ``mailto:`` (a letter followed by letters, digits, ``+``, ``.`` or ``-``
    and then a colon).
    """
    if src.startswith("//"):
        return True
    scheme = re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*:", src)
    return scheme is not None
def embed_images(text: str, base_dir: Path) -> str:
    """Inline local Markdown images as base64 ``data:`` URIs.

    Every ``![alt](src)`` occurrence matched by ``MD_IMAGE_RE`` is inspected:

    * external sources (see ``is_external``) are left untouched;
    * sources that do not resolve to a regular file under *base_dir* are
      left untouched;
    * everything else is rewritten to ``![alt](data:<mime>;base64,<payload>)``
      so the rendered document no longer depends on the image file.

    ``MD_IMAGE_RE`` matches an optional quoted title but does not capture it,
    so a title is not carried over to the rewritten image.

    Args:
        text: Markdown source.
        base_dir: Directory that relative image paths are resolved against.

    Returns:
        The Markdown text with resolvable local images embedded.
    """

    def replace(match: re.Match[str]) -> str:
        alt, src = match.groups()
        if is_external(src):
            return match.group(0)
        image_path = (base_dir / src).resolve()
        # is_file() instead of exists(): a directory exists but read_bytes()
        # on it raises, so treat it like a missing image and keep the markup.
        if not image_path.is_file():
            return match.group(0)
        # Fall back to a generic binary type when the extension is unknown.
        mime = mimetypes.guess_type(image_path.name)[0] or "application/octet-stream"
        encoded = base64.b64encode(image_path.read_bytes()).decode("ascii")
        # Emit Markdown (not HTML) so the image goes through the same
        # Markdown rendering as every other image in the document.
        return f"![{alt}](data:{mime};base64,{encoded})"

    return MD_IMAGE_RE.sub(replace, text)
def normalize_markdown(text: str) -> str:
    """Insert the blank lines that list blocks need to parse as lists.

    Generated reports often put a list directly under a "label:" line, or
    resume body text directly after the last list item. Python-Markdown then
    renders the items as literal text, so a blank line is added:

    * before a list item whose preceding emitted line is non-blank and is
      not itself a list item;
    * after a list item whose following line is non-blank, is not a list
      item and does not start with a space or tab (an indented continuation).

    Lines inside fenced code blocks (``` or ~~~) are passed through as-is.
    Lines are re-joined with "\\n"; a trailing newline is kept only when the
    input ended with one.
    """
    # NOTE(review): as written both arguments are the same character, so this
    # call changes nothing; presumably the first argument was meant to be a
    # different pipe-like sequence - confirm against the intended source.
    text = text.replace("|", "|")
    source_lines = text.splitlines()
    out: list[str] = []
    inside_fence = False

    # Pair every line with its successor; the final line is paired with "".
    followers = source_lines[1:] + [""]
    for current, following in zip(source_lines, followers):
        if current.strip().startswith(("```", "~~~")):
            inside_fence = not inside_fence

        starts_item = not inside_fence and LIST_ITEM_RE.match(current) is not None

        if starts_item and out:
            previous = out[-1]
            if previous.strip() and LIST_ITEM_RE.match(previous) is None:
                out.append("")

        out.append(current)

        if not starts_item or not following.strip():
            continue
        if LIST_ITEM_RE.match(following) or following.startswith((" ", "\t")):
            continue
        out.append("")

    tail = "\n" if text.endswith("\n") else ""
    return "\n".join(out) + tail
def strip_inline_toc(text: str) -> str:
    """Drop an inline Markdown table of contents from *text*.

    A TOC is recognised as a heading matching ``TOC_HEADING_RE`` followed,
    after optional blank lines, by at least two consecutive lines matching
    ``TOC_ITEM_RE``. The heading, the items, any blank lines after them, and
    one optional horizontal rule (plus the blank lines after it) are removed.
    A matching heading with fewer than two link items is kept unchanged.

    Fence delimiter lines and everything between them are copied verbatim.
    Lines are re-joined with "\\n"; a trailing newline is kept only when the
    input ended with one.
    """
    lines = text.splitlines()
    total = len(lines)

    def skip_blank(pos: int) -> int:
        # Advance past blank / whitespace-only lines starting at pos.
        while pos < total and not lines[pos].strip():
            pos += 1
        return pos

    kept: list[str] = []
    inside_fence = False
    cursor = 0
    while cursor < total:
        current = lines[cursor]
        if current.strip().startswith(("```", "~~~")):
            # Fence delimiters only toggle state; the line itself is kept below.
            inside_fence = not inside_fence
        elif not inside_fence and TOC_HEADING_RE.match(current):
            first_item = skip_blank(cursor + 1)
            after_items = first_item
            while after_items < total and TOC_ITEM_RE.match(lines[after_items]):
                after_items += 1
            if after_items - first_item >= 2:
                resume = skip_blank(after_items)
                if resume < total and HR_RE.match(lines[resume]):
                    resume = skip_blank(resume + 1)
                cursor = resume
                continue
        kept.append(current)
        cursor += 1

    tail = "\n" if text.endswith("\n") else ""
    return "\n".join(kept) + tail
def title_from_body(body: str) -> str:
match = re.search(r"