#!/usr/bin/env python3 """Render a Markdown research report as a comfortable standalone HTML file.""" from __future__ import annotations import argparse import base64 import html import mimetypes import re import sys from pathlib import Path try: import markdown except ImportError: # pragma: no cover - environment guidance print("Missing dependency: python package 'markdown'. Install it and rerun.", file=sys.stderr) raise MD_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)") LIST_ITEM_RE = re.compile(r"^\s*(?:[-*+]|\d+[.)])\s+") TOC_HEADING_RE = re.compile(r"^\s{0,3}#{2,6}\s+(?:目录|目錄|contents?|table of contents)\s*$", re.IGNORECASE) TOC_ITEM_RE = re.compile(r"^\s*(?:[-*+]|\d+[.)])\s+\[[^\]]+\]\(#[^)]+\)\s*$") HR_RE = re.compile(r"^\s{0,3}(?:-{3,}|\*{3,}|_{3,})\s*$") MERMAID_BLOCK_RE = re.compile( r'
(.*?)
MERMAID_CDN = "https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"

# A URI scheme needs at least two characters here so that a Windows drive
# letter ("C:/img.png", "C:\\img.png") is not mistaken for one.
_URI_SCHEME_RE = re.compile(r"[a-zA-Z][a-zA-Z0-9+.-]+:")

# A run of three or more backticks or tildes at the start of a stripped line.
_FENCE_OPEN_RE = re.compile(r"`{3,}|~{3,}")


def is_external(src: str) -> bool:
    """Return True when *src* is a URL rather than a local file path.

    Anything with a URI scheme (``https:``, ``data:``, ``mailto:`` ...) or a
    protocol-relative ``//host/...`` prefix counts as external. A single
    letter followed by a colon is treated as a drive letter, not a scheme.
    """
    return src.startswith("//") or _URI_SCHEME_RE.match(src) is not None


def embed_images(text: str, base_dir: Path) -> str:
    """Inline local Markdown images as base64 ``data:`` URIs.

    Args:
        text: Markdown source.
        base_dir: Directory that relative image paths are resolved against.

    Returns:
        The Markdown with every readable local image replaced by a data URI.
        External URLs and paths that are not regular files are left as-is.
    """

    def replace(match: re.Match[str]) -> str:
        original = match.group(0)
        alt, src = match.groups()
        if is_external(src):
            return original
        image_path = (base_dir / src).resolve()
        # is_file() rather than exists(): a directory would make read_bytes() raise.
        if not image_path.is_file():
            return original
        mime = mimetypes.guess_type(image_path.name)[0] or "application/octet-stream"
        encoded = base64.b64encode(image_path.read_bytes()).decode("ascii")
        # Everything between the source and the closing parenthesis is the
        # optional ` "title"`; carry it over instead of dropping it.
        title = original[match.end(2) - match.start():-1]
        return f"![{alt}](data:{mime};base64,{encoded}{title})"

    return MD_IMAGE_RE.sub(replace, text)


def _step_fence(line: str, fence: str) -> str:
    """Advance the fenced-code state across one line.

    *fence* is the marker run that opened the current fenced block, or ``""``
    when outside one. Returns the state that holds after *line*: a non-empty
    marker while a fence is open (including on its opening line), ``""``
    otherwise (including on the closing line).
    """
    stripped = line.strip()
    if not fence:
        opener = _FENCE_OPEN_RE.match(stripped)
        if opener is None:
            return ""
        marker = opener.group(0)
        # A backtick fence cannot have backticks in its info string; such a
        # line is an inline code span, e.g. "```x```".
        if marker[0] == "`" and "`" in stripped[len(marker):]:
            return ""
        return marker
    # Only a bare run of the SAME character, at least as long as the opener,
    # closes the block; "```" inside a "~~~" fence is ordinary content.
    closes = len(stripped) >= len(fence) and not stripped.strip(fence[0])
    return "" if closes else fence


def normalize_markdown(text: str) -> str:
    """Make common report Markdown patterns parse consistently.

    Many generated reports write "label:" directly followed by a list with no
    blank line. Python-Markdown treats that as a paragraph plus literal
    hyphens, so add the blank line that Markdown parsers expect, both before
    the first item and after the last one. Fenced code blocks are skipped.

    A trailing newline is preserved when the input has one.
    """
    # NOTE(review): as written this replaces "|" with itself. The first
    # argument presumably held a different character originally (a full-width
    # bar or an entity) -- confirm against the pristine file.
    text = text.replace("|", "|")
    lines = text.splitlines()
    normalized: list[str] = []
    fence = ""
    for i, line in enumerate(lines):
        fence = _step_fence(line, fence)
        is_item = not fence and LIST_ITEM_RE.match(line) is not None
        if (
            is_item
            and normalized
            and normalized[-1].strip()
            and not LIST_ITEM_RE.match(normalized[-1])
        ):
            normalized.append("")
        normalized.append(line)
        if is_item and i + 1 < len(lines):
            next_line = lines[i + 1]
            # Indented followers are continuations of the item; leave them attached.
            if (
                next_line.strip()
                and not LIST_ITEM_RE.match(next_line)
                and not next_line.startswith((" ", "\t"))
            ):
                normalized.append("")
    return "\n".join(normalized) + ("\n" if text.endswith("\n") else "")


def _skip_blank(lines: list[str], index: int) -> int:
    """Return the first position at or after *index* holding a non-blank line."""
    while index < len(lines) and not lines[index].strip():
        index += 1
    return index


def strip_inline_toc(text: str) -> str:
    """Remove a generated Markdown TOC when a side TOC will be rendered.

    A TOC is a "Contents"-style heading followed by at least two list items
    that link to in-page anchors. The heading, the items, the surrounding
    blank lines and one optional horizontal rule after them are dropped.
    Content inside fenced code blocks is never touched.
    """
    lines = text.splitlines()
    total = len(lines)
    kept: list[str] = []
    fence = ""
    i = 0
    while i < total:
        line = lines[i]
        was_fenced = bool(fence)
        fence = _step_fence(line, fence)
        # True inside a fence and on its opening and closing lines.
        if was_fenced or fence:
            kept.append(line)
            i += 1
            continue
        if TOC_HEADING_RE.match(line):
            first_item = _skip_blank(lines, i + 1)
            j = first_item
            while j < total and TOC_ITEM_RE.match(lines[j]):
                j += 1
            # A lone link under the heading is more likely real content.
            if j - first_item >= 2:
                j = _skip_blank(lines, j)
                if j < total and HR_RE.match(lines[j]):
                    j = _skip_blank(lines, j + 1)
                i = j
                continue
        kept.append(line)
        i += 1
    return "\n".join(kept) + ("\n" if text.endswith("\n") else "")


def title_from_body(body: str) -> str:
    """Return the plain text of the first ``<h1>`` in *body*.

    Nested tags inside the heading are removed. Falls back to
    ``"Markdown Report"`` when there is no ``<h1>`` or it is empty.
    """
    # NOTE(review): the tag literals of this pattern were missing from the
    # reviewed copy (it read `]*>(.*?)`, which can never capture a heading);
    # restored as an <h1> matcher -- confirm against the pristine file.
    match = re.search(r"<h1[^>]*>(.*?)</h1>", body, re.S)
    if not match:
        return "Markdown Report"
    return re.sub(r"<.*?>", "", match.group(1)).strip() or "Markdown Report"


def render_mermaid_blocks(body: str) -> tuple[str, int]:
    """Convert fenced mermaid code blocks into Mermaid render targets."""
    # NOTE(review): only the opening statements of this function fall on the
    # lines covered here; they are kept token-for-token and the definition
    # continues on the following source line.

    def replace(match: re.Match[str]) -> str:
        diagram = html.unescape(match.group(1)).strip()
return f'
{html.escape(diagram)}
' return MERMAID_BLOCK_RE.subn(replace, body) def build_mermaid_js(source: str) -> str: if source == "none": return "" if source == "local": loader = '' else: loader = f'' return f""" {loader} """ def build_js() -> str: return """ """ def build_html(title: str, toc: str, body: str, with_js: bool, mermaid_source: str = "none") -> str: progress = '
' if with_js else "" back_top = '' if with_js else "" js = build_js() if with_js else "" mermaid_js = build_mermaid_js(mermaid_source) return f""" {title} {progress}
{body}
{back_top} {mermaid_js} {js} """ def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("input", help="Input Markdown file") parser.add_argument("output", nargs="?", help="Output HTML file") parser.add_argument("--embed-images", dest="embed_images", action="store_true", default=True) parser.add_argument("--no-embed-images", dest="embed_images", action="store_false") parser.add_argument("--with-js", action="store_true", help="Add progress, active TOC, and back-to-top interactions") parser.add_argument("--keep-inline-toc", action="store_true", help="Keep an existing Markdown TOC in the article body") parser.add_argument( "--mermaid-source", choices=["auto", "cdn", "local", "none"], default="auto", help="Render mermaid fences with CDN JS, local mermaid.min.js, or disable rendering", ) parser.add_argument("--title-style", choices=["comfortable"], default="comfortable") args = parser.parse_args() source = Path(args.input).expanduser().resolve() if not source.exists(): print(f"Input file not found: {source}", file=sys.stderr) return 2 output = Path(args.output).expanduser().resolve() if args.output else source.with_suffix(".html") text = normalize_markdown(source.read_text(encoding="utf-8")) if not args.keep_inline_toc: text = strip_inline_toc(text) if args.embed_images: text = embed_images(text, source.parent) md = markdown.Markdown( extensions=["extra", "toc", "sane_lists", "smarty"], extension_configs={"toc": {"permalink": False, "separator": "-"}}, ) body = md.convert(text) body = re.sub(r"(.*?
)", r'
\1
', body, flags=re.S) body, mermaid_count = render_mermaid_blocks(body) mermaid_source = "none" if mermaid_count and args.mermaid_source != "none": mermaid_source = "cdn" if args.mermaid_source == "auto" else args.mermaid_source html = build_html(title_from_body(body), md.toc, body, args.with_js, mermaid_source) output.parent.mkdir(parents=True, exist_ok=True) output.write_text(html, encoding="utf-8") print(output) print(f"tables={html.count('