first commit

This commit is contained in:
Hermes Agent
2026-05-10 13:52:46 +08:00
commit ccc63d1e70
4583 changed files with 584341 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
# vlm module - Vision Language Model
from .vlm_adapter import VlmAdapter
__all__ = ["VlmAdapter"]

View File

@@ -0,0 +1,120 @@
"""Image encoding / decoding utilities for VLM."""
from __future__ import annotations
import base64
import io
from pathlib import Path
from PIL import Image
def read_image_bytes(image: str | bytes) -> bytes:
"""Read raw image bytes from a path or return bytes unchanged.
Args:
image: File path to an image, or raw image bytes.
Returns:
bytes: Raw image bytes.
Raises:
FileNotFoundError: If image is a path and the file does not exist.
"""
if isinstance(image, bytes):
return image
path = Path(image)
if not path.is_file():
raise FileNotFoundError(f"Image file not found: {image}")
return path.read_bytes()
def detect_mime(data: bytes) -> str:
    """Guess the MIME type of an image from its leading magic bytes.

    Only the PNG and JPEG signatures are recognised. Everything else,
    including empty or truncated input, is reported as PNG.

    Args:
        data: Raw image bytes. Input shorter than a signature is allowed;
            the short prefix simply fails to match.

    Returns:
        str: 'image/jpeg' for JPEG data, otherwise 'image/png' (both for
        real PNG data and as the fallback for unrecognised input).
    """
    known_signatures = (
        (b"\x89PNG\r\n\x1a\n", "image/png"),
        (b"\xff\xd8\xff", "image/jpeg"),
    )
    for magic, mime_type in known_signatures:
        if data[: len(magic)] == magic:
            return mime_type
    return "image/png"
def detect_suffix(data: bytes) -> str:
    """Guess a file extension for an image from its leading magic bytes.

    Args:
        data: Raw image bytes. Input shorter than a signature is allowed;
            the short prefix simply fails to match.

    Returns:
        str: '.png' for PNG data, '.jpg' for JPEG data, and '.bin' when
        neither signature matches.
    """
    known_signatures = (
        (b"\x89PNG\r\n\x1a\n", ".png"),
        (b"\xff\xd8\xff", ".jpg"),
    )
    for magic, suffix in known_signatures:
        if data[: len(magic)] == magic:
            return suffix
    return ".bin"
def image_to_mime_and_bytes(image: str | bytes) -> tuple[str, bytes]:
    """Get MIME type and raw bytes; convert to PNG if format is not PNG/JPEG.

    PNG and JPEG data is passed through untouched. Any other format is
    decoded with Pillow and re-encoded as an RGBA PNG so that the returned
    MIME type always matches the returned bytes.

    Args:
        image: File path or raw image bytes.

    Returns:
        tuple[str, bytes]: (mime_type, raw_bytes). Unknown formats become PNG.

    Raises:
        FileNotFoundError: If image is a path and the file does not exist.
        PIL.UnidentifiedImageError: If the data is neither PNG nor JPEG and
            Pillow cannot decode it (this is an ``OSError`` subclass).
    """
    raw = read_image_bytes(image)
    # The signatures are checked here rather than through detect_mime():
    # that helper reports 'image/png' for anything it does not recognise,
    # which made the conversion below unreachable and let GIF/WebP/BMP data
    # through unconverted but labelled as PNG.
    if raw[:8] == b"\x89PNG\r\n\x1a\n":
        return "image/png", raw
    if raw[:3] == b"\xff\xd8\xff":
        return "image/jpeg", raw
    buf = io.BytesIO()
    # The context manager releases the decoder's resources once re-encoded.
    with Image.open(io.BytesIO(raw)) as img:
        img.convert("RGBA").save(buf, format="PNG")
    return "image/png", buf.getvalue()
def image_to_base64(image: str | bytes) -> tuple[str, str]:
    """Return an image's MIME type together with its base64 text encoding.

    Args:
        image: File path or raw image bytes.

    Returns:
        tuple[str, str]: (mime_type, base64_encoded_string), where the
        MIME type and payload come from ``image_to_mime_and_bytes``.
    """
    mime_type, payload = image_to_mime_and_bytes(image)
    encoded = base64.b64encode(payload)
    return mime_type, encoded.decode("utf-8")
def image_to_data_url(image: str | bytes) -> str:
    """Render an image as an inline ``data:<mime>;base64,<payload>`` URL.

    Args:
        image: File path or raw image bytes.

    Returns:
        str: Data URL string.
    """
    mime_type, encoded = image_to_base64(image)
    header = f"data:{mime_type};base64,"
    return header + encoded
def mask_secret(secret: str) -> str:
    """Mask a secret for logging, showing at most its first 6 and last 4 chars.

    Args:
        secret: Raw secret string.

    Returns:
        str: Masked string such as 'abcdef...ghij'. Secrets of 10 characters
        or fewer are replaced entirely by '*' (one per character), because
        the visible prefix and suffix would otherwise reproduce the whole
        value.
    """
    visible_head = 6
    visible_tail = 4
    # With the previous cut-off of 8, secrets of length 9 or 10 were printed
    # in full: the head and tail slices together covered every character.
    if len(secret) <= visible_head + visible_tail:
        return "*" * len(secret)
    return f"{secret[:visible_head]}...{secret[-visible_tail:]}"

View File

@@ -0,0 +1,55 @@
"""Abstract base class for VLM (Vision Language Model) adapters."""
from __future__ import annotations
from abc import ABC, abstractmethod
class VlmAdapter(ABC):
"""Uniform async interface for a single Vision Language Model backend.
Each concrete adapter wraps one LLM endpoint + model combination and
exposes a single :meth:`vision_completion` coroutine. Synchronous
calling is intentionally **not** supported; callers must run inside an
asyncio event loop.
**Client ownership contract** — when a shared
:class:`httpx.AsyncClient` is supplied at construction time the adapter
*reuses* it and must **not** close it; the caller retains full ownership
of the client's lifecycle. When no external client is provided the
adapter creates and owns an internal client and must close it in
:meth:`aclose`.
"""
@abstractmethod
async def vision_completion(
self,
user_prompt: str,
images: list[str | bytes],
system_prompt: str = "",
model: str | None = None,
) -> str:
"""Send image(s) and a text prompt to the model; return the reply.
Args:
user_prompt: User-facing text instruction.
images: One or more images to pass to the model. Each element
is either a file-path string or raw image bytes.
system_prompt: System-level instruction prepended to the
conversation. Defaults to ''.
model: Model name to use. If None, uses the default set at
initialization.
Returns:
str: Raw text response from the model (may contain JSON or
markdown-wrapped JSON depending on the model and prompt).
"""
@abstractmethod
async def aclose(self) -> None:
"""Release async resources owned by this adapter.
Must be called when the adapter is no longer needed. Adapters that
were given an external shared client must implement this as a no-op;
adapters that created their own internal client must close it here.
"""