LocalAgent/llm/client.py

"""
LLM 统一调用客户端
所有模型通过 SiliconFlow API 调用
支持流式和非流式两种模式
支持自动重试机制
"""

import os
import json
import time
import requests
from pathlib import Path
from typing import Optional, Generator, Callable, List, Dict, Any
from dotenv import load_dotenv

# 获取项目根目录
PROJECT_ROOT = Path(__file__).parent.parent
ENV_PATH = PROJECT_ROOT / ".env"


class LLMClientError(Exception):
    """LLM 客户端异常"""
    pass


class LLMClient:
    """
    统一的 LLM 调用客户端

    使用方式:
        client = LLMClient()

        # 非流式调用
        response = client.chat(
            messages=[{"role": "user", "content": "你好"}],
            model="Qwen/Qwen2.5-7B-Instruct"
        )

        # 流式调用
        for chunk in client.chat_stream(
            messages=[{"role": "user", "content": "你好"}],
            model="Qwen/Qwen2.5-7B-Instruct"
        ):
            print(chunk, end="", flush=True)

    特性:
        - 自动重试：网络错误时自动重试（默认3次）
        - 指数退避：重试间隔逐渐增加
    """

    # 重试配置
    DEFAULT_MAX_RETRIES = 3
    DEFAULT_RETRY_DELAY = 1.0  # 初始重试延迟（秒）
    DEFAULT_RETRY_BACKOFF = 2.0  # 退避倍数

    def __init__(self, max_retries: int = DEFAULT_MAX_RETRIES):
        load_dotenv(ENV_PATH)

        self.api_url = os.getenv("LLM_API_URL")
        self.api_key = os.getenv("LLM_API_KEY")
        self.max_retries = max_retries

        if not self.api_url:
            raise LLMClientError("未配置 LLM_API_URL，请检查 .env 文件")
        if not self.api_key or self.api_key == "your_api_key_here":
            raise LLMClientError("未配置有效的 LLM_API_KEY，请检查 .env 文件")

    def _should_retry(self, exception: Exception) -> bool:
        """判断是否应该重试"""
        # 网络连接错误、超时错误可以重试
        if isinstance(exception, (requests.exceptions.ConnectionError,
                                   requests.exceptions.Timeout)):
            return True
        # 服务器错误（5xx）可以重试
        if isinstance(exception, LLMClientError):
            error_msg = str(exception)
            if "状态码: 5" in error_msg or "502" in error_msg or "503" in error_msg or "504" in error_msg:
                return True
        return False

    def _do_request_with_retry(
        self,
        request_func: Callable,
        operation_name: str = "请求"
    ):
        """带重试的请求执行"""
        last_exception = None

        for attempt in range(self.max_retries + 1):
            try:
                return request_func()
            except Exception as e:
                last_exception = e

                # 判断是否应该重试
                if attempt < self.max_retries and self._should_retry(e):
                    delay = self.DEFAULT_RETRY_DELAY * (self.DEFAULT_RETRY_BACKOFF ** attempt)
                    print(f"[重试] {operation_name}失败，{delay:.1f}秒后重试 ({attempt + 1}/{self.max_retries})...")
                    time.sleep(delay)
                    continue
                else:
                    raise

        # 所有重试都失败
        raise last_exception

    def chat(
        self,
        messages: List[Dict[str, str]],
        model: str,
        temperature: float = 0.7,
        max_tokens: int = 1024,
        timeout: int = 180
    ) -> str:
        """
        调用 LLM 进行对话（非流式，带自动重试）

        Args:
            messages: 消息列表
            model: 模型名称
            temperature: 温度参数
            max_tokens: 最大生成 token 数
            timeout: 超时时间（秒），默认 180 秒

        Returns:
            LLM 生成的文本内容
        """
        def do_request():
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }

            payload = {
                "model": model,
                "messages": messages,
                "stream": False,
                "temperature": temperature,
                "max_tokens": max_tokens
            }

            try:
                response = requests.post(
                    self.api_url,
                    headers=headers,
                    json=payload,
                    timeout=timeout
                )
            except requests.exceptions.Timeout:
                raise LLMClientError(f"请求超时（{timeout}秒），请检查网络连接或稍后重试")
            except requests.exceptions.ConnectionError:
                raise LLMClientError("网络连接失败，请检查网络设置")
            except requests.exceptions.RequestException as e:
                raise LLMClientError(f"网络请求异常: {str(e)}")

            if response.status_code != 200:
                error_msg = f"API 返回错误 (状态码: {response.status_code})"
                try:
                    error_detail = response.json()
                    if "error" in error_detail:
                        error_msg += f": {error_detail['error']}"
                except:
                    error_msg += f": {response.text[:200]}"
                raise LLMClientError(error_msg)

            try:
                result = response.json()
                content = result["choices"][0]["message"]["content"]
                return content
            except (KeyError, IndexError, TypeError) as e:
                raise LLMClientError(f"解析 API 响应失败: {str(e)}")

        return self._do_request_with_retry(do_request, "LLM调用")

    def chat_stream(
        self,
        messages: List[Dict[str, str]],
        model: str,
        temperature: float = 0.7,
        max_tokens: int = 2048,
        timeout: int = 180
    ) -> Generator[str, None, None]:
        """
        调用 LLM 进行对话（流式，带自动重试）

        Args:
            messages: 消息列表
            model: 模型名称
            temperature: 温度参数
            max_tokens: 最大生成 token 数
            timeout: 超时时间（秒）

        Yields:
            逐个返回生成的文本片段
        """
        def do_request():
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }

            payload = {
                "model": model,
                "messages": messages,
                "stream": True,
                "temperature": temperature,
                "max_tokens": max_tokens
            }

            try:
                response = requests.post(
                    self.api_url,
                    headers=headers,
                    json=payload,
                    timeout=timeout,
                    stream=True
                )
            except requests.exceptions.Timeout:
                raise LLMClientError(f"请求超时（{timeout}秒），请检查网络连接或稍后重试")
            except requests.exceptions.ConnectionError:
                raise LLMClientError("网络连接失败，请检查网络设置")
            except requests.exceptions.RequestException as e:
                raise LLMClientError(f"网络请求异常: {str(e)}")

            if response.status_code != 200:
                error_msg = f"API 返回错误 (状态码: {response.status_code})"
                try:
                    error_detail = response.json()
                    if "error" in error_detail:
                        error_msg += f": {error_detail['error']}"
                except:
                    error_msg += f": {response.text[:200]}"
                raise LLMClientError(error_msg)

            return response

        # 流式请求的重试只在建立连接阶段
        response = self._do_request_with_retry(do_request, "流式LLM调用")

        # 解析 SSE 流
        for line in response.iter_lines():
            if line:
                line = line.decode('utf-8')
                if line.startswith('data: '):
                    data = line[6:]  # 去掉 "data: " 前缀
                    if data == '[DONE]':
                        break
                    try:
                        chunk = json.loads(data)
                        if 'choices' in chunk and len(chunk['choices']) > 0:
                            delta = chunk['choices'][0].get('delta', {})
                            content = delta.get('content', '')
                            if content:
                                yield content
                    except json.JSONDecodeError:
                        continue

    def chat_stream_collect(
        self,
        messages: List[Dict[str, str]],
        model: str,
        temperature: float = 0.7,
        max_tokens: int = 2048,
        timeout: int = 180,
        on_chunk: Optional[Callable[[str], None]] = None
    ) -> str:
        """
        流式调用并收集完整结果

        Args:
            messages: 消息列表
            model: 模型名称
            temperature: 温度参数
            max_tokens: 最大生成 token 数
            timeout: 超时时间（秒）
            on_chunk: 每收到一个片段时的回调函数

        Returns:
            完整的生成文本
        """
        full_content = []

        for chunk in self.chat_stream(
            messages=messages,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout
        ):
            full_content.append(chunk)
            if on_chunk:
                on_chunk(chunk)

        return ''.join(full_content)


# 全局单例（延迟初始化）
_client: Optional[LLMClient] = None


def get_client() -> LLMClient:
    """获取 LLM 客户端单例"""
    global _client
    if _client is None:
        _client = LLMClient()
    return _client