Initial commit

2026-01-07 00:17:46 +08:00
commit 4b3286f546
49 changed files with 2492 additions and 0 deletions
--- a/intent/classifier.py
+++ b/intent/classifier.py
@@ -0,0 +1,152 @@
+"""
+意图识别器
+使用小参数 LLM 进行意图二分类
+"""
+
+import os
+import json
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass
+from dotenv import load_dotenv
+
+from llm.client import get_client, LLMClientError, ENV_PATH
+from llm.prompts import INTENT_CLASSIFICATION_SYSTEM, INTENT_CLASSIFICATION_USER
+from intent.labels import CHAT, EXECUTION, EXECUTION_CONFIDENCE_THRESHOLD, VALID_LABELS
+
+
+@dataclass
+class IntentResult:
+    """Result of an intent classification."""
+    label: str  # chat or execution
+    confidence: float  # expected range 0.0 ~ 1.0
+    reason: str  # human-readable explanation (in Chinese)
+    raw_response: Optional[str] = None  # raw LLM response (for debugging)
+
+
+class IntentClassifier:
+    """
+    Intent classifier.
+    
+    Uses a small-parameter LLM (e.g. qwen2.5:7b-instruct) for fast intent recognition.
+    """
+    
+    def __init__(self):
+        load_dotenv(ENV_PATH)  # load environment variables from the dotenv file at ENV_PATH
+        self.model_name = os.getenv("INTENT_MODEL_NAME")  # None when the variable is unset
+    
+    def classify(self, user_input: str) -> IntentResult:
+        """
+        Classify the intent of a user input.
+        
+        Args:
+            user_input: text entered by the user
+            
+        Returns:
+            IntentResult: label, confidence and reason; any exception yields a CHAT result with confidence 0.0
+        """
+        try:
+            client = get_client()
+            
+            messages = [
+                {"role": "system", "content": INTENT_CLASSIFICATION_SYSTEM},
+                {"role": "user", "content": INTENT_CLASSIFICATION_USER.format(user_input=user_input)}
+            ]
+            
+            response = client.chat(
+                messages=messages,
+                model=self.model_name,
+                temperature=0.1,  # low temperature for more deterministic output
+                max_tokens=256
+            )
+            
+            return self._parse_response(response)
+            
+        except LLMClientError as e:
+            # The LLM call failed; fall back to the default CHAT result
+            return IntentResult(
+                label=CHAT,
+                confidence=0.0,
+                reason=f"意图识别失败（{str(e)}），默认为对话模式"
+            )
+        except Exception as e:
+            # Any other exception; fall back to the default CHAT result (raw_response is not recorded here)
+            return IntentResult(
+                label=CHAT,
+                confidence=0.0,
+                reason=f"意图识别异常（{str(e)}），默认为对话模式"
+            )
+    
+    def _parse_response(self, response: str) -> IntentResult:
+        """
+        Parse the LLM response.
+        
+        Tries to parse JSON; on a parse/conversion failure returns a CHAT result with confidence 0.0.
+        """
+        try:
+            # Try to extract the JSON (the LLM may add some text before or after it)
+            json_str = self._extract_json(response)
+            data = json.loads(json_str)
+            
+            # Read the expected fields, using defaults for missing keys
+            label = data.get("label", "").lower()  # NOTE(review): a non-string label raises AttributeError, not caught by the except below
+            confidence = float(data.get("confidence", 0.0))  # NOTE(review): the value is not range-checked
+            reason = data.get("reason", "无")
+            
+            # Validate the label
+            if label not in VALID_LABELS:
+                return IntentResult(
+                    label=CHAT,
+                    confidence=0.0,
+                    reason=f"无效的意图标签 '{label}'，默认为对话模式",
+                    raw_response=response
+                )
+            
+            # Apply the confidence threshold: a low-confidence execution label is downgraded to chat
+            if label == EXECUTION and confidence < EXECUTION_CONFIDENCE_THRESHOLD:
+                return IntentResult(
+                    label=CHAT,
+                    confidence=confidence,
+                    reason=f"执行任务置信度不足（{confidence:.2f} < {EXECUTION_CONFIDENCE_THRESHOLD}），降级为对话模式。原因: {reason}",
+                    raw_response=response
+                )
+            
+            return IntentResult(
+                label=label,
+                confidence=confidence,
+                reason=reason,
+                raw_response=response
+            )
+            
+        except (json.JSONDecodeError, ValueError, TypeError) as e:  # NOTE(review): `e` is unused
+            # JSON parsing or value conversion failed; fall back to the default CHAT result
+            return IntentResult(
+                label=CHAT,
+                confidence=0.0,
+                reason=f"响应解析失败，默认为对话模式",  # NOTE(review): f-string has no placeholders
+                raw_response=response
+            )
+    
+    def _extract_json(self, text: str) -> str:
+        """
+        Extract a JSON string from text.
+        
+        The LLM may wrap the JSON in explanatory text, so the span from the first '{' to the last '}' is returned.
+        """
+        # Try to locate the start and end positions of the JSON object
+        start = text.find('{')
+        end = text.rfind('}')
+        
+        if start != -1 and end != -1 and end > start:
+            return text[start:end + 1]
+        
+        # If no such span is found, return the original text so that json.loads raises
+        return text
+
+
+# Convenience function
+def classify_intent(user_input: str) -> IntentResult:
+    """Classify the intent of user_input using a newly constructed IntentClassifier."""
+    classifier = IntentClassifier()
+    return classifier.classify(user_input)
+