feat: refactor API key configuration and enhance application initialization

- Renamed `check_environment` to `check_api_key_configured` for clarity, simplifying the API key validation logic.
- Removed the blocking behavior of the API key check during application startup, allowing the app to run while providing a prompt for configuration.
- Updated `LocalAgentApp` to accept an `api_configured` parameter, enabling conditional messaging for API key setup.
- Enhanced the `SandboxRunner` to support backup management and improved execution result handling with detailed metrics.
- Integrated data governance strategies into the `HistoryManager`, ensuring compliance and improved data management.
- Added privacy settings and metrics tracking across various components to enhance user experience and application safety.
2026-02-27 14:32:30 +08:00
parent ab5bbff6f7
commit 8a538bb950
58 changed files with 13457 additions and 350 deletions
--- a/history/task_features.py
+++ b/history/task_features.py
@@ -0,0 +1,380 @@
+"""
+任务特征提取与匹配模块
+用于更精确的相似任务识别
+"""
+
+import re
+from typing import Dict, List, Set, Optional, Tuple
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass
+class TaskFeatures:
+    """Structured features extracted from one task description."""
+    # Basic information
+    raw_input: str
+    keywords: Set[str]
+    
+    # Key parameters
+    file_formats: Set[str]  # File formats (e.g. .txt, .csv, .json)
+    directory_paths: Set[str]  # Directory paths
+    file_names: Set[str]  # File names
+    naming_patterns: List[str]  # Naming rules (e.g. "按日期", "按序号")
+    
+    # Operation types
+    operations: Set[str]  # Operation names (e.g. "重命名", "转换", "批量处理")
+    
+    # Quantity / range parameters
+    quantities: List[str]  # Quantity expressions (e.g. "100个", "所有")
+    
+    # Other constraints
+    constraints: List[str]  # Clauses that state additional conditions
+
+
+@dataclass
+class TaskDifference:
+    """One difference between the current task and a historical task."""
+    category: str  # Difference category (human-readable label)
+    field: str  # Name of the feature field that differs
+    current_value: str  # Value on the current task's side
+    history_value: str  # Value on the historical task's side
+    importance: str  # Importance: critical/high/medium/low
+
+
+class TaskFeatureExtractor:
+    """Regex- and keyword-based extractor that builds TaskFeatures from text.
+
+    All patterns and keyword tables are class attributes, so instances carry
+    no state of their own.
+    """
+
+    # File extensions recognised as formats; the group captures the extension
+    # without its leading dot
+    FILE_FORMAT_PATTERN = r'\.(txt|csv|json|xml|xlsx?|docx?|pdf|png|jpe?g|gif|mp[34]|avi|mov|zip|rar|7z|py|js|java|cpp|html?|css)'
+
+    # Directory paths: Windows drive paths, paths starting with "/", or
+    # fragments starting with "."
+    DIR_PATH_PATTERN = r'(?:[a-zA-Z]:\\[\w\\\s\u4e00-\u9fa5.-]+|/[\w/\s\u4e00-\u9fa5.-]+|[./][\w/\\\s\u4e00-\u9fa5.-]+)'
+
+    # File names: a stem followed by a dot and an alphanumeric extension
+    FILE_NAME_PATTERN = r'[\w\u4e00-\u9fa5.-]+\.[a-zA-Z0-9]+'
+
+    # Quantities: digits plus a measure word, or one of 所有 / 全部 / 批量
+    QUANTITY_PATTERN = r'(\d+\s*[个张份条篇页行列]|所有|全部|批量)'
+
+    # Canonical operation name -> trigger words that imply it
+    OPERATION_KEYWORDS = {
+        '重命名': ['重命名', '改名', '命名', '更名'],
+        '转换': ['转换', '转为', '转成', '变成', '改成'],
+        '批量处理': ['批量', '批处理', '一次性'],
+        '复制': ['复制', '拷贝', 'copy'],
+        '移动': ['移动', '转移', 'move'],
+        '删除': ['删除', '清理', '移除'],
+        '合并': ['合并', '整合', '汇总'],
+        '分割': ['分割', '拆分', '切分'],
+        '压缩': ['压缩', '打包'],
+        '解压': ['解压', '解包', '提取'],
+        '排序': ['排序', '排列'],
+        '筛选': ['筛选', '过滤', '查找'],
+        '统计': ['统计', '计数', '汇总'],
+        '生成': ['生成', '创建', '制作'],
+    }
+
+    # Canonical naming rule -> trigger words that imply it
+    NAMING_PATTERNS = {
+        '按日期': ['日期', '时间', 'date', 'time'],
+        '按序号': ['序号', '编号', '数字', '顺序'],
+        '按前缀': ['前缀', '开头'],
+        '按后缀': ['后缀', '结尾'],
+        '按内容': ['内容', '根据'],
+    }
+
+    def extract(self, user_input: str) -> TaskFeatures:
+        """Extract structured features from a task description.
+
+        Args:
+            user_input: Raw text of the user's request.
+
+        Returns:
+            TaskFeatures: `raw_input` is the text itself; every other field
+            is filled by the matching `_extract_*` helper.
+        """
+        return TaskFeatures(
+            raw_input=user_input,
+            keywords=self._extract_keywords(user_input),
+            file_formats=self._extract_file_formats(user_input),
+            directory_paths=self._extract_directory_paths(user_input),
+            file_names=self._extract_file_names(user_input),
+            naming_patterns=self._extract_naming_patterns(user_input),
+            operations=self._extract_operations(user_input),
+            quantities=self._extract_quantities(user_input),
+            constraints=self._extract_constraints(user_input)
+        )
+
+    def _extract_keywords(self, text: str) -> Set[str]:
+        """Return lower-cased Chinese runs and ASCII words of length >= 2.
+
+        This is a plain regex split, not real word segmentation, so a run of
+        Chinese characters with no separators becomes a single keyword.
+        """
+        words = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', text.lower())
+        return {w for w in words if len(w) >= 2}
+
+    def _extract_file_formats(self, text: str) -> Set[str]:
+        """Return recognised file extensions found in text, dot included."""
+        matches = re.findall(self.FILE_FORMAT_PATTERN, text.lower())
+        return {f'.{m}' for m in matches}
+
+    def _extract_directory_paths(self, text: str) -> Set[str]:
+        """Return the directory paths mentioned in text, normalised.
+
+        Each match is stripped of surrounding whitespace and resolved with
+        Path.resolve(), so relative paths depend on the current working
+        directory. A match that cannot be resolved is kept as written.
+        """
+        normalized = set()
+        for match in re.findall(self.DIR_PATH_PATTERN, text):
+            # The pattern allows whitespace, so a match can end in a stray space
+            candidate = match.strip()
+            try:
+                normalized.add(str(Path(candidate).resolve()))
+            except (OSError, RuntimeError, ValueError):
+                # Unresolvable (e.g. OS-level failure): fall back to the raw text
+                normalized.add(candidate)
+        return normalized
+
+    def _extract_file_names(self, text: str) -> Set[str]:
+        """Return every token in text that looks like ``name.ext``."""
+        return set(re.findall(self.FILE_NAME_PATTERN, text))
+
+    def _extract_naming_patterns(self, text: str) -> List[str]:
+        """Return the naming rules whose trigger words occur in text.
+
+        Matching is case-insensitive so ASCII triggers such as "date" also
+        hit "Date"; results follow the order of NAMING_PATTERNS.
+        """
+        lowered = text.lower()
+        return [
+            rule for rule, triggers in self.NAMING_PATTERNS.items()
+            if any(trigger in lowered for trigger in triggers)
+        ]
+
+    def _extract_operations(self, text: str) -> Set[str]:
+        """Return the operations whose trigger words occur in text.
+
+        Matching is case-insensitive so ASCII triggers such as "copy" also
+        hit "Copy" or "COPY".
+        """
+        lowered = text.lower()
+        return {
+            operation for operation, triggers in self.OPERATION_KEYWORDS.items()
+            if any(trigger in lowered for trigger in triggers)
+        }
+
+    def _extract_quantities(self, text: str) -> List[str]:
+        """Return quantity expressions (e.g. "100个", "所有") in text order."""
+        return re.findall(self.QUANTITY_PATTERN, text)
+
+    def _extract_constraints(self, text: str) -> List[str]:
+        """Return the clauses of text that contain a condition keyword.
+
+        A clause is the text between Chinese punctuation marks (。，；). Each
+        clause is reported once, even when several keywords occur in it.
+        """
+        condition_keywords = ['如果', '当', '满足', '符合', '包含', '不包含', '大于', '小于', '等于']
+        constraints = []
+        for keyword in condition_keywords:
+            if keyword not in text:
+                continue
+            # Grab the whole clause surrounding the keyword
+            pattern = f'[^。，；]*{re.escape(keyword)}[^。，；]*'
+            for clause in re.findall(pattern, text):
+                if clause not in constraints:
+                    constraints.append(clause)
+        return constraints
+
+
+class TaskMatcher:
+    """Scores how similar two task descriptions are.
+
+    Both texts are run through TaskFeatureExtractor and compared dimension
+    by dimension; calculate_similarity documents the weights.
+    """
+
+    def __init__(self):
+        """Create the matcher together with its own feature extractor."""
+        self.extractor = TaskFeatureExtractor()
+
+    def calculate_similarity(
+        self,
+        current_input: str,
+        history_input: str
+    ) -> Tuple[float, List[TaskDifference]]:
+        """
+        Compute a weighted similarity between two tasks and list differences.
+
+        Dimension weights:
+            keywords 0.2 (score only, never reported as a difference)
+            file formats 0.15
+            directory paths 0.15
+            naming rules 0.15
+            operations 0.2
+            quantities 0.1
+            constraints 0.05
+
+        NOTE(review): a dimension that is empty on both sides scores 1.0, so
+        two inputs with no extracted parameters start from 0.8 regardless of
+        their wording -- confirm this baseline is intended.
+
+        Args:
+            current_input: Text of the task being requested now.
+            history_input: Text of a previously recorded task.
+
+        Returns:
+            (score clamped to the 0-1 range, differences in dimension order)
+        """
+        current = self.extractor.extract(current_input)
+        history = self.extractor.extract(history_input)
+
+        # (comparison helper, feature field, display name, importance, weight)
+        dimensions = (
+            (self._compare_sets, 'file_formats', '文件格式', 'high', 0.15),
+            (self._compare_sets, 'directory_paths', '目录路径', 'critical', 0.15),
+            (self._compare_lists, 'naming_patterns', '命名规则', 'high', 0.15),
+            (self._compare_sets, 'operations', '操作类型', 'critical', 0.2),
+            (self._compare_lists, 'quantities', '数量', 'medium', 0.1),
+            (self._compare_lists, 'constraints', '约束条件', 'medium', 0.05),
+        )
+
+        # Keywords feed the score only; they never produce a difference entry
+        weighted = [
+            0.2 * self._jaccard_similarity(current.keywords, history.keywords)
+        ]
+        differences = []
+        for compare, field, display_name, importance, weight in dimensions:
+            similarity, found = compare(
+                getattr(current, field),
+                getattr(history, field),
+                field,
+                display_name,
+                importance,
+            )
+            weighted.append(similarity * weight)
+            differences.extend(found)
+
+        # The weights add up to 1.0 only up to float rounding, so clamp the
+        # total to keep the documented 0-1 range
+        total_score = min(1.0, max(0.0, sum(weighted)))
+        return total_score, differences
+
+    def _jaccard_similarity(self, set1: Set, set2: Set) -> float:
+        """Return the Jaccard index |set1 & set2| / |set1 | set2|.
+
+        Two empty sets count as identical (1.0); when exactly one of them is
+        empty the result is 0.0.
+
+        Args:
+            set1: First feature set.
+            set2: Second feature set.
+
+        Returns:
+            A float between 0.0 and 1.0.
+        """
+        if not set1 and not set2:
+            return 1.0
+        # At least one set is non-empty here, so the union cannot be empty
+        return len(set1 & set2) / len(set1 | set2)
+
+    def _compare_sets(
+        self,
+        current: Set[str],
+        history: Set[str],
+        field: str,
+        display_name: str,
+        importance: str
+    ) -> Tuple[float, List[TaskDifference]]:
+        """Compare two feature sets.
+
+        Args:
+            current: Values extracted from the current task.
+            history: Values extracted from the historical task.
+            field: TaskFeatures attribute name, copied into the difference.
+            display_name: Category label, copied into the difference.
+            importance: Importance label, copied into the difference.
+
+        Returns:
+            (Jaccard similarity, list with at most one TaskDifference). The
+            difference shows only the values unique to each side, sorted,
+            or '(无)' for a side that has none.
+        """
+        similarity = self._jaccard_similarity(current, history)
+        only_current = current - history
+        only_history = history - current
+        if not (only_current or only_history):
+            return similarity, []
+
+        difference = TaskDifference(
+            category=display_name,
+            field=field,
+            current_value=', '.join(sorted(only_current)) if only_current else '(无)',
+            history_value=', '.join(sorted(only_history)) if only_history else '(无)',
+            importance=importance
+        )
+        return similarity, [difference]
+
+    def _compare_lists(
+        self,
+        current: List[str],
+        history: List[str],
+        field: str,
+        display_name: str,
+        importance: str
+    ) -> Tuple[float, List[TaskDifference]]:
+        """Compare two feature lists by the values they contain.
+
+        Order and repetition are ignored for both the score and the
+        difference check, so the two results always agree: a similarity of
+        1.0 never comes with a reported difference.
+
+        Args:
+            current: Values extracted from the current task.
+            history: Values extracted from the historical task.
+            field: TaskFeatures attribute name, copied into the difference.
+            display_name: Category label, copied into the difference.
+            importance: Importance label, copied into the difference.
+
+        Returns:
+            (Jaccard similarity, list with at most one TaskDifference). The
+            difference shows each side's full list in its original order, or
+            '(无)' for an empty side.
+        """
+        current_set = set(current)
+        history_set = set(history)
+        similarity = self._jaccard_similarity(current_set, history_set)
+        # A reordered or repeated value is not a real change in the task,
+        # so only differing value sets are reported
+        if current_set == history_set:
+            return similarity, []
+
+        difference = TaskDifference(
+            category=display_name,
+            field=field,
+            current_value=', '.join(current) if current else '(无)',
+            history_value=', '.join(history) if history else '(无)',
+            importance=importance
+        )
+        return similarity, [difference]
+
+
+# Process-wide singleton; stays None until get_task_matcher() is first called
+_matcher: Optional[TaskMatcher] = None
+
+
+def get_task_matcher() -> TaskMatcher:
+    """Return the shared TaskMatcher instance, creating it on first use."""
+    global _matcher
+    # Build the matcher only once; later calls reuse the same object
+    _matcher = TaskMatcher() if _matcher is None else _matcher
+    return _matcher
+