feat: refactor API key configuration and enhance application initialization
- Renamed `check_environment` to `check_api_key_configured` for clarity, simplifying the API key validation logic.
- Removed the blocking behavior of the API key check during application startup, allowing the app to run while providing a prompt for configuration.
- Updated `LocalAgentApp` to accept an `api_configured` parameter, enabling conditional messaging for API key setup.
- Enhanced the `SandboxRunner` to support backup management and improved execution result handling with detailed metrics.
- Integrated data governance strategies into the `HistoryManager`, ensuring compliance and improved data management.
- Added privacy settings and metrics tracking across various components to enhance user experience and application safety.
This commit is contained in:
380
history/task_features.py
Normal file
380
history/task_features.py
Normal file
@@ -0,0 +1,380 @@
|
||||
"""
|
||||
任务特征提取与匹配模块
|
||||
用于更精确的相似任务识别
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Set, Optional, Tuple
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@dataclass
class TaskFeatures:
    """Structured features extracted from one task description.

    Field order is part of the interface: instances may be built
    positionally, so new fields must only be appended.
    """

    # --- Basic information ---
    # The task text exactly as it was supplied.
    raw_input: str
    # Tokens extracted from the text.
    keywords: Set[str]

    # --- Key parameters ---
    # File formats, e.g. ".txt", ".csv", ".json".
    file_formats: Set[str]
    # Directory paths mentioned in the task.
    directory_paths: Set[str]
    # File names mentioned in the task.
    file_names: Set[str]
    # Naming rules, e.g. "按日期" (by date), "按序号" (by sequence number).
    naming_patterns: List[str]

    # --- Operation types ---
    # e.g. "批量重命名" (batch rename), "文件转换" (file conversion),
    # "数据处理" (data processing).
    operations: Set[str]

    # --- Quantity / range parameters ---
    # e.g. "100个" (100 items), "所有" (all).
    quantities: List[str]

    # --- Other constraints ---
    # Any additional constraint clauses found in the text.
    constraints: List[str]
|
||||
|
||||
|
||||
@dataclass
class TaskDifference:
    """One reported difference between the current task and a historical one."""

    # Human-readable category of the difference.
    category: str
    # Name of the feature field that differs.
    field: str
    # Value taken from the current task.
    current_value: str
    # Value taken from the historical task.
    history_value: str
    # Severity label: critical / high / medium / low.
    importance: str
|
||||
|
||||
|
||||
class TaskFeatureExtractor:
    """Extract structured features from a free-text task description.

    Every extractor is regex- or keyword-based; no external tokenizer is
    used. Keyword lookups are plain substring tests.
    """

    # Known file extensions. The single capture group holds the extension
    # without its leading dot.
    FILE_FORMAT_PATTERN = r'\.(txt|csv|json|xml|xlsx?|docx?|pdf|png|jpe?g|gif|mp[34]|avi|mov|zip|rar|7z|py|js|java|cpp|html?|css)'

    # Directory paths (Windows drive paths and Unix-style paths).
    # NOTE(review): the last alternative accepts anything that starts with
    # "." or "/", so a bare extension such as ".txt" also matches here —
    # confirm whether that is intended.
    DIR_PATH_PATTERN = r'(?:[a-zA-Z]:\\[\w\\\s\u4e00-\u9fa5.-]+|/[\w/\s\u4e00-\u9fa5.-]+|[./][\w/\\\s\u4e00-\u9fa5.-]+)'

    # File names: a stem followed by a dot and an alphanumeric extension.
    FILE_NAME_PATTERN = r'[\w\u4e00-\u9fa5.-]+\.[a-zA-Z0-9]+'

    # Quantities: a number followed by a Chinese measure word, or one of the
    # "all / everything / batch" words.
    QUANTITY_PATTERN = r'(\d+\s*[个张份条篇页行列]|所有|全部|批量)'

    # Canonical operation name -> substrings that trigger it.
    OPERATION_KEYWORDS = {
        '重命名': ['重命名', '改名', '命名', '更名'],
        '转换': ['转换', '转为', '转成', '变成', '改成'],
        '批量处理': ['批量', '批处理', '一次性'],
        '复制': ['复制', '拷贝', 'copy'],
        '移动': ['移动', '转移', 'move'],
        '删除': ['删除', '清理', '移除'],
        '合并': ['合并', '整合', '汇总'],
        '分割': ['分割', '拆分', '切分'],
        '压缩': ['压缩', '打包'],
        '解压': ['解压', '解包', '提取'],
        '排序': ['排序', '排列'],
        '筛选': ['筛选', '过滤', '查找'],
        '统计': ['统计', '计数', '汇总'],
        '生成': ['生成', '创建', '制作'],
    }

    # Canonical naming-rule name -> substrings that trigger it.
    NAMING_PATTERNS = {
        '按日期': ['日期', '时间', 'date', 'time'],
        '按序号': ['序号', '编号', '数字', '顺序'],
        '按前缀': ['前缀', '开头'],
        '按后缀': ['后缀', '结尾'],
        '按内容': ['内容', '根据'],
    }

    def extract(self, user_input: str) -> "TaskFeatures":
        """Extract all structured features from a task description.

        Args:
            user_input: The raw task text entered by the user.

        Returns:
            A ``TaskFeatures`` instance populated by the private extractors
            of this class; ``raw_input`` is ``user_input`` unchanged.
        """
        return TaskFeatures(
            raw_input=user_input,
            keywords=self._extract_keywords(user_input),
            file_formats=self._extract_file_formats(user_input),
            directory_paths=self._extract_directory_paths(user_input),
            file_names=self._extract_file_names(user_input),
            naming_patterns=self._extract_naming_patterns(user_input),
            operations=self._extract_operations(user_input),
            quantities=self._extract_quantities(user_input),
            constraints=self._extract_constraints(user_input),
        )

    def _extract_keywords(self, text: str) -> Set[str]:
        """Return lowercase keyword tokens of at least two characters.

        A token is a maximal run of CJK characters or of ASCII letters.
        """
        words = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', text.lower())
        return {word for word in words if len(word) >= 2}

    def _extract_file_formats(self, text: str) -> Set[str]:
        """Return the known file extensions in ``text``, lowercase with a dot."""
        return {f'.{ext}' for ext in re.findall(self.FILE_FORMAT_PATTERN, text.lower())}

    def _extract_directory_paths(self, text: str) -> Set[str]:
        """Return the directory-like paths in ``text`` in normalized form.

        Each match is passed through ``Path.resolve()``, so relative matches
        become absolute with respect to the current working directory. A
        match that cannot be resolved is kept verbatim.
        """
        normalized = set()
        for path in re.findall(self.DIR_PATH_PATTERN, text):
            try:
                normalized.add(str(Path(path).resolve()))
            except (OSError, RuntimeError, ValueError):
                # Best effort: keep the raw text rather than dropping the
                # path. Only path-resolution failures are caught, so that
                # KeyboardInterrupt / SystemExit still propagate.
                normalized.add(path)
        return normalized

    def _extract_file_names(self, text: str) -> Set[str]:
        """Return every substring of ``text`` that looks like ``name.ext``."""
        return set(re.findall(self.FILE_NAME_PATTERN, text))

    def _extract_naming_patterns(self, text: str) -> List[str]:
        """Return the naming rules whose trigger keywords occur in ``text``.

        Matching is case-insensitive for the ASCII keywords, and the result
        follows the declaration order of ``NAMING_PATTERNS``.
        """
        # The keyword table is lowercase, so lowercase the text to match.
        lowered = text.lower()
        return [
            pattern_name
            for pattern_name, keywords in self.NAMING_PATTERNS.items()
            if any(kw in lowered for kw in keywords)
        ]

    def _extract_operations(self, text: str) -> Set[str]:
        """Return the operation types whose trigger keywords occur in ``text``.

        Matching is case-insensitive for the ASCII keywords.
        """
        # The keyword table is lowercase, so lowercase the text to match.
        lowered = text.lower()
        return {
            op_name
            for op_name, keywords in self.OPERATION_KEYWORDS.items()
            if any(kw in lowered for kw in keywords)
        }

    def _extract_quantities(self, text: str) -> List[str]:
        """Return quantity expressions in order of appearance in ``text``."""
        return re.findall(self.QUANTITY_PATTERN, text)

    def _extract_constraints(self, text: str) -> List[str]:
        """Return the clauses of ``text`` that contain a condition keyword.

        A clause is the run of characters around a keyword that is bounded by
        the full-width punctuation marks in the pattern below. Each distinct
        clause is returned once, even when several keywords occur in it.
        """
        condition_keywords = ['如果', '当', '满足', '符合', '包含', '不包含', '大于', '小于', '等于']

        constraints = []
        seen = set()
        for keyword in condition_keywords:
            if keyword not in text:
                continue
            pattern = f'[^。,;]*{keyword}[^。,;]*'
            for fragment in re.findall(pattern, text):
                # Overlapping keywords (e.g. '包含' inside '不包含') would
                # otherwise add the same clause several times.
                if fragment not in seen:
                    seen.add(fragment)
                    constraints.append(fragment)
        return constraints
|
||||
|
||||
|
||||
class TaskMatcher:
    """Score how similar two task descriptions are and list their differences."""

    # Weight of the plain keyword overlap, which is scored but never reported
    # as a difference.
    _KEYWORD_WEIGHT = 0.2

    # Feature dimensions that are both scored and reported, in reporting order:
    # (field name, display name, importance, weight, comparison method).
    # Together with _KEYWORD_WEIGHT the weights add up to 1.
    _COMPARED_FIELDS = (
        ('file_formats', '文件格式', 'high', 0.15, '_compare_sets'),
        ('directory_paths', '目录路径', 'critical', 0.15, '_compare_sets'),
        ('naming_patterns', '命名规则', 'high', 0.15, '_compare_lists'),
        ('operations', '操作类型', 'critical', 0.2, '_compare_sets'),
        ('quantities', '数量', 'medium', 0.1, '_compare_lists'),
        ('constraints', '约束条件', 'medium', 0.05, '_compare_lists'),
    )

    def __init__(self):
        """Create a matcher with its own feature extractor."""
        self.extractor = TaskFeatureExtractor()

    def calculate_similarity(
        self,
        current_input: str,
        history_input: str
    ) -> Tuple[float, List["TaskDifference"]]:
        """Compute the weighted similarity of two tasks and their differences.

        Features are extracted from both texts, each dimension is scored with
        Jaccard similarity, and the scores are combined using the weights in
        ``_KEYWORD_WEIGHT`` and ``_COMPARED_FIELDS``.

        Args:
            current_input: Text of the current task.
            history_input: Text of the historical task.

        Returns:
            A ``(score, differences)`` tuple. ``score`` is the weighted sum of
            the per-dimension similarities (nominally between 0 and 1);
            ``differences`` holds at most one entry per compared dimension,
            in the order of ``_COMPARED_FIELDS``.
        """
        current_features = self.extractor.extract(current_input)
        history_features = self.extractor.extract(history_input)

        differences = []
        keyword_sim = self._jaccard_similarity(
            current_features.keywords,
            history_features.keywords
        )
        weighted_scores = [keyword_sim * self._KEYWORD_WEIGHT]

        for field, display_name, importance, weight, method_name in self._COMPARED_FIELDS:
            compare = getattr(self, method_name)
            similarity, field_diffs = compare(
                getattr(current_features, field),
                getattr(history_features, field),
                field,
                display_name,
                importance
            )
            weighted_scores.append(similarity * weight)
            differences.extend(field_diffs)

        return sum(weighted_scores), differences

    def _jaccard_similarity(self, set1: Set, set2: Set) -> float:
        """Return the Jaccard similarity ``|A & B| / |A | B|`` of two sets.

        Two empty sets count as identical (1.0); exactly one empty set
        counts as completely different (0.0).
        """
        if not set1 and not set2:
            return 1.0
        if not set1 or not set2:
            return 0.0
        return len(set1 & set2) / len(set1 | set2)

    def _compare_sets(
        self,
        current: Set[str],
        history: Set[str],
        field: str,
        display_name: str,
        importance: str
    ) -> Tuple[float, List["TaskDifference"]]:
        """Compare two sets, returning their similarity and any difference.

        Args:
            current: Values from the current task.
            history: Values from the historical task.
            field: Feature field name stored on the difference.
            display_name: Category label stored on the difference.
            importance: Importance label stored on the difference.

        Returns:
            ``(similarity, differences)``. ``differences`` is empty when the
            sets are equal; otherwise it holds one ``TaskDifference`` listing
            only the elements unique to each side, sorted.
        """
        similarity = self._jaccard_similarity(current, history)

        only_current = current - history
        only_history = history - current
        if not only_current and not only_history:
            return similarity, []

        difference = TaskDifference(
            category=display_name,
            field=field,
            current_value=', '.join(sorted(only_current)) if only_current else '(无)',
            history_value=', '.join(sorted(only_history)) if only_history else '(无)',
            importance=importance
        )
        return similarity, [difference]

    def _compare_lists(
        self,
        current: List[str],
        history: List[str],
        field: str,
        display_name: str,
        importance: str
    ) -> Tuple[float, List["TaskDifference"]]:
        """Compare two lists as sets, returning similarity and any difference.

        Order and duplicates are ignored both for the similarity score and
        for deciding whether a difference exists, so the two results always
        agree (a similarity of 1.0 never comes with a reported difference).

        Args:
            current: Values from the current task.
            history: Values from the historical task.
            field: Feature field name stored on the difference.
            display_name: Category label stored on the difference.
            importance: Importance label stored on the difference.

        Returns:
            ``(similarity, differences)``. When the value sets differ,
            ``differences`` holds one ``TaskDifference`` showing both lists
            in full, in their original order.
        """
        current_set = set(current)
        history_set = set(history)
        similarity = self._jaccard_similarity(current_set, history_set)

        if current_set == history_set:
            return similarity, []

        difference = TaskDifference(
            category=display_name,
            field=field,
            current_value=', '.join(current) if current else '(无)',
            history_value=', '.join(history) if history else '(无)',
            importance=importance
        )
        return similarity, [difference]
|
||||
|
||||
|
||||
# Module-level singleton, created on first use by get_task_matcher().
_matcher: Optional[TaskMatcher] = None


def get_task_matcher() -> TaskMatcher:
    """Return the shared TaskMatcher, creating it on the first call."""
    global _matcher
    if _matcher is not None:
        return _matcher
    _matcher = TaskMatcher()
    return _matcher
|
||||
|
||||
Reference in New Issue
Block a user