feat: refactor API key configuration and enhance application initialization
- Renamed `check_environment` to `check_api_key_configured` for clarity, simplifying the API key validation logic.
- Removed the blocking behavior of the API key check during application startup, allowing the app to run while providing a prompt for configuration.
- Updated `LocalAgentApp` to accept an `api_configured` parameter, enabling conditional messaging for API key setup.
- Enhanced the `SandboxRunner` to support backup management and improved execution-result handling with detailed metrics.
- Integrated data governance strategies into the `HistoryManager`, ensuring compliance and improved data management.
- Added privacy settings and metrics tracking across various components to enhance user experience and application safety.
This commit is contained in:
410
history/data_governance.py
Normal file
410
history/data_governance.py
Normal file
@@ -0,0 +1,410 @@
|
||||
"""
|
||||
数据治理策略模块
|
||||
实现数据分级保存、保留期管理、归档和清理策略
|
||||
"""
|
||||
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set
|
||||
from dataclasses import dataclass, asdict
|
||||
from enum import Enum
|
||||
|
||||
from history.data_sanitizer import get_sanitizer, SensitiveType
|
||||
|
||||
|
||||
class DataLevel(Enum):
    """Storage level assigned to a history record by the governance policy."""

    FULL = "full"            # stored verbatim, nothing masked
    SANITIZED = "sanitized"  # stored with sensitive spans masked
    MINIMAL = "minimal"      # metadata only, content dropped
    ARCHIVED = "archived"    # moved to the archive directory
|
||||
|
||||
|
||||
class RetentionPolicy(Enum):
    """Retention windows, expressed in days (-1 means keep forever)."""

    SHORT = 7
    MEDIUM = 30
    LONG = 90
    PERMANENT = -1
|
||||
|
||||
|
||||
@dataclass
class DataClassification:
    """Outcome of classifying a single record."""

    level: DataLevel            # storage level chosen for the record
    retention_days: int         # retention window, in days
    sensitivity_score: float    # weighted average sensitivity, 0.0-1.0
    sensitive_fields: Set[str]  # names of fields where sensitive data was found
    reason: str                 # human-readable explanation of the decision
|
||||
|
||||
|
||||
@dataclass
class GovernanceMetrics:
    """Aggregate governance statistics over the stored history."""

    total_records: int                   # number of records inspected
    full_records: int                    # records kept verbatim
    sanitized_records: int               # records kept with masking
    minimal_records: int                 # records reduced to metadata
    archived_records: int                # records moved to the archive
    total_size_bytes: int                # approximate serialized footprint
    sensitive_field_hits: Dict[str, int] # per-field count of sensitive findings
    expired_records: int                 # records past their retention window
    last_cleanup_time: str               # ISO timestamp of the last sweep
|
||||
|
||||
|
||||
class DataGovernancePolicy:
    """Data-governance policy.

    Classifies task-history records by sensitivity, decides how much of
    each record to keep (full / sanitized / metadata-only), and manages
    the record life cycle: expiry, downgrade, archiving and deletion.
    """

    # Per-field sensitivity weights: how likely each field is to carry
    # secrets or personal data.
    FIELD_SENSITIVITY = {
        'user_input': 0.5,       # free-form user text may contain secrets
        'code': 0.7,             # code often embeds paths or keys
        'stdout': 0.6,           # program output may echo sensitive data
        'stderr': 0.6,           # tracebacks frequently leak paths
        'execution_plan': 0.3,   # plans are comparatively safe
        'log_path': 0.4,         # local filesystem path
    }

    # Lower bound of the weighted-average sensitivity for each level.
    LEVEL_THRESHOLDS = {
        DataLevel.FULL: 0.0,        # score < 0.3  -> keep verbatim
        DataLevel.SANITIZED: 0.3,   # 0.3 <= score < 0.7 -> mask
        DataLevel.MINIMAL: 0.7,     # score >= 0.7 -> metadata only
    }

    # Retention window (days) per storage level.
    RETENTION_CONFIG = {
        DataLevel.FULL: RetentionPolicy.LONG.value,         # 90 days
        DataLevel.SANITIZED: RetentionPolicy.MEDIUM.value,  # 30 days
        DataLevel.MINIMAL: RetentionPolicy.SHORT.value,     # 7 days
    }

    def __init__(self, workspace_path: Path):
        """Create a policy rooted at *workspace_path*.

        Args:
            workspace_path: Directory holding history data; the metrics
                file and the ``archive`` subdirectory live here.
        """
        self.workspace = workspace_path
        self.sanitizer = get_sanitizer()
        self.metrics_file = workspace_path / "governance_metrics.json"
        self.archive_dir = workspace_path / "archive"
        # parents=True: a missing workspace directory must not crash startup
        # (the original mkdir raised FileNotFoundError in that case).
        self.archive_dir.mkdir(parents=True, exist_ok=True)

    def classify_record(self, record_data: Dict) -> DataClassification:
        """Classify one record by weighted field sensitivity.

        Args:
            record_data: Record data dictionary.

        Returns:
            Classification result (level, retention, score, flagged fields).
        """
        sensitive_fields: Set[str] = set()
        total_sensitivity = 0.0
        field_count = 0

        # Score each configured field that is present and non-empty.
        for field, weight in self.FIELD_SENSITIVITY.items():
            value = record_data.get(field)
            if not value:
                continue
            field_score = self.sanitizer.get_sensitivity_score(str(value))
            if field_score > 0.3:  # sensitive content detected in this field
                sensitive_fields.add(field)
            total_sensitivity += field_score * weight
            field_count += 1

        # Weighted average over the fields that were actually scored.
        avg_sensitivity = total_sensitivity / field_count if field_count else 0.0

        if avg_sensitivity >= self.LEVEL_THRESHOLDS[DataLevel.MINIMAL]:
            level = DataLevel.MINIMAL
            reason = f"高敏感度({avg_sensitivity:.2f}),仅保留元数据"
        elif avg_sensitivity >= self.LEVEL_THRESHOLDS[DataLevel.SANITIZED]:
            level = DataLevel.SANITIZED
            reason = f"中等敏感度({avg_sensitivity:.2f}),脱敏保存"
        else:
            level = DataLevel.FULL
            reason = f"低敏感度({avg_sensitivity:.2f}),完整保存"

        return DataClassification(
            level=level,
            retention_days=self.RETENTION_CONFIG[level],
            sensitivity_score=avg_sensitivity,
            sensitive_fields=sensitive_fields,
            reason=reason,
        )

    def apply_policy(self, record_data: Dict) -> Dict:
        """Classify *record_data* and return a governed copy of it.

        Args:
            record_data: Original record data.

        Returns:
            A copy with `_governance` metadata attached and, depending on
            the level, content minimized or sensitive fields masked.
        """
        classification = self.classify_record(record_data)
        # One timestamp for both fields so classified_at/expires_at agree.
        now = datetime.now()

        result = record_data.copy()
        result['_governance'] = {
            'level': classification.level.value,
            'retention_days': classification.retention_days,
            'sensitivity_score': classification.sensitivity_score,
            'sensitive_fields': list(classification.sensitive_fields),
            'classified_at': now.isoformat(),
            'expires_at': (now + timedelta(days=classification.retention_days)).isoformat(),
        }

        if classification.level == DataLevel.MINIMAL:
            # Metadata only.
            result = self._minimize_record(result)
        elif classification.level == DataLevel.SANITIZED:
            # Mask the flagged fields.
            result = self._sanitize_record(result, classification.sensitive_fields)
        # FULL records are stored as-is.

        return result

    def _minimize_record(self, record: Dict) -> Dict:
        """Reduce *record* to metadata only.

        Args:
            record: Original record.

        Returns:
            Minimized record: metadata fields plus fixed deletion markers.
        """
        keep_fields = {
            'task_id', 'timestamp', 'intent_label', 'intent_confidence',
            'success', 'duration_ms', 'task_summary', '_governance',
        }
        minimal = {k: v for k, v in record.items() if k in keep_fields}

        # Replace content fields with fixed markers.
        for dropped in ('user_input', 'code', 'stdout', 'stderr'):
            minimal[dropped] = '[已删除-高敏感]'
        # Keep only a short preview of the plan. ``or ''`` guards against an
        # explicit None value, which the previous code sliced and crashed on.
        plan = str(record.get('execution_plan') or '')
        minimal['execution_plan'] = plan[:100] + '...'

        return minimal

    def _sanitize_record(self, record: Dict, sensitive_fields: Set[str]) -> Dict:
        """Mask sensitive spans in the given fields of *record*.

        Args:
            record: Original record.
            sensitive_fields: Names of fields to sanitize.

        Returns:
            Copy of the record with masked fields and `_sanitization` info.
        """
        result = record.copy()

        for field in sensitive_fields:
            if not result.get(field):
                continue
            sanitized, matches = self.sanitizer.sanitize(str(result[field]))
            result[field] = sanitized
            # Record what was masked so consumers can tell data was altered.
            result.setdefault('_sanitization', {})[field] = {
                'masked_count': len(matches),
                'types': list(set(m.type.value for m in matches)),
            }

        return result

    def check_expiration(self, record: Dict) -> bool:
        """Return True if the record's retention window has elapsed.

        Records without governance metadata (legacy data) or with a
        malformed expiry timestamp never count as expired.

        Args:
            record: Record data.

        Returns:
            Whether the record is expired.
        """
        # ``or {}`` also covers an explicit None value for _governance.
        governance = record.get('_governance') or {}
        expires_at = governance.get('expires_at')
        if not expires_at:
            return False

        try:
            return datetime.now() > datetime.fromisoformat(expires_at)
        except (ValueError, TypeError):
            # Malformed timestamp: err on the side of keeping the record.
            return False

    def archive_record(self, record: Dict) -> Path:
        """Write *record* to the archive directory and mark it archived.

        Args:
            record: Record data (mutated in place to reflect archiving).

        Returns:
            Path of the archive file that was written.
        """
        task_id = record.get('task_id', 'unknown')
        timestamp = record.get('timestamp', datetime.now().strftime('%Y%m%d_%H%M%S'))
        archive_file = self.archive_dir / f"{task_id}_{timestamp}.json"

        # Legacy records may carry no governance metadata at all; the
        # previous code KeyError'd here in that case.
        if not record.get('_governance'):
            record['_governance'] = {}
        record['_governance']['level'] = DataLevel.ARCHIVED.value
        record['_governance']['archived_at'] = datetime.now().isoformat()

        with open(archive_file, 'w', encoding='utf-8') as f:
            json.dump(record, f, ensure_ascii=False, indent=2)

        return archive_file

    def cleanup_expired(self, records: List[Dict]) -> tuple[List[Dict], int, int]:
        """Apply life-cycle transitions to expired records.

        Expired FULL records are downgraded to SANITIZED with a fresh
        retention window; expired SANITIZED records are archived; any
        other expired record (MINIMAL or unknown level) is dropped.

        Args:
            records: Record dictionaries to sweep.

        Returns:
            (records to keep, number archived, number deleted)
        """
        kept_records: List[Dict] = []
        archived_count = 0
        deleted_count = 0

        for record in records:
            if not self.check_expiration(record):
                kept_records.append(record)
                continue

            # check_expiration() returning True guarantees _governance is a
            # dict here; ``or {}`` is belt-and-braces for the level lookup.
            level = (record.get('_governance') or {}).get('level')

            if level == DataLevel.FULL.value:
                # Downgrade: keep the record, but masked and re-dated.
                gov = record['_governance']
                gov['level'] = DataLevel.SANITIZED.value
                gov['retention_days'] = RetentionPolicy.MEDIUM.value
                gov['expires_at'] = (
                    datetime.now() + timedelta(days=RetentionPolicy.MEDIUM.value)
                ).isoformat()

                sensitive_fields = set(gov.get('sensitive_fields', []))
                kept_records.append(self._sanitize_record(record, sensitive_fields))

            elif level == DataLevel.SANITIZED.value:
                # Sanitized data moves to the archive.
                self.archive_record(record)
                archived_count += 1

            else:
                # Minimal (or unclassified) expired data is simply dropped.
                deleted_count += 1

        return kept_records, archived_count, deleted_count

    def collect_metrics(self, records: List[Dict]) -> GovernanceMetrics:
        """Aggregate governance statistics over *records*.

        Args:
            records: Record list.

        Returns:
            Populated metrics object.
        """
        metrics = GovernanceMetrics(
            total_records=len(records),
            full_records=0,
            sanitized_records=0,
            minimal_records=0,
            archived_records=0,
            total_size_bytes=0,
            sensitive_field_hits={},
            expired_records=0,
            last_cleanup_time=datetime.now().isoformat(),
        )

        # Map stored level strings to the counter attribute they bump.
        level_attr = {
            DataLevel.FULL.value: 'full_records',
            DataLevel.SANITIZED.value: 'sanitized_records',
            DataLevel.MINIMAL.value: 'minimal_records',
            DataLevel.ARCHIVED.value: 'archived_records',
        }

        for record in records:
            # ``or {}`` is the fix for legacy records that store
            # _governance as an explicit None: dict.get(k, {}) returns the
            # stored None, and the previous code crashed on .get('level').
            governance = record.get('_governance') or {}

            attr = level_attr.get(governance.get('level'))
            if attr:
                setattr(metrics, attr, getattr(metrics, attr) + 1)

            # Count sensitive-field findings.
            for field in governance.get('sensitive_fields', []):
                metrics.sensitive_field_hits[field] = (
                    metrics.sensitive_field_hits.get(field, 0) + 1
                )

            if self.check_expiration(record):
                metrics.expired_records += 1

            # Approximate the on-disk footprint via the JSON serialization.
            metrics.total_size_bytes += len(json.dumps(record, ensure_ascii=False))

        return metrics

    def save_metrics(self, metrics: GovernanceMetrics):
        """Persist *metrics* to the workspace as JSON."""
        with open(self.metrics_file, 'w', encoding='utf-8') as f:
            json.dump(asdict(metrics), f, ensure_ascii=False, indent=2)

    def load_metrics(self) -> Optional[GovernanceMetrics]:
        """Load the last saved metrics, or None if missing or unreadable."""
        if not self.metrics_file.exists():
            return None

        try:
            with open(self.metrics_file, 'r', encoding='utf-8') as f:
                return GovernanceMetrics(**json.load(f))
        except Exception as e:
            print(f"[警告] 加载度量指标失败: {e}")
            return None
|
||||
|
||||
|
||||
# Process-wide singleton instance, created lazily.
_policy: Optional[DataGovernancePolicy] = None


def get_governance_policy(workspace_path: Path) -> DataGovernancePolicy:
    """Return the shared governance-policy singleton.

    NOTE(review): *workspace_path* is only honoured on the first call;
    subsequent calls reuse the existing instance regardless of the path.
    """
    global _policy
    # An instance is always truthy, so ``or`` only constructs once.
    _policy = _policy or DataGovernancePolicy(workspace_path)
    return _policy
|
||||
|
||||
311
history/data_sanitizer.py
Normal file
311
history/data_sanitizer.py
Normal file
@@ -0,0 +1,311 @@
|
||||
"""
|
||||
数据脱敏模块
|
||||
对历史记录中的敏感信息进行识别和脱敏处理
|
||||
"""
|
||||
|
||||
import re
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional, Set, Tuple
|
||||
|
||||
|
||||
class SensitiveType(Enum):
    """Categories of sensitive information the sanitizer can detect."""

    FILE_PATH = "file_path"        # filesystem path
    IP_ADDRESS = "ip_address"      # IPv4 / IPv6 address
    EMAIL = "email"                # e-mail address
    PHONE = "phone"                # phone number
    API_KEY = "api_key"            # API key
    PASSWORD = "password"          # password assignment
    TOKEN = "token"                # auth token
    DATABASE_URI = "database_uri"  # database connection string
    CREDIT_CARD = "credit_card"    # credit-card number
    ID_CARD = "id_card"            # national ID number
|
||||
|
||||
|
||||
@dataclass
class SensitiveMatch:
    """One sensitive span found in a text."""

    type: SensitiveType  # detected category
    value: str           # original matched text
    start: int           # start offset in the source text
    end: int             # end offset (exclusive) in the source text
    masked_value: str    # replacement text to substitute for the match
|
||||
|
||||
|
||||
class DataSanitizer:
    """Detects and masks sensitive information in free-form text.

    Detection is regex based, covering credentials, network identifiers,
    personal data and file paths; the noisiest pattern classes get an
    extra heuristic validator to reduce false positives.
    """

    # Regex patterns per sensitive-data category.
    PATTERNS = {
        SensitiveType.FILE_PATH: [
            r'[A-Za-z]:\\(?:[^\\/:*?"<>|\r\n]+\\)*[^\\/:*?"<>|\r\n]*',  # Windows path
            r'/(?:[^/\0]+/)*[^/\0]*',  # Unix path (validated separately, see below)
        ],
        SensitiveType.IP_ADDRESS: [
            r'\b(?:\d{1,3}\.){3}\d{1,3}\b',  # IPv4
            r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b',  # full-form IPv6
        ],
        SensitiveType.EMAIL: [
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
        ],
        SensitiveType.PHONE: [
            r'\b1[3-9]\d{9}\b',  # Chinese mobile number
            r'\b\d{3}-\d{4}-\d{4}\b',  # US-style number
        ],
        SensitiveType.API_KEY: [
            r'\b[A-Za-z0-9_-]{32,}\b',  # generic long token
            r'sk-[A-Za-z0-9]{48}',  # OpenAI-style key
            r'AIza[0-9A-Za-z_-]{35}',  # Google API key
        ],
        SensitiveType.PASSWORD: [
            r'(?i)password\s*[:=]\s*["\']?([^"\'\s]+)["\']?',
            r'(?i)pwd\s*[:=]\s*["\']?([^"\'\s]+)["\']?',
        ],
        SensitiveType.TOKEN: [
            r'(?i)token\s*[:=]\s*["\']?([A-Za-z0-9_.-]+)["\']?',
            r'(?i)bearer\s+([A-Za-z0-9_.-]+)',
        ],
        SensitiveType.DATABASE_URI: [
            r'(?i)(mysql|postgresql|mongodb|redis)://[^\s]+',
        ],
        SensitiveType.CREDIT_CARD: [
            r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b',
        ],
        SensitiveType.ID_CARD: [
            r'\b\d{17}[\dXx]\b',  # Chinese national ID
        ],
    }

    # Categories whose raw regex is too permissive; each maps to the name
    # of an instance method that confirms a candidate match.
    SPECIAL_VALIDATION = {
        SensitiveType.FILE_PATH: '_validate_file_path',
        SensitiveType.API_KEY: '_validate_api_key',
    }

    # Per-category weights used when scoring a whole text; hoisted out of
    # get_sensitivity_score so the dict is built once, not per call.
    SCORE_WEIGHTS = {
        SensitiveType.PASSWORD: 1.0,
        SensitiveType.API_KEY: 1.0,
        SensitiveType.TOKEN: 0.9,
        SensitiveType.DATABASE_URI: 0.9,
        SensitiveType.CREDIT_CARD: 1.0,
        SensitiveType.ID_CARD: 1.0,
        SensitiveType.EMAIL: 0.6,
        SensitiveType.PHONE: 0.6,
        SensitiveType.IP_ADDRESS: 0.5,
        SensitiveType.FILE_PATH: 0.3,
    }

    def __init__(self, enabled_types: Optional[Set[SensitiveType]] = None):
        """Initialize the sanitizer.

        Args:
            enabled_types: Categories to detect. ``None`` enables every
                category; an explicit empty set disables detection.
        """
        # Test against None explicitly: the previous ``enabled_types or
        # set(SensitiveType)`` silently re-enabled every category whenever
        # a caller passed an (falsy) empty set. Copy to avoid aliasing the
        # caller's set.
        if enabled_types is None:
            self.enabled_types = set(SensitiveType)
        else:
            self.enabled_types = set(enabled_types)
        self._compile_patterns()

    def _compile_patterns(self):
        """Pre-compile the regexes for every enabled category."""
        self.compiled_patterns: Dict[SensitiveType, List[re.Pattern]] = {
            sens_type: [re.compile(p) for p in self.PATTERNS[sens_type]]
            for sens_type in self.enabled_types
            if sens_type in self.PATTERNS
        }

    def _validate_file_path(self, text: str) -> bool:
        """Heuristic filter for path matches (the path regex is very broad)."""
        if len(text) < 5:  # too short to be a meaningful path
            return False

        # Require at least one typical path marker.
        path_indicators = ['\\', '/', '.py', '.txt', '.json', '.log', 'Users', 'Program']
        return any(indicator in text for indicator in path_indicators)

    def _validate_api_key(self, text: str) -> bool:
        """Heuristic filter for API-key matches: mixed alnum, >= 20 chars."""
        has_digit = any(c.isdigit() for c in text)
        has_alpha = any(c.isalpha() for c in text)
        # (The previous version also computed a ``has_special`` flag that
        # was never used; removed.)
        return has_digit and has_alpha and len(text) >= 20

    def find_sensitive_data(self, text: str) -> List[SensitiveMatch]:
        """Scan *text* and return its non-overlapping sensitive matches.

        Args:
            text: Text to inspect.

        Returns:
            Matches sorted by position, with overlaps resolved in favour
            of the higher-priority category.
        """
        matches: List[SensitiveMatch] = []

        for sens_type, patterns in self.compiled_patterns.items():
            # Hoist the validator lookup out of the per-match loop.
            validator_name = self.SPECIAL_VALIDATION.get(sens_type)
            validator = getattr(self, validator_name) if validator_name else None

            for pattern in patterns:
                for match in pattern.finditer(text):
                    value = match.group(0)

                    # Drop candidates the category validator rejects.
                    if validator is not None and not validator(value):
                        continue

                    matches.append(SensitiveMatch(
                        type=sens_type,
                        value=value,
                        start=match.start(),
                        end=match.end(),
                        masked_value=self._mask_value(value, sens_type),
                    ))

        matches.sort(key=lambda m: m.start)
        return self._remove_overlaps(matches)

    def _remove_overlaps(self, matches: List[SensitiveMatch]) -> List[SensitiveMatch]:
        """Keep a non-overlapping subset, preferring higher-priority types."""
        if not matches:
            return []

        # Lower number = higher priority when two matches start together.
        priority = {
            SensitiveType.PASSWORD: 1,
            SensitiveType.API_KEY: 2,
            SensitiveType.TOKEN: 3,
            SensitiveType.DATABASE_URI: 4,
            SensitiveType.CREDIT_CARD: 5,
            SensitiveType.ID_CARD: 6,
            SensitiveType.EMAIL: 7,
            SensitiveType.PHONE: 8,
            SensitiveType.IP_ADDRESS: 9,
            SensitiveType.FILE_PATH: 10,
        }

        result: List[SensitiveMatch] = []
        last_end = -1

        for match in sorted(matches, key=lambda m: (m.start, priority.get(m.type, 99))):
            if match.start >= last_end:
                result.append(match)
                last_end = match.end

        return result

    def _mask_value(self, value: str, sens_type: SensitiveType) -> str:
        """Build the replacement text for one matched value.

        Falls through to a generic ``[TYPE_MASKED]`` marker whenever the
        type-specific format does not apply (e.g. IPv6, short phone).

        Args:
            value: Original matched text.
            sens_type: Category of the match.

        Returns:
            Masked replacement string.
        """
        if sens_type == SensitiveType.FILE_PATH:
            # Keep the file name, hide the directories.
            parts = value.replace('\\', '/').split('/')
            if len(parts) > 1:
                return f"***/{parts[-1]}"
            return "***"

        elif sens_type == SensitiveType.EMAIL:
            # Keep the first and last character of the local part.
            parts = value.split('@')
            if len(parts) == 2:
                name, domain = parts
                masked_name = name[0] + '***' + name[-1] if len(name) > 2 else '***'
                return f"{masked_name}@{domain}"

        elif sens_type == SensitiveType.PHONE:
            # Keep the first 3 and last 4 digits.
            if len(value) >= 11:
                return value[:3] + '****' + value[-4:]

        elif sens_type == SensitiveType.IP_ADDRESS:
            # Keep the first two octets (IPv4 only).
            parts = value.split('.')
            if len(parts) == 4:
                return f"{parts[0]}.{parts[1]}.*.*"

        elif sens_type == SensitiveType.CREDIT_CARD:
            # Keep only the last 4 digits.
            digits = re.sub(r'[\s-]', '', value)
            return '**** **** **** ' + digits[-4:]

        elif sens_type == SensitiveType.ID_CARD:
            # Keep the first 6 and last 4 characters.
            return value[:6] + '********' + value[-4:]

        # Default: hide the value entirely.
        return f"[{sens_type.value.upper()}_MASKED]"

    def sanitize(self, text: str) -> Tuple[str, List[SensitiveMatch]]:
        """Mask all detected sensitive spans in *text*.

        Args:
            text: Original text.

        Returns:
            (masked text, list of matches that were replaced)
        """
        matches = self.find_sensitive_data(text)

        if not matches:
            return text, []

        # Replace from the end so earlier offsets stay valid.
        result = text
        for match in reversed(matches):
            result = result[:match.start] + match.masked_value + result[match.end:]

        return result, matches

    def get_sensitivity_score(self, text: str) -> float:
        """Score how sensitive *text* is, normalized to [0, 1].

        Args:
            text: Text to evaluate.

        Returns:
            0.0 for clean text; otherwise the weighted match mass capped
            at 1.0 (roughly three high-weight hits saturate the score).
        """
        matches = self.find_sensitive_data(text)

        if not matches:
            return 0.0

        total_weight = sum(self.SCORE_WEIGHTS.get(m.type, 0.5) for m in matches)
        return min(1.0, total_weight / 3.0)
|
||||
|
||||
|
||||
# Process-wide singleton instance, created lazily.
# Fixed annotation: the variable legitimately holds None before first use,
# so it must be Optional (the old ``DataSanitizer`` annotation was wrong).
_sanitizer: Optional[DataSanitizer] = None


def get_sanitizer() -> DataSanitizer:
    """Return the shared DataSanitizer singleton, creating it on first use."""
    global _sanitizer
    if _sanitizer is None:
        _sanitizer = DataSanitizer()
    return _sanitizer
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""
|
||||
任务历史记录管理器
|
||||
保存和加载任务执行历史
|
||||
保存和加载任务执行历史,集成数据治理策略
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -9,6 +9,8 @@ from pathlib import Path
|
||||
from typing import Optional, List
|
||||
from dataclasses import dataclass, asdict
|
||||
|
||||
from history.data_governance import get_governance_policy, GovernanceMetrics
|
||||
|
||||
|
||||
@dataclass
|
||||
class TaskRecord:
|
||||
@@ -26,16 +28,19 @@ class TaskRecord:
|
||||
stderr: str
|
||||
log_path: str
|
||||
task_summary: str = "" # 任务摘要(由小模型生成)
|
||||
_governance: dict = None # 治理元数据
|
||||
_sanitization: dict = None # 脱敏信息
|
||||
|
||||
|
||||
class HistoryManager:
|
||||
"""
|
||||
历史记录管理器
|
||||
|
||||
将任务历史保存为 JSON 文件
|
||||
将任务历史保存为 JSON 文件,集成数据治理策略
|
||||
"""
|
||||
|
||||
MAX_HISTORY_SIZE = 100 # 最多保存 100 条记录
|
||||
AUTO_CLEANUP_ENABLED = True # 自动清理过期数据
|
||||
|
||||
def __init__(self, workspace_path: Optional[Path] = None):
|
||||
if workspace_path:
|
||||
@@ -45,7 +50,15 @@ class HistoryManager:
|
||||
|
||||
self.history_file = self.workspace / "history.json"
|
||||
self._history: List[TaskRecord] = []
|
||||
|
||||
# 初始化数据治理策略
|
||||
self.governance = get_governance_policy(self.workspace)
|
||||
|
||||
self._load()
|
||||
|
||||
# 启动时自动清理过期数据
|
||||
if self.AUTO_CLEANUP_ENABLED:
|
||||
self._auto_cleanup()
|
||||
|
||||
def _load(self):
|
||||
"""从文件加载历史记录"""
|
||||
@@ -53,7 +66,14 @@ class HistoryManager:
|
||||
try:
|
||||
with open(self.history_file, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
self._history = [TaskRecord(**record) for record in data]
|
||||
self._history = []
|
||||
for record in data:
|
||||
# 兼容旧数据(没有治理字段)
|
||||
if '_governance' not in record:
|
||||
record['_governance'] = None
|
||||
if '_sanitization' not in record:
|
||||
record['_sanitization'] = None
|
||||
self._history.append(TaskRecord(**record))
|
||||
except (json.JSONDecodeError, TypeError, KeyError) as e:
|
||||
print(f"[警告] 加载历史记录失败: {e}")
|
||||
self._history = []
|
||||
@@ -61,14 +81,29 @@ class HistoryManager:
|
||||
self._history = []
|
||||
|
||||
def _save(self):
|
||||
"""保存历史记录到文件"""
|
||||
"""保存历史记录到文件(应用数据治理策略)"""
|
||||
try:
|
||||
# 确保目录存在
|
||||
self.history_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 应用数据治理策略
|
||||
governed_data = []
|
||||
for record in self._history:
|
||||
record_dict = asdict(record)
|
||||
|
||||
# 如果记录还没有治理元数据,应用策略
|
||||
if not record_dict.get('_governance'):
|
||||
record_dict = self.governance.apply_policy(record_dict)
|
||||
|
||||
governed_data.append(record_dict)
|
||||
|
||||
with open(self.history_file, 'w', encoding='utf-8') as f:
|
||||
data = [asdict(record) for record in self._history]
|
||||
json.dump(data, f, ensure_ascii=False, indent=2)
|
||||
json.dump(governed_data, f, ensure_ascii=False, indent=2)
|
||||
|
||||
# 收集并保存度量指标
|
||||
metrics = self.governance.collect_metrics(governed_data)
|
||||
self.governance.save_metrics(metrics)
|
||||
|
||||
except Exception as e:
|
||||
print(f"[警告] 保存历史记录失败: {e}")
|
||||
|
||||
@@ -216,56 +251,136 @@ class HistoryManager:
|
||||
'avg_duration_ms': int(avg_duration)
|
||||
}
|
||||
|
||||
def find_similar_success(self, user_input: str, threshold: float = 0.6) -> Optional[TaskRecord]:
|
||||
def find_similar_success(
|
||||
self,
|
||||
user_input: str,
|
||||
threshold: float = 0.6,
|
||||
return_details: bool = False
|
||||
) -> Optional[TaskRecord] | tuple:
|
||||
"""
|
||||
查找相似的成功任务
|
||||
|
||||
使用简单的关键词匹配来判断相似度
|
||||
查找相似的成功任务(增强版:结构化特征匹配)
|
||||
|
||||
Args:
|
||||
user_input: 用户输入
|
||||
threshold: 相似度阈值
|
||||
return_details: 是否返回详细信息(相似度和差异列表)
|
||||
|
||||
Returns:
|
||||
最相似的成功任务记录,如果没有则返回 None
|
||||
如果 return_details=False: 最相似的成功任务记录,如果没有则返回 None
|
||||
如果 return_details=True: (TaskRecord, 相似度, 差异列表) 或 None
|
||||
"""
|
||||
# 提取关键词
|
||||
def extract_keywords(text: str) -> set:
|
||||
# 简单分词:按空格和标点分割
|
||||
import re
|
||||
words = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', text.lower())
|
||||
# 过滤掉太短的词
|
||||
return set(w for w in words if len(w) >= 2)
|
||||
from history.task_features import get_task_matcher
|
||||
|
||||
input_keywords = extract_keywords(user_input)
|
||||
if not input_keywords:
|
||||
return None
|
||||
matcher = get_task_matcher()
|
||||
|
||||
best_match = None
|
||||
best_score = 0.0
|
||||
best_differences = []
|
||||
|
||||
for record in self._history:
|
||||
if not record.success:
|
||||
continue
|
||||
|
||||
record_keywords = extract_keywords(record.user_input)
|
||||
if not record_keywords:
|
||||
continue
|
||||
|
||||
# 计算 Jaccard 相似度
|
||||
intersection = len(input_keywords & record_keywords)
|
||||
union = len(input_keywords | record_keywords)
|
||||
score = intersection / union if union > 0 else 0
|
||||
# 使用增强的特征匹配
|
||||
score, differences = matcher.calculate_similarity(
|
||||
user_input,
|
||||
record.user_input
|
||||
)
|
||||
|
||||
if score > best_score and score >= threshold:
|
||||
best_score = score
|
||||
best_match = record
|
||||
best_differences = differences
|
||||
|
||||
return best_match
|
||||
if best_match is None:
|
||||
return None
|
||||
|
||||
if return_details:
|
||||
return (best_match, best_score, best_differences)
|
||||
else:
|
||||
return best_match
|
||||
|
||||
def get_successful_records(self) -> List[TaskRecord]:
|
||||
"""获取所有成功的任务记录"""
|
||||
return [r for r in self._history if r.success]
|
||||
|
||||
def _auto_cleanup(self):
    """Run the expiration sweep at startup; never let it break init."""
    try:
        snapshot = [asdict(r) for r in self._history]
        kept, archived, deleted = self.governance.cleanup_expired(snapshot)

        # Nothing expired: leave the in-memory history untouched.
        if archived == 0 and deleted == 0:
            return

        rebuilt = []
        for item in kept:
            # Backfill governance fields for legacy records.
            item.setdefault('_governance', None)
            item.setdefault('_sanitization', None)
            rebuilt.append(TaskRecord(**item))
        self._history = rebuilt

        self._save()
        print(f"[数据治理] 自动清理完成: 归档 {archived} 条, 删除 {deleted} 条")
    except Exception as e:
        print(f"[警告] 自动清理失败: {e}")
|
||||
|
||||
def manual_cleanup(self) -> dict:
    """Force an expiration sweep of the history right now.

    Returns:
        dict with 'archived', 'deleted' and 'remaining' counts.
    """
    snapshot = [asdict(r) for r in self._history]
    kept, archived, deleted = self.governance.cleanup_expired(snapshot)

    rebuilt = []
    for item in kept:
        # Backfill governance fields for legacy records.
        item.setdefault('_governance', None)
        item.setdefault('_sanitization', None)
        rebuilt.append(TaskRecord(**item))
    self._history = rebuilt

    self._save()

    return {
        'archived': archived,
        'deleted': deleted,
        'remaining': len(self._history),
    }
|
||||
|
||||
def get_governance_metrics(self) -> Optional[GovernanceMetrics]:
    """Return the last persisted governance metrics, or None if absent."""
    return self.governance.load_metrics()
|
||||
|
||||
def export_sanitized(self, output_path: Path) -> int:
    """Write a governance-processed copy of the history to *output_path*.

    Records that lack governance metadata are classified (and, where
    required, masked or minimized) before being written.

    Args:
        output_path: Destination file for the JSON export.

    Returns:
        Number of records written.
    """
    processed = []
    for rec in self._history:
        payload = asdict(rec)
        # Apply the policy to anything not yet governed.
        if not payload.get('_governance'):
            payload = self.governance.apply_policy(payload)
        processed.append(payload)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(processed, f, ensure_ascii=False, indent=2)

    return len(processed)
|
||||
|
||||
|
||||
# 全局单例
|
||||
|
||||
252
history/reuse_metrics.py
Normal file
252
history/reuse_metrics.py
Normal file
@@ -0,0 +1,252 @@
|
||||
"""
|
||||
任务复用度量指标收集模块
|
||||
"""
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List
|
||||
from dataclasses import dataclass, asdict
|
||||
|
||||
|
||||
@dataclass
class ReuseEvent:
    """One entry in the task-reuse audit trail."""

    timestamp: str                     # when the event happened
    original_task_id: str              # ID of the task that was reused
    new_task_id: Optional[str]         # ID of the new task, if one ran
    similarity_score: float            # match score between the two tasks
    user_action: str                   # offered / accepted / rejected / rollback / executed / failed
    differences_count: int             # number of detected differences
    critical_differences: int          # number of critical differences
    execution_success: Optional[bool]  # outcome of the re-run, if any
|
||||
|
||||
|
||||
class ReuseMetrics:
|
||||
"""复用指标管理器"""
|
||||
|
||||
def __init__(self, workspace_path: Path):
    """Bind the metrics store to *workspace_path* and load prior events."""
    self.workspace = workspace_path
    self.metrics_file = workspace_path / "reuse_metrics.json"
    # In-memory event log, mirrored to the JSON file on every change.
    self._events: List[ReuseEvent] = []
    self._load()
|
||||
|
||||
def _load(self):
    """Load previously recorded events from disk; start empty on failure."""
    if not self.metrics_file.exists():
        return
    try:
        with open(self.metrics_file, 'r', encoding='utf-8') as f:
            raw = json.load(f)
        self._events = [ReuseEvent(**item) for item in raw]
    except Exception as e:
        print(f"[警告] 加载复用指标失败: {e}")
        self._events = []
|
||||
|
||||
def _save(self):
    """Persist all events as a JSON array; failures are logged, not raised."""
    try:
        self.metrics_file.parent.mkdir(parents=True, exist_ok=True)
        payload = [asdict(ev) for ev in self._events]
        with open(self.metrics_file, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"[警告] 保存复用指标失败: {e}")
|
||||
|
||||
def record_reuse_offered(
    self,
    original_task_id: str,
    similarity_score: float,
    differences_count: int,
    critical_differences: int
):
    """Log that a reuse suggestion was shown to the user."""
    fields = dict(
        timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        original_task_id=original_task_id,
        new_task_id=None,
        similarity_score=similarity_score,
        user_action='offered',
        differences_count=differences_count,
        critical_differences=critical_differences,
        execution_success=None,
    )
    ev = ReuseEvent(**fields)
    self._events.append(ev)
    self._save()
    return ev
|
||||
|
||||
def record_reuse_accepted(
    self,
    original_task_id: str,
    similarity_score: float,
    differences_count: int,
    critical_differences: int
):
    """Log that the user accepted a reuse suggestion."""
    fields = dict(
        timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        original_task_id=original_task_id,
        new_task_id=None,
        similarity_score=similarity_score,
        user_action='accepted',
        differences_count=differences_count,
        critical_differences=critical_differences,
        execution_success=None,
    )
    ev = ReuseEvent(**fields)
    self._events.append(ev)
    self._save()
    return ev
|
||||
|
||||
def record_reuse_rejected(
|
||||
self,
|
||||
original_task_id: str,
|
||||
similarity_score: float,
|
||||
differences_count: int,
|
||||
critical_differences: int
|
||||
):
|
||||
"""记录用户拒绝复用"""
|
||||
event = ReuseEvent(
|
||||
timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
original_task_id=original_task_id,
|
||||
new_task_id=None,
|
||||
similarity_score=similarity_score,
|
||||
user_action='rejected',
|
||||
differences_count=differences_count,
|
||||
critical_differences=critical_differences,
|
||||
execution_success=None
|
||||
)
|
||||
self._events.append(event)
|
||||
self._save()
|
||||
return event
|
||||
|
||||
def record_reuse_execution(
|
||||
self,
|
||||
original_task_id: str,
|
||||
new_task_id: str,
|
||||
success: bool
|
||||
):
|
||||
"""记录复用后的执行结果"""
|
||||
# 查找最近的 accepted 事件并更新
|
||||
for event in reversed(self._events):
|
||||
if (event.original_task_id == original_task_id and
|
||||
event.user_action == 'accepted' and
|
||||
event.new_task_id is None):
|
||||
event.new_task_id = new_task_id
|
||||
event.execution_success = success
|
||||
self._save()
|
||||
return event
|
||||
|
||||
# 如果没找到,创建新记录
|
||||
event = ReuseEvent(
|
||||
timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
original_task_id=original_task_id,
|
||||
new_task_id=new_task_id,
|
||||
similarity_score=0.0,
|
||||
user_action='executed',
|
||||
differences_count=0,
|
||||
critical_differences=0,
|
||||
execution_success=success
|
||||
)
|
||||
self._events.append(event)
|
||||
self._save()
|
||||
return event
|
||||
|
||||
def record_reuse_rollback(
|
||||
self,
|
||||
original_task_id: str,
|
||||
new_task_id: str
|
||||
):
|
||||
"""记录复用后回滚(用户撤销/重做)"""
|
||||
event = ReuseEvent(
|
||||
timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
original_task_id=original_task_id,
|
||||
new_task_id=new_task_id,
|
||||
similarity_score=0.0,
|
||||
user_action='rollback',
|
||||
differences_count=0,
|
||||
critical_differences=0,
|
||||
execution_success=False
|
||||
)
|
||||
self._events.append(event)
|
||||
self._save()
|
||||
return event
|
||||
|
||||
def get_statistics(self) -> Dict:
|
||||
"""获取统计数据"""
|
||||
if not self._events:
|
||||
return {
|
||||
'total_offered': 0,
|
||||
'total_accepted': 0,
|
||||
'total_rejected': 0,
|
||||
'total_executed': 0,
|
||||
'total_rollback': 0,
|
||||
'acceptance_rate': 0.0,
|
||||
'rejection_rate': 0.0,
|
||||
'success_rate': 0.0,
|
||||
'failure_rate': 0.0,
|
||||
'rollback_rate': 0.0,
|
||||
'avg_similarity': 0.0,
|
||||
'avg_differences': 0.0,
|
||||
'avg_critical_differences': 0.0
|
||||
}
|
||||
|
||||
offered = [e for e in self._events if e.user_action == 'offered']
|
||||
accepted = [e for e in self._events if e.user_action == 'accepted']
|
||||
rejected = [e for e in self._events if e.user_action == 'rejected']
|
||||
executed = [e for e in self._events if e.execution_success is not None]
|
||||
rollback = [e for e in self._events if e.user_action == 'rollback']
|
||||
|
||||
total_offered = len(offered)
|
||||
total_accepted = len(accepted)
|
||||
total_rejected = len(rejected)
|
||||
total_executed = len(executed)
|
||||
total_rollback = len(rollback)
|
||||
|
||||
# 计算成功和失败
|
||||
successful = [e for e in executed if e.execution_success]
|
||||
failed = [e for e in executed if not e.execution_success]
|
||||
|
||||
# 计算率
|
||||
acceptance_rate = total_accepted / total_offered if total_offered > 0 else 0.0
|
||||
rejection_rate = total_rejected / total_offered if total_offered > 0 else 0.0
|
||||
success_rate = len(successful) / total_executed if total_executed > 0 else 0.0
|
||||
failure_rate = len(failed) / total_executed if total_executed > 0 else 0.0
|
||||
rollback_rate = total_rollback / total_executed if total_executed > 0 else 0.0
|
||||
|
||||
# 平均值
|
||||
all_events = offered + accepted + rejected
|
||||
avg_similarity = sum(e.similarity_score for e in all_events) / len(all_events) if all_events else 0.0
|
||||
avg_differences = sum(e.differences_count for e in all_events) / len(all_events) if all_events else 0.0
|
||||
avg_critical_differences = sum(e.critical_differences for e in all_events) / len(all_events) if all_events else 0.0
|
||||
|
||||
return {
|
||||
'total_offered': total_offered,
|
||||
'total_accepted': total_accepted,
|
||||
'total_rejected': total_rejected,
|
||||
'total_executed': total_executed,
|
||||
'total_rollback': total_rollback,
|
||||
'acceptance_rate': acceptance_rate,
|
||||
'rejection_rate': rejection_rate,
|
||||
'success_rate': success_rate,
|
||||
'failure_rate': failure_rate,
|
||||
'rollback_rate': rollback_rate,
|
||||
'avg_similarity': avg_similarity,
|
||||
'avg_differences': avg_differences,
|
||||
'avg_critical_differences': avg_critical_differences
|
||||
}
|
||||
|
||||
def get_recent_events(self, count: int = 20) -> List[ReuseEvent]:
|
||||
"""获取最近的事件"""
|
||||
return self._events[-count:] if self._events else []
|
||||
|
||||
|
||||
# Module-level singleton instance.
_metrics: Optional[ReuseMetrics] = None


def get_reuse_metrics(workspace_path: Path) -> ReuseMetrics:
    """Return the process-wide ReuseMetrics singleton, creating it lazily.

    NOTE(review): ``workspace_path`` is only honored on the first call;
    subsequent calls return the existing instance even when a different
    workspace is passed — confirm this is intended.
    """
    global _metrics
    if _metrics is None:
        _metrics = ReuseMetrics(workspace_path)
    return _metrics
||||
|
||||
380
history/task_features.py
Normal file
380
history/task_features.py
Normal file
@@ -0,0 +1,380 @@
|
||||
"""
|
||||
任务特征提取与匹配模块
|
||||
用于更精确的相似任务识别
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Set, Optional, Tuple
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@dataclass
class TaskFeatures:
    """Structured features extracted from a task description."""
    # Basic information
    raw_input: str              # the original, unmodified user input
    keywords: Set[str]          # lower-cased word tokens (length >= 2)

    # Key parameters
    file_formats: Set[str]      # file extensions, e.g. ".txt", ".csv", ".json"
    directory_paths: Set[str]   # directory paths mentioned in the input
    file_names: Set[str]        # individual file names (name + extension)
    naming_patterns: List[str]  # naming rules, e.g. "按日期" (by date), "按序号" (by sequence)

    # Operation type
    operations: Set[str]        # operation categories, e.g. "批量重命名" (batch rename), "转换" (convert)

    # Quantity / scope parameters
    quantities: List[str]       # quantity phrases, e.g. "100个", "所有"

    # Other constraints
    constraints: List[str]      # free-form conditional clause fragments
||||
|
||||
@dataclass
class TaskDifference:
    """Describes one difference between the current and a historical task."""
    category: str       # human-readable category (display name)
    field: str          # machine field name, e.g. "file_formats"
    current_value: str  # value in the current task ("(无)" when absent)
    history_value: str  # value in the historical task ("(无)" when absent)
    importance: str     # one of: critical / high / medium / low
||||
|
||||
class TaskFeatureExtractor:
    """Extracts structured features from a free-form task description.

    The regex patterns and keyword tables below are tuned for mixed
    Chinese/English input. Fix over the original: the bare ``except:`` in
    ``_extract_directory_paths`` (which would also swallow SystemExit and
    KeyboardInterrupt) is narrowed to the exceptions ``Path`` operations
    actually raise.
    """

    # File-extension pattern (applied to lower-cased text).
    FILE_FORMAT_PATTERN = r'\.(txt|csv|json|xml|xlsx?|docx?|pdf|png|jpe?g|gif|mp[34]|avi|mov|zip|rar|7z|py|js|java|cpp|html?|css)'

    # Directory-path pattern (Windows drive paths and Unix-style paths).
    DIR_PATH_PATTERN = r'(?:[a-zA-Z]:\\[\w\\\s\u4e00-\u9fa5.-]+|/[\w/\s\u4e00-\u9fa5.-]+|[./][\w/\\\s\u4e00-\u9fa5.-]+)'

    # File-name pattern: stem (may contain CJK characters) plus an extension.
    FILE_NAME_PATTERN = r'[\w\u4e00-\u9fa5.-]+\.[a-zA-Z0-9]+'

    # Quantity pattern: "<number><CJK measure word>" or whole-set words.
    QUANTITY_PATTERN = r'(\d+\s*[个张份条篇页行列]|所有|全部|批量)'

    # Operation label -> trigger keywords (labels are user-facing Chinese).
    OPERATION_KEYWORDS = {
        '重命名': ['重命名', '改名', '命名', '更名'],
        '转换': ['转换', '转为', '转成', '变成', '改成'],
        '批量处理': ['批量', '批处理', '一次性'],
        '复制': ['复制', '拷贝', 'copy'],
        '移动': ['移动', '转移', 'move'],
        '删除': ['删除', '清理', '移除'],
        '合并': ['合并', '整合', '汇总'],
        '分割': ['分割', '拆分', '切分'],
        '压缩': ['压缩', '打包'],
        '解压': ['解压', '解包', '提取'],
        '排序': ['排序', '排列'],
        '筛选': ['筛选', '过滤', '查找'],
        '统计': ['统计', '计数', '汇总'],
        '生成': ['生成', '创建', '制作'],
    }

    # Naming-rule label -> trigger keywords.
    NAMING_PATTERNS = {
        '按日期': ['日期', '时间', 'date', 'time'],
        '按序号': ['序号', '编号', '数字', '顺序'],
        '按前缀': ['前缀', '开头'],
        '按后缀': ['后缀', '结尾'],
        '按内容': ['内容', '根据'],
    }

    def extract(self, user_input: str) -> TaskFeatures:
        """Extract structured features from raw user input.

        Args:
            user_input: the task description as typed by the user.

        Returns:
            TaskFeatures: the extracted feature bundle.
        """
        return TaskFeatures(
            raw_input=user_input,
            keywords=self._extract_keywords(user_input),
            file_formats=self._extract_file_formats(user_input),
            directory_paths=self._extract_directory_paths(user_input),
            file_names=self._extract_file_names(user_input),
            naming_patterns=self._extract_naming_patterns(user_input),
            operations=self._extract_operations(user_input),
            quantities=self._extract_quantities(user_input),
            constraints=self._extract_constraints(user_input)
        )

    def _extract_keywords(self, text: str) -> Set[str]:
        """Tokenize into CJK runs and ASCII words; keep tokens of length >= 2."""
        words = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', text.lower())
        return {w for w in words if len(w) >= 2}

    def _extract_file_formats(self, text: str) -> Set[str]:
        """Collect file extensions mentioned in the text, normalized to '.ext'."""
        matches = re.findall(self.FILE_FORMAT_PATTERN, text.lower())
        return {f'.{m}' for m in matches}

    def _extract_directory_paths(self, text: str) -> Set[str]:
        """Collect directory-like paths, normalized to absolute form when possible.

        NOTE(review): Path.resolve() anchors relative paths to the current
        working directory, so results may vary between runs — confirm intended.
        """
        normalized = set()
        for path in re.findall(self.DIR_PATH_PATTERN, text):
            try:
                normalized.add(str(Path(path).resolve()))
            except (OSError, ValueError):
                # Malformed path (e.g. embedded NUL) — keep the raw text.
                normalized.add(path)
        return normalized

    def _extract_file_names(self, text: str) -> Set[str]:
        """Collect individual file names (name plus extension)."""
        return set(re.findall(self.FILE_NAME_PATTERN, text))

    def _extract_naming_patterns(self, text: str) -> List[str]:
        """Detect naming-rule mentions by substring keyword lookup."""
        return [name for name, keywords in self.NAMING_PATTERNS.items()
                if any(kw in text for kw in keywords)]

    def _extract_operations(self, text: str) -> Set[str]:
        """Detect operation types by substring keyword lookup."""
        return {name for name, keywords in self.OPERATION_KEYWORDS.items()
                if any(kw in text for kw in keywords)}

    def _extract_quantities(self, text: str) -> List[str]:
        """Collect quantity phrases (counts and whole-set words)."""
        return re.findall(self.QUANTITY_PATTERN, text)

    def _extract_constraints(self, text: str) -> List[str]:
        """Collect clause fragments containing conditional keywords."""
        constraints = []
        condition_keywords = ['如果', '当', '满足', '符合', '包含', '不包含', '大于', '小于', '等于']
        for keyword in condition_keywords:
            if keyword in text:
                # Grab the sentence fragment (delimited by CJK punctuation)
                # surrounding the keyword.
                constraints.extend(re.findall(f'[^。,;]*{keyword}[^。,;]*', text))
        return constraints
|
||||
|
||||
class TaskMatcher:
    """Scores the similarity between a new task and a historical one.

    Produces a weighted similarity score in [0, 1] plus a structured list
    of per-dimension differences.
    """

    def __init__(self):
        # Shared feature extractor applied to both inputs.
        self.extractor = TaskFeatureExtractor()

    def calculate_similarity(
        self,
        current_input: str,
        history_input: str
    ) -> Tuple[float, List[TaskDifference]]:
        """Compare two task descriptions.

        Args:
            current_input: raw text of the current task.
            history_input: raw text of the historical task.

        Returns:
            (similarity score 0-1, list of differences)
        """
        cur = self.extractor.extract(current_input)
        hist = self.extractor.extract(history_input)

        differences: List[TaskDifference] = []
        # Keyword overlap (weight 0.2) contributes to the score only; it
        # never produces a difference record.
        total = self._jaccard_similarity(cur.keywords, hist.keywords) * 0.2

        # Remaining dimensions in fixed order:
        # (comparator, current value, history value, field, label, importance, weight)
        dimensions = (
            (self._compare_sets, cur.file_formats, hist.file_formats,
             'file_formats', '文件格式', 'high', 0.15),
            (self._compare_sets, cur.directory_paths, hist.directory_paths,
             'directory_paths', '目录路径', 'critical', 0.15),
            (self._compare_lists, cur.naming_patterns, hist.naming_patterns,
             'naming_patterns', '命名规则', 'high', 0.15),
            (self._compare_sets, cur.operations, hist.operations,
             'operations', '操作类型', 'critical', 0.2),
            (self._compare_lists, cur.quantities, hist.quantities,
             'quantities', '数量', 'medium', 0.1),
            (self._compare_lists, cur.constraints, hist.constraints,
             'constraints', '约束条件', 'medium', 0.05),
        )
        for compare, cur_val, hist_val, field, label, importance, weight in dimensions:
            score, dim_diffs = compare(cur_val, hist_val, field, label, importance)
            total += score * weight
            differences.extend(dim_diffs)

        return total, differences

    def _jaccard_similarity(self, set1: Set, set2: Set) -> float:
        """Jaccard similarity; two empty sets count as identical (1.0)."""
        if not set1 and not set2:
            return 1.0
        if not set1 or not set2:
            return 0.0
        union_size = len(set1 | set2)
        return len(set1 & set2) / union_size if union_size > 0 else 0.0

    def _compare_sets(
        self,
        current: Set[str],
        history: Set[str],
        field: str,
        display_name: str,
        importance: str
    ) -> Tuple[float, List[TaskDifference]]:
        """Compare two sets; report symmetric-difference elements, if any."""
        similarity = self._jaccard_similarity(current, history)
        extra = current - history
        missing = history - current
        if not extra and not missing:
            return similarity, []
        return similarity, [TaskDifference(
            category=display_name,
            field=field,
            current_value=', '.join(sorted(extra)) if extra else '(无)',
            history_value=', '.join(sorted(missing)) if missing else '(无)',
            importance=importance
        )]

    def _compare_lists(
        self,
        current: List[str],
        history: List[str],
        field: str,
        display_name: str,
        importance: str
    ) -> Tuple[float, List[TaskDifference]]:
        """Compare two lists; similarity ignores order/duplicates, the diff does not."""
        similarity = self._jaccard_similarity(set(current), set(history))
        if current == history:
            return similarity, []
        return similarity, [TaskDifference(
            category=display_name,
            field=field,
            current_value=', '.join(current) if current else '(无)',
            history_value=', '.join(history) if history else '(无)',
            importance=importance
        )]
|
||||
|
||||
# Module-level singleton instance.
_matcher: Optional[TaskMatcher] = None


def get_task_matcher() -> TaskMatcher:
    """Return the process-wide TaskMatcher singleton, creating it lazily."""
    global _matcher
    if _matcher is None:
        _matcher = TaskMatcher()
    return _matcher
||||
|
||||
Reference in New Issue
Block a user