feat: refactor API key configuration and enhance application initialization

- Renamed `check_environment` to `check_api_key_configured` for clarity, simplifying the API key validation logic.
- Removed the blocking behavior of the API key check during application startup, allowing the app to run while providing a prompt for configuration.
- Updated `LocalAgentApp` to accept an `api_configured` parameter, enabling conditional messaging for API key setup.
- Enhanced the `SandboxRunner` to support backup management and improved execution result handling with detailed metrics.
- Integrated data governance strategies into the `HistoryManager`, ensuring compliance and improved data management.
- Added privacy settings and metrics tracking across various components to enhance user experience and application safety.
This commit is contained in:
Mimikko-zeus
2026-02-27 14:32:30 +08:00
parent ab5bbff6f7
commit 8a538bb950
58 changed files with 13457 additions and 350 deletions

410
history/data_governance.py Normal file
View File

@@ -0,0 +1,410 @@
"""
数据治理策略模块
实现数据分级保存、保留期管理、归档和清理策略
"""
import json
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Set
from dataclasses import dataclass, asdict
from enum import Enum
from history.data_sanitizer import get_sanitizer, SensitiveType
class DataLevel(Enum):
    """Storage level assigned to a record by the governance policy."""
    FULL = "full"            # stored as-is (no sanitization)
    SANITIZED = "sanitized"  # stored with sensitive content masked
    MINIMAL = "minimal"      # only metadata is kept
    ARCHIVED = "archived"    # moved to the archive directory
class RetentionPolicy(Enum):
    """Retention period presets, expressed in days."""
    SHORT = 7        # 7 days
    MEDIUM = 30      # 30 days
    LONG = 90        # 90 days
    PERMANENT = -1   # keep forever (sentinel, never expires)
@dataclass
class DataClassification:
    """Result of classifying a single record."""
    level: DataLevel            # storage level chosen for the record
    retention_days: int         # retention period in days (-1 = permanent)
    sensitivity_score: float    # combined weighted sensitivity score
    sensitive_fields: Set[str]  # field names where sensitive content was found
    reason: str                 # human-readable classification rationale
@dataclass
class GovernanceMetrics:
    """Aggregate governance statistics over a set of records."""
    total_records: int                     # total number of records inspected
    full_records: int                      # records stored at FULL level
    sanitized_records: int                 # records stored at SANITIZED level
    minimal_records: int                   # records stored at MINIMAL level
    archived_records: int                  # records marked ARCHIVED
    total_size_bytes: int                  # estimated serialized size of all records
    sensitive_field_hits: Dict[str, int]   # per-field count of sensitive hits
    expired_records: int                   # records past their expiry time
    last_cleanup_time: str                 # ISO timestamp of last metrics collection
class DataGovernancePolicy:
    """
    Data governance policy.

    Classifies records by sensitivity, stores them at the appropriate
    level (full / sanitized / minimal), and manages the data lifecycle:
    retention, degradation, archiving, and cleanup.
    """

    # Per-field sensitivity weight used when scoring a record.
    FIELD_SENSITIVITY = {
        'user_input': 0.5,       # user input may contain sensitive info
        'code': 0.7,             # code may contain paths / secrets
        'stdout': 0.6,           # program output may leak sensitive data
        'stderr': 0.6,           # error output may contain paths
        'execution_plan': 0.3,   # execution plans are relatively safe
        'log_path': 0.4,         # log file path
    }

    # Classification thresholds: lower bound of each level's score range.
    LEVEL_THRESHOLDS = {
        DataLevel.FULL: 0.0,        # score < 0.3        -> keep in full
        DataLevel.SANITIZED: 0.3,   # 0.3 <= score < 0.7 -> sanitize
        DataLevel.MINIMAL: 0.7,     # score >= 0.7       -> metadata only
    }

    # Retention period (days) per data level.
    RETENTION_CONFIG = {
        DataLevel.FULL: RetentionPolicy.LONG.value,         # full data: 90 days
        DataLevel.SANITIZED: RetentionPolicy.MEDIUM.value,  # sanitized: 30 days
        DataLevel.MINIMAL: RetentionPolicy.SHORT.value,     # minimal: 7 days
    }

    def __init__(self, workspace_path: Path):
        """
        Args:
            workspace_path: directory holding history data; the metrics
                file and the archive subdirectory live underneath it.
        """
        self.workspace = workspace_path
        self.sanitizer = get_sanitizer()
        self.metrics_file = workspace_path / "governance_metrics.json"
        self.archive_dir = workspace_path / "archive"
        # Fix: parents=True so a not-yet-created workspace directory does
        # not make mkdir raise FileNotFoundError.
        self.archive_dir.mkdir(parents=True, exist_ok=True)

    def classify_record(self, record_data: Dict) -> DataClassification:
        """
        Classify a record by its sensitivity.

        Args:
            record_data: record data dictionary

        Returns:
            DataClassification with level, retention period, score,
            sensitive fields and a human-readable reason.
        """
        sensitive_fields = set()
        total_sensitivity = 0.0
        field_count = 0

        # Score every configured field that is present and non-empty.
        for field, weight in self.FIELD_SENSITIVITY.items():
            if field in record_data and record_data[field]:
                content = str(record_data[field])
                field_score = self.sanitizer.get_sensitivity_score(content)
                if field_score > 0.3:  # sensitive content detected
                    sensitive_fields.add(field)
                total_sensitivity += field_score * weight
                field_count += 1

        # Combined score: weighted sum averaged over the number of scored
        # fields (note: divided by the field count, not the weight sum).
        avg_sensitivity = total_sensitivity / field_count if field_count > 0 else 0.0

        # Map the score onto a storage level.
        if avg_sensitivity >= self.LEVEL_THRESHOLDS[DataLevel.MINIMAL]:
            level = DataLevel.MINIMAL
            reason = f"高敏感度({avg_sensitivity:.2f}),仅保留元数据"
        elif avg_sensitivity >= self.LEVEL_THRESHOLDS[DataLevel.SANITIZED]:
            level = DataLevel.SANITIZED
            reason = f"中等敏感度({avg_sensitivity:.2f}),脱敏保存"
        else:
            level = DataLevel.FULL
            reason = f"低敏感度({avg_sensitivity:.2f}),完整保存"

        retention_days = self.RETENTION_CONFIG[level]
        return DataClassification(
            level=level,
            retention_days=retention_days,
            sensitivity_score=avg_sensitivity,
            sensitive_fields=sensitive_fields,
            reason=reason,
        )

    def apply_policy(self, record_data: Dict) -> Dict:
        """
        Apply the governance policy and return the processed record.

        Args:
            record_data: raw record data

        Returns:
            Processed record with a `_governance` metadata entry added.
        """
        classification = self.classify_record(record_data)

        # Attach governance metadata (never mutate the caller's dict).
        result = record_data.copy()
        result['_governance'] = {
            'level': classification.level.value,
            'retention_days': classification.retention_days,
            'sensitivity_score': classification.sensitivity_score,
            'sensitive_fields': list(classification.sensitive_fields),
            'classified_at': datetime.now().isoformat(),
            'expires_at': (datetime.now() + timedelta(days=classification.retention_days)).isoformat()
        }

        # Transform the payload according to the chosen level.
        if classification.level == DataLevel.MINIMAL:
            # Minimal: keep metadata only.
            result = self._minimize_record(result)
        elif classification.level == DataLevel.SANITIZED:
            # Sanitized: mask the sensitive fields in place.
            result = self._sanitize_record(result, classification.sensitive_fields)
        # FULL level is stored untouched.
        return result

    def _minimize_record(self, record: Dict) -> Dict:
        """
        Minimize a record, keeping metadata only.

        Args:
            record: original record

        Returns:
            Minimized record with sensitive payload fields replaced.
        """
        # Metadata fields that survive minimization.
        keep_fields = {
            'task_id', 'timestamp', 'intent_label', 'intent_confidence',
            'success', 'duration_ms', 'task_summary', '_governance'
        }
        minimal = {k: v for k, v in record.items() if k in keep_fields}

        # Replace payload fields with deletion markers.
        minimal['user_input'] = '[已删除-高敏感]'
        minimal['code'] = '[已删除-高敏感]'
        minimal['stdout'] = '[已删除-高敏感]'
        minimal['stderr'] = '[已删除-高敏感]'
        # Fix: the old code did `record.get('execution_plan', '')[:100]`,
        # which raises TypeError when the value is present but None, and
        # always appended '...' even for short plans. Coerce to str and
        # only add the ellipsis when we actually truncated.
        plan = str(record.get('execution_plan') or '')
        minimal['execution_plan'] = plan[:100] + ('...' if len(plan) > 100 else '')
        return minimal

    def _sanitize_record(self, record: Dict, sensitive_fields: Set[str]) -> Dict:
        """
        Sanitize the given fields of a record.

        Args:
            record: original record
            sensitive_fields: names of fields to sanitize

        Returns:
            Copy of the record with sensitive fields masked and a
            `_sanitization` summary per masked field.
        """
        result = record.copy()
        for field in sensitive_fields:
            if field in result and result[field]:
                content = str(result[field])
                sanitized, matches = self.sanitizer.sanitize(content)
                result[field] = sanitized
                # Record what was masked so downstream tools can report it.
                if '_sanitization' not in result:
                    result['_sanitization'] = {}
                result['_sanitization'][field] = {
                    'masked_count': len(matches),
                    'types': list(set(m.type.value for m in matches))
                }
        return result

    def check_expiration(self, record: Dict) -> bool:
        """
        Check whether a record has passed its expiry time.

        Args:
            record: record data

        Returns:
            True if the record is expired; False when it has no governance
            metadata, no expiry, or an unparseable expiry timestamp.
        """
        if '_governance' not in record or record['_governance'] is None:
            return False
        expires_at = record['_governance'].get('expires_at')
        if not expires_at:
            return False
        try:
            expire_time = datetime.fromisoformat(expires_at)
            return datetime.now() > expire_time
        except (ValueError, TypeError):
            # Malformed timestamp: treat as not expired rather than crash.
            return False

    def archive_record(self, record: Dict) -> Path:
        """
        Archive a record to the archive directory as a JSON file.

        Args:
            record: record data (mutated: its governance level is set
                to ARCHIVED and an `archived_at` timestamp is added)

        Returns:
            Path of the written archive file.
        """
        task_id = record.get('task_id', 'unknown')
        timestamp = record.get('timestamp', datetime.now().strftime('%Y%m%d_%H%M%S'))
        # Fix: stored timestamps are typically ISO format and contain ':',
        # which is invalid in Windows filenames. Keep only safe characters.
        safe_ts = ''.join(c if c.isalnum() or c in '._-' else '_' for c in str(timestamp))
        archive_file = self.archive_dir / f"{task_id}_{safe_ts}.json"

        # Fix: tolerate records that never went through apply_policy —
        # the old code raised KeyError when '_governance' was missing.
        governance = record.setdefault('_governance', {})
        governance['level'] = DataLevel.ARCHIVED.value
        governance['archived_at'] = datetime.now().isoformat()

        with open(archive_file, 'w', encoding='utf-8') as f:
            json.dump(record, f, ensure_ascii=False, indent=2)
        return archive_file

    def cleanup_expired(self, records: List[Dict]) -> tuple[List[Dict], int, int]:
        """
        Clean up expired records.

        Expired FULL records are degraded to SANITIZED (with a fresh
        retention window); expired SANITIZED records are archived;
        expired MINIMAL records are dropped.

        Args:
            records: list of records

        Returns:
            (kept records, archived count, deleted count)
        """
        kept_records = []
        archived_count = 0
        deleted_count = 0

        for record in records:
            if not self.check_expiration(record):
                kept_records.append(record)
                continue

            # Expired: handle according to the record's current level.
            level = record.get('_governance', {}).get('level')
            if level == DataLevel.FULL.value:
                # Full data: degrade to sanitized and restart retention.
                record['_governance']['level'] = DataLevel.SANITIZED.value
                record['_governance']['retention_days'] = RetentionPolicy.MEDIUM.value
                record['_governance']['expires_at'] = (
                    datetime.now() + timedelta(days=RetentionPolicy.MEDIUM.value)
                ).isoformat()
                # Apply the masking that SANITIZED level requires.
                sensitive_fields = set(record['_governance'].get('sensitive_fields', []))
                record = self._sanitize_record(record, sensitive_fields)
                kept_records.append(record)
            elif level == DataLevel.SANITIZED.value:
                # Sanitized data: move to the archive directory.
                self.archive_record(record)
                archived_count += 1
            else:
                # Minimal (or unknown) data: delete outright.
                deleted_count += 1

        return kept_records, archived_count, deleted_count

    def collect_metrics(self, records: List[Dict]) -> GovernanceMetrics:
        """
        Collect governance metrics over a set of records.

        Args:
            records: list of records

        Returns:
            Populated GovernanceMetrics.
        """
        metrics = GovernanceMetrics(
            total_records=len(records),
            full_records=0,
            sanitized_records=0,
            minimal_records=0,
            archived_records=0,
            total_size_bytes=0,
            sensitive_field_hits={},
            expired_records=0,
            last_cleanup_time=datetime.now().isoformat()
        )

        for record in records:
            # Count records per storage level.
            level = record.get('_governance', {}).get('level')
            if level == DataLevel.FULL.value:
                metrics.full_records += 1
            elif level == DataLevel.SANITIZED.value:
                metrics.sanitized_records += 1
            elif level == DataLevel.MINIMAL.value:
                metrics.minimal_records += 1
            elif level == DataLevel.ARCHIVED.value:
                metrics.archived_records += 1

            # Count sensitive-field hits.
            sensitive_fields = record.get('_governance', {}).get('sensitive_fields', [])
            for field in sensitive_fields:
                metrics.sensitive_field_hits[field] = metrics.sensitive_field_hits.get(field, 0) + 1

            # Count expired records.
            if self.check_expiration(record):
                metrics.expired_records += 1

            # Estimate serialized size.
            metrics.total_size_bytes += len(json.dumps(record, ensure_ascii=False))

        return metrics

    def save_metrics(self, metrics: GovernanceMetrics):
        """Persist the metrics to the workspace metrics file."""
        with open(self.metrics_file, 'w', encoding='utf-8') as f:
            data = asdict(metrics)
            json.dump(data, f, ensure_ascii=False, indent=2)

    def load_metrics(self) -> Optional[GovernanceMetrics]:
        """Load metrics from disk; returns None when missing or unreadable."""
        if not self.metrics_file.exists():
            return None
        try:
            with open(self.metrics_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return GovernanceMetrics(**data)
        except Exception as e:
            # Deliberate best-effort: a corrupt metrics file must not
            # take down the application.
            print(f"[警告] 加载度量指标失败: {e}")
            return None
# Module-wide singleton instance.
_policy: Optional[DataGovernancePolicy] = None


def get_governance_policy(workspace_path: Path) -> DataGovernancePolicy:
    """Return the shared DataGovernancePolicy singleton.

    The policy is created lazily on the first call; subsequent calls
    return the same instance (and ignore *workspace_path*).
    """
    global _policy
    if _policy is not None:
        return _policy
    _policy = DataGovernancePolicy(workspace_path)
    return _policy