# LocalAgent/history/task_features.py

"""
任务特征提取与匹配模块
用于更精确的相似任务识别
"""

import re
from typing import Dict, List, Set, Optional, Tuple
from dataclasses import dataclass
from pathlib import Path


@dataclass
class TaskFeatures:
    """Structured features describing one task input.

    Instances are built by ``TaskFeatureExtractor.extract`` from a single
    piece of user text; every field other than ``raw_input`` is derived
    from that text.
    """
    # Basic information
    raw_input: str  # the input text exactly as it was passed to the extractor
    keywords: Set[str]  # keyword tokens pulled from the text

    # Key parameters
    file_formats: Set[str]  # file formats (e.g. .txt, .csv, .json)
    directory_paths: Set[str]  # directory paths
    file_names: Set[str]  # file names
    naming_patterns: List[str]  # naming rules (e.g. "按日期" = by date, "按序号" = by sequence number)

    # Operation types
    operations: Set[str]  # operation categories (e.g. "批量重命名", "文件转换", "数据处理")

    # Quantity / range parameters
    quantities: List[str]  # quantity expressions (e.g. "100个", "所有")

    # Other constraints
    constraints: List[str]  # additional constraint clauses found in the text


@dataclass
class TaskDifference:
    """One reported difference between the current task and a historical task.

    Created by ``TaskMatcher`` when a compared feature differs between the
    two inputs.
    """
    category: str  # human-readable name of the difference category (e.g. '文件格式')
    field: str  # name of the ``TaskFeatures`` field the difference refers to
    current_value: str  # value(s) from the current task, rendered as text
    history_value: str  # value(s) from the historical task, rendered as text
    importance: str  # severity label: critical/high/medium/low


class TaskFeatureExtractor:
    """Rule-based extractor that turns free-form task text into ``TaskFeatures``.

    Everything here is heuristic: regular expressions find file formats,
    paths, file names and quantities, and substring lookups in the keyword
    tables below classify operations and naming rules.
    """

    # Known file extensions; group 1 is the extension without the dot.
    # The trailing lookahead keeps a known extension from matching as the
    # prefix of a longer one (".pyc" is not ".py", ".jsonl" is not ".json").
    # A lookahead is used rather than ``\b`` because ``\b`` would reject
    # ".csv文件", where CJK text follows the extension with no space.
    FILE_FORMAT_PATTERN = r'\.(txt|csv|json|xml|xlsx?|docx?|pdf|png|jpe?g|gif|mp[34]|avi|mov|zip|rar|7z|py|js|java|cpp|html?|css)(?![a-zA-Z0-9])'

    # Directory paths (Windows drive paths, absolute Unix paths, and paths
    # starting with "." or "/").
    # NOTE(review): the character classes include whitespace and CJK
    # characters, so a match keeps running into the words that follow the
    # path, and the last alternative also matches a bare ".txt" -- the
    # pattern favours recall over precision.
    DIR_PATH_PATTERN = r'(?:[a-zA-Z]:\\[\w\\\s\u4e00-\u9fa5.-]+|/[\w/\s\u4e00-\u9fa5.-]+|[./][\w/\\\s\u4e00-\u9fa5.-]+)'

    # File names: a stem followed by a dot and an alphanumeric extension.
    FILE_NAME_PATTERN = r'[\w\u4e00-\u9fa5.-]+\.[a-zA-Z0-9]+'

    # Quantities: a number plus a measure word, or an "all"/"batch" word.
    QUANTITY_PATTERN = r'(\d+\s*[个张份条篇页行列]|所有|全部|批量)'

    # Operation category -> trigger keywords (matched as substrings).
    OPERATION_KEYWORDS = {
        '重命名': ['重命名', '改名', '命名', '更名'],
        '转换': ['转换', '转为', '转成', '变成', '改成'],
        '批量处理': ['批量', '批处理', '一次性'],
        '复制': ['复制', '拷贝', 'copy'],
        '移动': ['移动', '转移', 'move'],
        '删除': ['删除', '清理', '移除'],
        '合并': ['合并', '整合', '汇总'],
        '分割': ['分割', '拆分', '切分'],
        '压缩': ['压缩', '打包'],
        '解压': ['解压', '解包', '提取'],
        '排序': ['排序', '排列'],
        '筛选': ['筛选', '过滤', '查找'],
        '统计': ['统计', '计数', '汇总'],
        '生成': ['生成', '创建', '制作'],
    }

    # Naming rule -> trigger keywords (matched as substrings).
    NAMING_PATTERNS = {
        '按日期': ['日期', '时间', 'date', 'time'],
        '按序号': ['序号', '编号', '数字', '顺序'],
        '按前缀': ['前缀', '开头'],
        '按后缀': ['后缀', '结尾'],
        '按内容': ['内容', '根据'],
    }

    def extract(self, user_input: str) -> "TaskFeatures":
        """Extract structured features from user input.

        Args:
            user_input: The raw task description.

        Returns:
            A ``TaskFeatures`` holding the raw text plus every feature the
            private ``_extract_*`` helpers derive from it.
        """
        return TaskFeatures(
            raw_input=user_input,
            keywords=self._extract_keywords(user_input),
            file_formats=self._extract_file_formats(user_input),
            directory_paths=self._extract_directory_paths(user_input),
            file_names=self._extract_file_names(user_input),
            naming_patterns=self._extract_naming_patterns(user_input),
            operations=self._extract_operations(user_input),
            quantities=self._extract_quantities(user_input),
            constraints=self._extract_constraints(user_input),
        )

    def _extract_keywords(self, text: str) -> Set[str]:
        """Return lowercase keyword tokens of length >= 2.

        This is a basic tokenizer: a token is a maximal run of CJK
        characters or of ASCII letters, so an unbroken CJK phrase becomes a
        single token.
        """
        tokens = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', text.lower())
        return {token for token in tokens if len(token) >= 2}

    def _extract_file_formats(self, text: str) -> Set[str]:
        """Return the known file extensions mentioned in ``text``.

        Matching is case-insensitive; results are lowercase and carry the
        leading dot (e.g. ``".txt"``).
        """
        extensions = re.findall(self.FILE_FORMAT_PATTERN, text.lower())
        return {f'.{ext}' for ext in extensions}

    def _extract_directory_paths(self, text: str) -> Set[str]:
        """Return directory-like paths found in ``text``, normalized.

        Each match is stripped of trailing whitespace (the pattern allows
        spaces inside a path, so a match may otherwise end with the blank
        that follows it) and then passed through ``Path.resolve()``, which
        makes relative paths absolute against the current working
        directory. If resolving fails, the stripped match is kept as-is.
        """
        normalized = set()
        for match in re.findall(self.DIR_PATH_PATTERN, text):
            candidate = match.rstrip()
            try:
                normalized.add(str(Path(candidate).resolve()))
            except (OSError, RuntimeError, ValueError):
                # Best effort: keep the textual form when the path cannot
                # be resolved on this platform.
                normalized.add(candidate)
        return normalized

    def _extract_file_names(self, text: str) -> Set[str]:
        """Return the distinct ``name.ext``-shaped tokens found in ``text``."""
        return set(re.findall(self.FILE_NAME_PATTERN, text))

    def _extract_naming_patterns(self, text: str) -> List[str]:
        """Return the naming rules whose keywords occur in ``text``.

        Matching is case-insensitive so the English keywords in
        ``NAMING_PATTERNS`` also hit capitalized input. Rules are returned
        in the table's declaration order.
        """
        lowered = text.lower()
        return [
            rule
            for rule, triggers in self.NAMING_PATTERNS.items()
            if any(trigger in lowered for trigger in triggers)
        ]

    def _extract_operations(self, text: str) -> Set[str]:
        """Return the operation categories whose keywords occur in ``text``.

        Matching is case-insensitive so the English keywords in
        ``OPERATION_KEYWORDS`` (``copy``, ``move``) also hit capitalized
        input.
        """
        lowered = text.lower()
        return {
            operation
            for operation, triggers in self.OPERATION_KEYWORDS.items()
            if any(trigger in lowered for trigger in triggers)
        }

    def _extract_quantities(self, text: str) -> List[str]:
        """Return quantity expressions in order of appearance (duplicates kept)."""
        return re.findall(self.QUANTITY_PATTERN, text)

    def _extract_constraints(self, text: str) -> List[str]:
        """Return the clauses of ``text`` that contain a condition keyword.

        A clause is a maximal run of characters without the full-width
        delimiters ``。``, ``，`` or ``；``. Each clause is reported once even
        when it contains several keywords (for example ``不包含`` also
        contains ``包含``); clauses are ordered by the first keyword that
        found them.
        """
        condition_keywords = ['如果', '当', '满足', '符合', '包含', '不包含', '大于', '小于', '等于']

        constraints: List[str] = []
        seen: Set[str] = set()
        for keyword in condition_keywords:
            if keyword not in text:
                continue
            # Grab the whole clause surrounding the keyword.
            pattern = f'[^。，；]*{re.escape(keyword)}[^。，；]*'
            for clause in re.findall(pattern, text):
                if clause not in seen:
                    seen.add(clause)
                    constraints.append(clause)
        return constraints


class TaskMatcher:
    """Scores how similar two task inputs are and lists how they differ."""

    def __init__(self):
        # One extractor is reused for every comparison.
        self.extractor = TaskFeatureExtractor()

    def calculate_similarity(
        self,
        current_input: str,
        history_input: str
    ) -> Tuple[float, List["TaskDifference"]]:
        """Compute the similarity of two tasks and collect their differences.

        Features are extracted from both inputs, each feature dimension is
        scored with Jaccard similarity, and the scores are combined with
        fixed weights that sum to 1.0.

        NOTE(review): a dimension in which neither input yields any feature
        scores 1.0 (see ``_jaccard_similarity``), so inputs with few
        extractable features still collect that dimension's full weight.

        Args:
            current_input: Text of the current task.
            history_input: Text of the historical task.

        Returns:
            ``(score, differences)`` where ``score`` is the weighted
            similarity in the range 0-1 and ``differences`` holds one
            ``TaskDifference`` per dimension that differs (keywords are
            scored but never reported as a difference).
        """
        current_features = self.extractor.extract(current_input)
        history_features = self.extractor.extract(history_input)

        differences = []
        scores = []  # (dimension name, similarity, weight)

        # 1. Keywords (weight 0.2)
        keyword_sim = self._jaccard_similarity(
            current_features.keywords,
            history_features.keywords
        )
        scores.append(('keywords', keyword_sim, 0.2))

        # 2. File formats (weight 0.15)
        format_sim, format_diffs = self._compare_sets(
            current_features.file_formats,
            history_features.file_formats,
            'file_formats',
            '文件格式',
            'high'
        )
        scores.append(('file_formats', format_sim, 0.15))
        differences.extend(format_diffs)

        # 3. Directory paths (weight 0.15)
        dir_sim, dir_diffs = self._compare_sets(
            current_features.directory_paths,
            history_features.directory_paths,
            'directory_paths',
            '目录路径',
            'critical'
        )
        scores.append(('directory_paths', dir_sim, 0.15))
        differences.extend(dir_diffs)

        # 4. Naming rules (weight 0.15)
        naming_sim, naming_diffs = self._compare_lists(
            current_features.naming_patterns,
            history_features.naming_patterns,
            'naming_patterns',
            '命名规则',
            'high'
        )
        scores.append(('naming_patterns', naming_sim, 0.15))
        differences.extend(naming_diffs)

        # 5. Operation types (weight 0.2)
        op_sim, op_diffs = self._compare_sets(
            current_features.operations,
            history_features.operations,
            'operations',
            '操作类型',
            'critical'
        )
        scores.append(('operations', op_sim, 0.2))
        differences.extend(op_diffs)

        # 6. Quantities (weight 0.1)
        qty_sim, qty_diffs = self._compare_lists(
            current_features.quantities,
            history_features.quantities,
            'quantities',
            '数量',
            'medium'
        )
        scores.append(('quantities', qty_sim, 0.1))
        differences.extend(qty_diffs)

        # 7. Constraints (weight 0.05)
        constraint_sim, constraint_diffs = self._compare_lists(
            current_features.constraints,
            history_features.constraints,
            'constraints',
            '约束条件',
            'medium'
        )
        scores.append(('constraints', constraint_sim, 0.05))
        differences.extend(constraint_diffs)

        # Weighted total across all dimensions.
        total_score = sum(score * weight for _, score, weight in scores)

        return total_score, differences

    def _jaccard_similarity(self, set1: Set, set2: Set) -> float:
        """Return the Jaccard similarity ``|A & B| / |A | B|`` of two sets.

        Two empty sets count as identical (1.0); exactly one empty set
        counts as completely different (0.0).
        """
        if not set1 and not set2:
            return 1.0
        if not set1 or not set2:
            return 0.0
        # Both sets are non-empty here, so the union cannot be empty.
        return len(set1 & set2) / len(set1 | set2)

    def _compare_sets(
        self,
        current: Set[str],
        history: Set[str],
        field: str,
        display_name: str,
        importance: str
    ) -> Tuple[float, List["TaskDifference"]]:
        """Compare two sets; return their similarity and any difference.

        Args:
            current: Values from the current task.
            history: Values from the historical task.
            field: Field name stored on the difference.
            display_name: Category label stored on the difference.
            importance: Importance label stored on the difference.

        Returns:
            ``(similarity, differences)``. ``differences`` is empty when
            the sets are equal; otherwise it holds one ``TaskDifference``
            listing, sorted, the elements unique to each side ('(无)' when
            a side has none).
        """
        similarity = self._jaccard_similarity(current, history)

        only_current = current - history
        only_history = history - current
        if not only_current and not only_history:
            return similarity, []

        difference = TaskDifference(
            category=display_name,
            field=field,
            current_value=', '.join(sorted(only_current)) if only_current else '(无)',
            history_value=', '.join(sorted(only_history)) if only_history else '(无)',
            importance=importance
        )
        return similarity, [difference]

    def _compare_lists(
        self,
        current: List[str],
        history: List[str],
        field: str,
        display_name: str,
        importance: str
    ) -> Tuple[float, List["TaskDifference"]]:
        """Compare two lists; return their similarity and any difference.

        Both the similarity and the decision to report a difference are
        based on the lists' distinct elements, so reordered or repeated
        items score 1.0 and produce no difference.

        Args:
            current: Values from the current task.
            history: Values from the historical task.
            field: Field name stored on the difference.
            display_name: Category label stored on the difference.
            importance: Importance label stored on the difference.

        Returns:
            ``(similarity, differences)``. When the distinct elements
            differ, ``differences`` holds one ``TaskDifference`` showing
            each full list in its original order ('(无)' for an empty
            list).
        """
        current_set = set(current)
        history_set = set(history)
        similarity = self._jaccard_similarity(current_set, history_set)

        if current_set == history_set:
            return similarity, []

        difference = TaskDifference(
            category=display_name,
            field=field,
            current_value=', '.join(current) if current else '(无)',
            history_value=', '.join(history) if history else '(无)',
            importance=importance
        )
        return similarity, [difference]


# Module-level singleton; stays None until the first get_task_matcher() call.
_matcher: Optional[TaskMatcher] = None


def get_task_matcher() -> TaskMatcher:
    """Return the shared ``TaskMatcher``, creating it on first use."""
    global _matcher
    if _matcher is not None:
        return _matcher
    _matcher = TaskMatcher()
    return _matcher