#!/usr/bin/env python3
"""
Hermes file indexer v2

Scans the ~/files/ directory (following symlinks) and indexes text files
into ChromaDB. Incremental updates: changes are detected via a fast MD5
hash of each file.
"""

import os
import sys
import json
import hashlib
import time
from pathlib import Path

# Hugging Face mirror for restricted networks (comment out if unreachable).
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Scan root, on-disk vector store, and the JSON status file that records
# each indexed file's hash between runs (relative path -> md5 hex digest).
FILES_ROOT = Path('/home/ubuntu/files')
CHROMA_DIR = Path('/home/ubuntu/files/chroma_db')
STATUS_FILE = Path('/home/ubuntu/files/chroma_status.json')

# Directory names never descended into while scanning.
EXCLUDE_DIRS = {
    '.git', '__pycache__', 'node_modules',
    'venv', '.venv', 'chroma_db',
    'hermes-agent',
}

# Extensions treated as indexable text (whitelist used by the scanner).
TEXT_EXTS = {
    '.md', '.txt', '.py', '.js', '.ts', '.jsx', '.tsx',
    '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg',
    '.html', '.css', '.scss', '.less', '.xml',
    '.sh', '.bash', '.zsh',
    '.cpp', '.c', '.h', '.hpp', '.java', '.go', '.rs', '.rb',
    '.sql', '.r', '.swift', '.kt',
}

# Known binary / non-indexable extensions.
# NOTE(review): SKIP_EXTS is not referenced anywhere in this file's visible
# code — the scanner whitelists TEXT_EXTS only; confirm whether it is used
# elsewhere or is dead configuration.
SKIP_EXTS = {
    '.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.bmp', '.ico',
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
    '.mp3', '.mp4', '.avi', '.mov', '.wav', '.flac',
    '.exe', '.dll', '.so', '.dylib',
    '.db', '.sqlite',
    '.lock', '.log',
}


def get_file_hash(path):
    """Fast content fingerprint: MD5 over the file size plus the first 64 KB.

    Hashing only the head keeps large files cheap, but a pure head-hash
    misses data appended past the 64 KB window — such files would never be
    re-indexed. Mixing the total size into the digest catches any change
    that grows or shrinks the file, at no extra I/O cost.

    Note: changing the fingerprint formula invalidates previously stored
    hashes, causing a one-time full re-index on the next run.
    """
    hasher = hashlib.md5()
    # Fold in the byte size so growth beyond the sampled head is detected.
    hasher.update(str(os.path.getsize(path)).encode())
    with open(path, 'rb') as f:
        hasher.update(f.read(65536))
    return hasher.hexdigest()


def chunk_text(text, filename, chunk_size=512, overlap=64):
    """Split *text* into paragraph-aligned chunks for embedding.

    Paragraphs (blank-line separated) are accumulated until adding the
    next one would exceed ``chunk_size`` characters; the last paragraph of
    each chunk is carried into the next chunk as overlap context. A single
    paragraph longer than ``chunk_size`` is emitted as its own chunk.

    Bug fixes vs. the previous version:
      * ``chunk_id`` now embeds ``filename`` — it previously was the
        literal ``"(unknown)#chunkN"``, so IDs collided across files
        (fatal, since main() uses them as ChromaDB document ids).
      * An oversized paragraph arriving while the buffer is empty is now
        emitted instead of being silently merged into one giant chunk.

    ``overlap`` is unused (overlap is paragraph-granular, not character-
    granular); it is kept for interface compatibility.

    Returns a list of dicts: {'text', 'source', 'chunk_id'}.
    """
    import re

    chunks = []

    def emit(body):
        # chunk_id doubles as the ChromaDB id, so it must be unique per file.
        chunks.append({
            'text': body,
            'source': filename,
            'chunk_id': f"{filename}#chunk{len(chunks)}"
        })

    current = []
    current_len = 0
    for para in re.split(r'\n\s*\n', text):
        para = para.strip()
        if not para:
            continue
        para_len = len(para)

        # A paragraph that alone exceeds the limit: flush the buffer,
        # then emit the paragraph as a standalone chunk.
        if para_len > chunk_size:
            if current:
                emit('\n\n'.join(current))
            emit(para)
            current = []
            current_len = 0
            continue

        if current_len + para_len > chunk_size and current:
            emit('\n\n'.join(current))
            # Carry the last paragraph forward as overlap context.
            tail = current[-1]
            current = [tail]
            current_len = len(tail)

        current.append(para)
        current_len += para_len

    if current:
        emit('\n\n'.join(current))

    return chunks


def scan_files_follow_links(root_dir):
    """Walk *root_dir* following symlinks and return indexable file paths.

    Only files whose extension is in TEXT_EXTS are collected; directories
    named in EXCLUDE_DIRS and hidden directories are pruned.

    Fix: ``os.walk(followlinks=True)`` has no cycle protection — a symlink
    pointing back at an ancestor loops forever. We track the realpath of
    every visited directory and prune any directory already seen (this
    also prevents indexing the same directory twice via two symlinks).
    """
    files = []
    visited = set()
    for dirpath, dirnames, filenames in os.walk(root_dir, followlinks=True):
        real = os.path.realpath(dirpath)
        if real in visited:
            dirnames.clear()  # symlink cycle / duplicate — don't descend
            continue
        visited.add(real)

        rel = os.path.relpath(dirpath, root_dir)
        parts = rel.replace('\\', '/').split('/')
        if any(excl in parts for excl in EXCLUDE_DIRS):
            dirnames.clear()
            continue
        # Hidden directory anywhere on the relative path: skip it and
        # prune the walk (descendants could never yield files anyway).
        if any(p.startswith('.') and p not in {'.', '..'} for p in parts):
            dirnames.clear()
            continue

        for fname in filenames:
            if os.path.splitext(fname)[1].lower() in TEXT_EXTS:
                files.append(os.path.join(dirpath, fname))
    return files


def main():
    """Incrementally (re)index text files under FILES_ROOT into ChromaDB.

    Workflow:
      1. Load the previous run's {relpath: md5} status file.
      2. Scan FILES_ROOT (following symlinks) and hash every text file.
      3. Embed and store chunks for new/changed files only.
      4. Delete chunks whose source file no longer exists on disk.
      5. Persist the new status file for the next run.
    """
    # Heavy third-party imports stay function-local so the module can be
    # imported (e.g. for chunk_text) without pulling in torch/chromadb.
    from sentence_transformers import SentenceTransformer
    import chromadb
    from chromadb.config import Settings

    print(f"🚀 Hermes 文件索引器 v2 启动")
    print(f"📂 扫描目录: {FILES_ROOT}")

    # Previous run's status: relative path -> md5 fingerprint.
    old_status = {}
    if STATUS_FILE.exists():
        with open(STATUS_FILE) as f:
            old_status = json.load(f)
    print(f"📊 上次索引了 {len(old_status)} 个文件")

    files = scan_files_follow_links(FILES_ROOT)
    print(f"📄 找到 {len(files)} 个文本文件")

    # Hash each file exactly once (previously changed files were rehashed
    # a second time after indexing). new_status is rebuilt from the files
    # actually on disk instead of seeded from old_status, so entries for
    # deleted files drop out and the cleanup pass below can find them.
    changed_files = []
    new_hashes = {}
    new_status = {}
    unchanged_count = 0
    for fpath in files:
        try:
            rel = os.path.relpath(fpath, FILES_ROOT)
            h = get_file_hash(fpath)
            new_hashes[rel] = h
            if old_status.get(rel) != h:
                changed_files.append(fpath)
            else:
                new_status[rel] = h
                unchanged_count += 1
        except Exception as e:
            print(f"  ⚠️  跳过 {fpath}: {e}")

    print(f"🆕 新增/变更: {len(changed_files)} 个")
    print(f"⏭️  未变化: {unchanged_count} 个")

    if not changed_files:
        # NOTE(review): a run with only deletions exits here before the
        # cleanup pass — this matches the original behavior; confirm it
        # is acceptable.
        print("✅ 没有需要更新的文件")
        return

    print("🧠 加载嵌入模型...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    model_dim = model.get_embedding_dimension()
    print(f"   模型维度: {model_dim}")

    print("🗄️  连接向量数据库...")
    CHROMA_DIR.mkdir(parents=True, exist_ok=True)
    client = chromadb.PersistentClient(
        path=str(CHROMA_DIR),
        settings=Settings(anonymized_telemetry=False)
    )

    # Fetch or create the collection. The previous count == 0 branch
    # called collection.delete(where={}) — Chroma rejects an empty where
    # clause — and the bare except then fell into create_collection on a
    # name that already exists, which raises. That dead branch is gone.
    collection_name = 'hermes_files'
    try:
        collection = client.get_collection(collection_name)
        print(f"   已有集合，当前 {collection.count()} 条")
    except Exception:
        collection = client.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        print(f"   创建新集合")

    total_chunks = 0
    batch_texts = []
    batch_metadatas = []
    batch_ids = []
    batch_size = 32  # chunks per embedding call

    def flush_batch():
        """Embed and persist the pending chunk batch, then reset it."""
        nonlocal batch_texts, batch_metadatas, batch_ids
        if not batch_texts:
            return
        embeddings = model.encode(batch_texts, show_progress_bar=False)
        collection.add(
            embeddings=embeddings.tolist(),
            documents=batch_texts,
            metadatas=batch_metadatas,
            ids=batch_ids
        )
        batch_texts = []
        batch_metadatas = []
        batch_ids = []

    for i, fpath in enumerate(changed_files):
        rel = os.path.relpath(fpath, FILES_ROOT)
        print(f"  [{i+1}/{len(changed_files)}] {rel}", end=' ')
        sys.stdout.flush()

        try:
            with open(fpath, 'r', encoding='utf-8', errors='replace') as f:
                text = f.read()
            if len(text) < 20:
                print("⏭️  太短")
                continue

            chunks = chunk_text(text, rel)
            if not chunks:
                print("⏭️  无内容")
                continue

            # Drop this file's old chunks before re-adding. Best effort:
            # the file may simply never have been indexed before.
            try:
                collection.delete(where={"source": rel})
            except Exception:
                pass

            for c in chunks:
                batch_texts.append(c['text'])
                batch_metadatas.append({
                    'source': c['source'],
                    'chunk_id': c['chunk_id']
                })
                batch_ids.append(c['chunk_id'])
                total_chunks += 1

            if len(batch_texts) >= batch_size:
                flush_batch()

            # Reuse the hash computed during change detection.
            new_status[rel] = new_hashes[rel]
            print(f"✅ {len(chunks)} 块")

        except Exception as e:
            print(f"❌ {e}")

    flush_batch()

    # Remove chunks whose source file no longer exists on disk. The
    # previous code passed include=[] here, which omits metadatas from
    # the result entirely, so this cleanup could never find any sources.
    try:
        existing = collection.get(include=["metadatas"], limit=100000)
        existing_sources = {m['source'] for m in (existing.get('metadatas') or [])}
        for src in existing_sources - set(new_hashes.keys()):
            collection.delete(where={"source": src})
            print(f"🗑️  清理: {src}")
    except Exception as e:
        # Cleanup stays best-effort, but failures are now visible.
        print(f"  ⚠️  清理失败: {e}")

    # Persist status for the next incremental run.
    with open(STATUS_FILE, 'w') as f:
        json.dump(new_status, f, indent=2)

    print(f"\n✅ 索引完成!")
    print(f"   总文件: {len(new_status)}")
    print(f"   总块数: {total_chunks}")
    print(f"   集合总量: {collection.count()} 条")


# Script entry point: run the indexer when executed directly.
if __name__ == '__main__':
    main()
