#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CSV到MD的实时同步脚本 - 文档库版本
监控CSV文件变化，自动更新document_library.md中的文档卡片内容
"""

import argparse
import csv
import html
import os
import re
import time
from pathlib import Path
from typing import List, Tuple, Optional

# Ordering constants used for sorting (per the original note: based on the
# filter order in document_library.md).
# Product tags: get_sort_key treats a tag as a "Products" tag when it appears
# in this list and uses the tag's position here as the secondary sort rank.
PRODUCTS_ORDER = [
    "SeekSoul Online",
    "ATAC + RNA 双组学",
    "空间转录组",
    "3' 转录组",
    "5' + 免疫组库",
    "全序列转录组",
    "FFPE 单细胞转录组",
    "甲基化 + RNA 双组学"
]

# Topic tags: consulted by get_sort_key only when a row has no Products tag;
# the position in this list is the secondary sort rank.
TOPIC_ORDER = [
    "分析指南",
    "Notebooks",
    "Method",
    "FAQ"
]

# Analysis-type tags: consulted by get_sort_key only when a row has neither a
# Products tag nor a Topic tag; the position in this list is the secondary
# sort rank.
ANALYSIS_TYPE_ORDER = [
    "数据整合",
    "批次效应",
    "基础分析",
    "细胞注释",
    "Peak-Gene Links分析",
    "SNP富集分析",
    "差异富集分析",
    "共定位分析",
    "绘图",
    "活性分析",
    "拷贝数变异分析",
    "拟时序分析",
    "生态位分析",
    "突变分析",
    "细胞通讯分析",
    "相关性分析"
]

def get_sort_key(row: List[str]) -> Tuple[int, int, int]:
    """
    Compute the sort key for one document row.

    Tags are read from the third column (``row[2]``) as a comma-separated
    string; blank entries are dropped and the rest are matched against
    PRODUCTS_ORDER, TOPIC_ORDER and ANALYSIS_TYPE_ORDER.

    Returns a tuple ``(category_rank, sub_rank, 0)``. The third slot is a
    placeholder that this function always sets to 0.

    Category ranks:
        1: exactly one Products tag (sub_rank = its index in PRODUCTS_ORDER)
        2: more than one Products tag (sub_rank = smallest index among them)
        3: no Products tag but at least one Topic tag
        4: only Analysis Type tags
        5: anything else, including rows with fewer than three columns
    """
    unranked = (5, 0, 0)
    if len(row) < 3:
        return unranked

    stripped = (piece.strip() for piece in row[2].split(','))
    labels = [label for label in stripped if label]

    def positions(order: List[str]) -> List[int]:
        # Index within `order` of every label that belongs to it.
        return [order.index(label) for label in labels if label in order]

    product_ranks = positions(PRODUCTS_ORDER)
    if len(product_ranks) == 1:
        # Category 1: a single Products tag.
        return (1, product_ranks[0], 0)
    if product_ranks:
        # Category 2: several Products tags, ranked by the earliest one.
        return (2, min(product_ranks), 0)

    topic_ranks = positions(TOPIC_ORDER)
    if topic_ranks:
        # Category 3: Topic tags without any Products tag.
        return (3, min(topic_ranks), 0)

    analysis_ranks = positions(ANALYSIS_TYPE_ORDER)
    if analysis_ranks:
        # Category 4: Analysis Type tags only.
        return (4, min(analysis_ranks), 0)

    # Category 5: no recognised tag at all.
    return unranked


def build_document_card(title: str, description: str, tags: str, link: str, date: str) -> str:
    """Build the HTML markup for a single document card.

    Every value is HTML-escaped before it is interpolated, including the
    tags, which appear both in the ``data-tags`` attribute and in the visible
    tag line. Without escaping, a tag containing ``"``, ``<`` or ``&`` would
    break out of the attribute or inject markup.

    Args:
        title: Card title; also used in the leading HTML comment.
        description: Short description shown under the title.
        tags: Comma-separated tag string. It is shown as given (escaped) in
            the tag line; for ``data-tags`` each tag is stripped, blank
            entries are dropped, and the rest are joined with ``,``.
        link: Target URL of the title link. When it ends with ``.ipynb``
            (case-insensitive, surrounding whitespace ignored) an extra
            download button is rendered.
        date: Date text shown on the right-hand side of the card.

    Returns:
        The ``<article>`` block, preceded by an HTML comment line, as a
        string without a trailing newline.
    """
    title_h = html.escape(title)
    description_h = html.escape(description)
    tags_h = html.escape(tags)
    link_h = html.escape(link)
    date_h = html.escape(date)

    # Normalise the tags for the data-tags attribute and escape the result so
    # quotes or angle brackets cannot terminate the attribute.
    tag_list = [tag.strip() for tag in tags.split(',') if tag.strip()]
    data_tags_h = html.escape(','.join(tag_list))

    # Visible tag line; uses the escaped text, never the raw CSV value.
    tag_display = f"标签：{tags_h}" if tags else ""

    # Optional download button for notebook links.
    download_btn = ""
    if link.lower().strip().endswith('.ipynb'):
        download_btn = (
            f'<a href="{link_h}" download style="display:inline-flex;align-items:center;padding:2px 8px;'
            'margin-left:8px;background-color:#f6f8fa;border:1px solid #d0d7de;'
            'border-radius:4px;color:#24292f;text-decoration:none;font-size:12px;">'
            '⬇ 导出 ipynb</a>'
        )

    # Assemble the card markup.
    return (
        '      <!-- 卡片：' + title_h + ' -->\n'
        f'      <article class="doc-card" data-tags="{data_tags_h}" style="border:1px solid #eee;border-radius:8px;padding:14px;box-shadow:0 1px 0 rgba(0,0,0,0.02);">\n'
        '        <div style="display:flex;justify-content:space-between;align-items:flex-start;gap:12px;">\n'
        '          <div style="flex:1;min-width:0;">\n'
        f'            <h4 style="margin:0 0 8px 0;font-size:16px;line-height:1.25;"><a href="{link_h}" target="_blank" style="color:#0b3b6f;text-decoration:none;">{title_h}</a></h4>\n'
        f'            <p style="margin:0 0 8px 0;color:#666;font-size:13px;">{description_h}</p>\n'
        f'            <div style="margin-top:6px;color:#0b3b6f;font-size:12px;" aria-hidden="true">{tag_display}{download_btn}</div>\n'
        '          </div>\n'
        f'          <div style="flex-shrink:0;color:#999;font-size:12px;">{date_h}</div>\n'
        '        </div>\n'
        '      </article>'
    )


def read_csv_rows(csv_path: str, encoding: str = 'utf-8') -> List[List[str]]:
    """Read the CSV file and return its data rows (header excluded).

    Rows that are empty or contain only whitespace are skipped. Each
    remaining row is padded with empty strings to at least five columns
    (title, description, tags, link, date). The first remaining row is
    treated as the header and dropped.

    Args:
        csv_path: Path of the CSV file to read.
        encoding: Text encoding used to decode the file; defaults to UTF-8.

    Returns:
        The data rows, or an empty list when the file has no data rows or
        cannot be read. A read failure is reported on stdout, not raised.
    """
    rows: List[List[str]] = []
    try:
        # newline='' hands newline handling to the csv module, so line breaks
        # embedded in quoted fields are preserved instead of being translated.
        with open(csv_path, 'r', encoding=encoding, newline='') as f:
            for row in csv.reader(f):
                # Skip blank rows (an empty row also has no non-blank field).
                if not any(field.strip() for field in row):
                    continue

                # Pad short rows to five columns; longer rows are left as-is.
                row.extend([''] * (5 - len(row)))
                rows.append(row)
    except (OSError, ValueError, LookupError, csv.Error) as e:
        # OSError: missing/unreadable file; ValueError: decoding problems;
        # LookupError: unknown encoding name; csv.Error: malformed CSV.
        print(f"读取CSV文件失败: {e}")
        return []

    # Drop the header row; slicing yields [] when there are no data rows.
    return rows[1:]


def extract_cards_section(md_text: str) -> Tuple[int, int]:
    """Locate the replaceable card region inside the Markdown text.

    The region starts right after the flex-column container's opening tag
    and ends at the closing ``</div></section></div>`` sequence. If a
    ``<div id="pager"`` element sits inside that span, the region instead
    ends at the start of the pager's line, so the pager is left untouched.

    Returns:
        ``(start, end)`` offsets into ``md_text``.

    Raises:
        ValueError: If the opening or the closing marker cannot be found.
    """
    opening = '    <div style="display:flex;flex-direction:column;gap:14px;">'
    closing = '    </div>\n  </section>\n</div>'

    open_at = md_text.find(opening)
    if open_at < 0:
        raise ValueError('未找到文档卡片区域开始标记')

    close_at = md_text.find(closing, open_at)
    if close_at < 0:
        raise ValueError('未找到文档卡片区域结束标记')

    body_start = open_at + len(opening)

    pager_at = md_text.find('<div id="pager"', body_start, close_at)
    if pager_at < 0:
        # No pager inside the container: the region runs to the closing marker.
        return body_start, close_at

    # Back up to the beginning of the pager's line to keep its indentation.
    newline_at = md_text.rfind('\n', body_start, pager_at)
    if newline_at < 0:
        return body_start, pager_at
    return body_start, newline_at + 1


def update_md_from_csv(csv_path: str, md_path: str, encoding: str = 'utf-8') -> bool:
    """Regenerate the card region of the Markdown file from the CSV rows.

    Steps: read the CSV rows, order them by the (category, sub-rank) part of
    get_sort_key while keeping CSV order among ties, render one card per row,
    then splice the cards into the region found by extract_cards_section and
    write the file back.

    Args:
        csv_path: Path of the source CSV file.
        md_path: Path of the Markdown file to rewrite in place.
        encoding: Encoding used to read and write the Markdown file.

    Returns:
        True on success; False when the CSV yields no rows or any step
        raises (the error is printed, not propagated).
    """
    try:
        records = read_csv_rows(csv_path)
        if not records:
            print("CSV文件为空或读取失败")
            return False

        # sorted() is stable, so rows with equal (category, sub-rank) keep
        # their original CSV order without an explicit index tiebreaker.
        ordered = sorted(records, key=lambda record: get_sort_key(record)[:2])

        # One card per row: title, description, tags, link, date.
        cards = [
            build_document_card(*record[:5])
            for record in ordered
            if len(record) >= 5
        ]

        with open(md_path, 'r', encoding=encoding) as src:
            document = src.read()

        # Replace everything between the region boundaries with the new cards.
        begin, finish = extract_cards_section(document)
        updated = ''.join(
            (document[:begin], '\n', '\n'.join(cards), '\n', document[finish:])
        )

        with open(md_path, 'w', encoding=encoding, newline='') as dst:
            dst.write(updated)

        print(f"已更新 {len(cards)} 条文档记录到 {md_path}")
        return True

    except Exception as e:
        print(f"更新失败: {e}")
        return False


def watch_csv_file(csv_path: str, md_path: str, encoding: str = 'utf-8', interval: float = 1.0):
    """Poll the CSV file and re-sync the Markdown file whenever it changes.

    Returns immediately (after printing a message) when either file does not
    exist. Otherwise loops until interrupted with Ctrl+C, comparing the CSV
    modification time against the last successfully synced one.

    Args:
        csv_path: Path of the CSV file to watch.
        md_path: Path of the Markdown file to update.
        encoding: Encoding forwarded to update_md_from_csv.
        interval: Seconds to sleep between checks.
    """
    csv_file = Path(csv_path)
    md_file = Path(md_path)

    if not csv_file.exists():
        print(f"CSV文件不存在: {csv_path}")
        return

    if not md_file.exists():
        print(f"MD文件不存在: {md_path}")
        return

    print(f"开始监控 {csv_path}")
    print(f"目标文件 {md_path}")
    print(f"检查间隔 {interval}秒")
    print("按 Ctrl+C 停止监控")

    last_mtime = csv_file.stat().st_mtime
    # Consecutive checks that found no change; drives the heartbeat message.
    idle_checks = 0

    try:
        while True:
            try:
                current_mtime = csv_file.stat().st_mtime
                # Compare with != rather than >: a file replaced by a copy
                # carrying an older timestamp is still a change.
                if current_mtime != last_mtime:
                    idle_checks = 0
                    print(f"\n检测到CSV文件变化 ({time.strftime('%H:%M:%S')})")
                    print(f"文件修改时间: {last_mtime} -> {current_mtime}")
                    if update_md_from_csv(csv_path, md_path, encoding):
                        last_mtime = current_mtime
                        print("更新成功，继续监控...")
                    else:
                        # last_mtime is left untouched so the sync is retried
                        # on the next check.
                        print("更新失败，继续监控...")
                else:
                    # Heartbeat once every 10 idle checks, independent of the
                    # polling interval and the wall clock.
                    idle_checks += 1
                    if idle_checks % 10 == 0:
                        print(f"监控中... 当前时间: {time.strftime('%H:%M:%S')}")
            except Exception as e:
                # Keep watching on transient errors (e.g. the file briefly
                # disappearing while an editor saves it).
                print(f"监控过程中出现错误: {e}")

            time.sleep(interval)

    except KeyboardInterrupt:
        print("\n监控已停止")


def main():
    """Command-line entry point: parse arguments and run a sync or the watcher.

    With ``--watch`` the CSV file is polled until interrupted. Otherwise a
    single sync is performed; if it fails the process exits with status 1 so
    that shells and CI jobs can detect the failure.

    Raises:
        SystemExit: With code 1 when the one-shot sync fails (argparse may
            also exit on invalid arguments).
    """
    print("Script starting...")
    parser = argparse.ArgumentParser(description='CSV到MD的实时同步工具 - 文档库版本')
    parser.add_argument('--csv', required=True, help='CSV文件路径')
    parser.add_argument('--md', required=True, help='MD文件路径')
    parser.add_argument('--encoding', default='utf-8', help='文件编码，默认utf-8')
    parser.add_argument('--watch', action='store_true', help='监控模式，实时同步')
    parser.add_argument('--interval', type=float, default=1.0, help='监控间隔（秒），默认1.0')

    args = parser.parse_args()

    if args.watch:
        watch_csv_file(args.csv, args.md, args.encoding, args.interval)
        return

    # One-shot sync.
    if update_md_from_csv(args.csv, args.md, args.encoding):
        print("同步完成")
    else:
        print("同步失败")
        # Signal the failure through the exit status, not only the message.
        raise SystemExit(1)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
