import datetime as _dt
import html
import json
import math
import os
import re
import threading
import traceback
from collections import Counter
from pathlib import Path
from tkinter import BOTH, DISABLED, END, LEFT, NORMAL, RIGHT, StringVar, Tk, filedialog, messagebox, ttk


# Application name: shown as the window title, the heading label and the
# caption of every message box.
APP_NAME = "电脑文件夹梳理索引页生成器"
# Runs of two or more characters in the U+4E00-U+9FFF range (CJK ideographs).
CHINESE_RE = re.compile(r"[\u4e00-\u9fff]{2,}")
# ASCII words: one letter followed by at least two letters, digits,
# underscores or hyphens (so three characters minimum).
WORD_RE = re.compile(r"[A-Za-z][A-Za-z0-9_\-]{2,}")

# Terms that collect_terms() never counts as keywords: common English words,
# programming-language keywords and generic Chinese filler words.
STOPWORDS = {
    "the", "and", "for", "with", "from", "this", "that", "have", "has", "are", "was", "were",
    "you", "your", "not", "but", "all", "can", "will", "into", "return", "class", "function",
    "const", "let", "var", "true", "false", "null", "none", "import", "export", "public", "private",
    "static", "void", "string", "number", "object", "array", "error", "data", "file", "path",
    "一个", "没有", "以及", "进行", "可以", "需要", "使用", "通过", "如果", "因为", "所以",
    "这个", "那个", "这些", "那些", "里面", "内容", "文件", "文件夹", "目录", "所有", "生成",
}


def human_size(size):
    """Format a byte count as a short human-readable string.

    Counts below 1024 are rendered as whole bytes (``"512 B"``). Larger
    counts are divided by 1024 per step and rendered with one decimal place
    in KB, MB, GB or TB; anything beyond the TB range stays expressed in TB.
    """
    amount = float(size)
    if amount < 1024:
        return f"{int(amount)} B"
    scaled_units = ("KB", "MB", "GB", "TB")
    final_position = len(scaled_units) - 1
    for position, label in enumerate(scaled_units):
        amount /= 1024
        # The largest unit absorbs every remaining magnitude.
        if amount < 1024 or position == final_position:
            return f"{amount:.1f} {label}"


def file_url(path):
    """Return the ``file:`` URI of *path* after making it absolute."""
    absolute_path = Path(path).absolute()
    return absolute_path.as_uri()


def split_camel_and_symbols(value):
    """Break an identifier-like string into space-separated pieces.

    A space is inserted wherever a lowercase ASCII letter is directly
    followed by an uppercase one, and every run of ``_ - . \\ /`` characters
    is replaced by a single space.
    """
    camel_boundary = re.compile(r"([a-z])([A-Z])")
    separator_run = re.compile(r"[_\-.\\/]+")
    spaced = camel_boundary.sub(r"\1 \2", value)
    return separator_run.sub(" ", spaced)


def collect_terms(text):
    """Count candidate keyword terms found in *text*.

    The text is first passed through ``split_camel_and_symbols``. ASCII words
    matched by ``WORD_RE`` are lower-cased and counted once per occurrence
    unless they are stopwords. Chinese runs matched by ``CHINESE_RE`` are
    handled by length: a run of up to six characters is counted as a whole
    with weight 3, while a longer run is broken into every 2-, 3- and
    4-character window, each counted once. Stopwords are skipped throughout.

    Returns a ``Counter`` mapping term to weight.
    """
    normalized = split_camel_and_symbols(text)
    tally = Counter()

    for raw_word in WORD_RE.findall(normalized):
        term = raw_word.lower()
        if term in STOPWORDS or term.isdigit():
            continue
        tally[term] += 1

    for segment in CHINESE_RE.findall(normalized):
        if segment in STOPWORDS:
            continue
        if len(segment) <= 6:
            # Short runs are likely complete phrases, so they get extra weight.
            tally[segment] += 3
            continue
        for width in (2, 3, 4):
            windows = (segment[start:start + width] for start in range(len(segment) - width + 1))
            tally.update(gram for gram in windows if gram not in STOPWORDS)
    return tally


def top_keywords(*texts, limit=12):
    """Return up to *limit* highest-scoring keywords drawn from *texts*.

    Term counts from ``collect_terms`` are merged across all texts (falsy
    texts are treated as empty). Each term's score is its count multiplied by
    ``log(len(term) + 1)``, with a 1.5x multiplier for terms containing a
    character in the U+4E00-U+9FFF range. Terms are returned best-first; ties
    on score fall back to descending term order.
    """
    merged = Counter()
    for chunk in texts:
        merged.update(collect_terms(chunk or ""))

    def weight(term, occurrences):
        has_cjk = any("\u4e00" <= ch <= "\u9fff" for ch in term)
        multiplier = 1.5 if has_cjk else 1.0
        return occurrences * multiplier * math.log(len(term) + 1)

    ranked = sorted(
        ((weight(term, occurrences), term) for term, occurrences in merged.items() if len(term) >= 2),
        reverse=True,
    )
    return [term for _, term in ranked[:limit]]


def scan_folder(root_path, log):
    """Walk *root_path* recursively and build the data for the HTML index.

    Directories named ``.git``, ``node_modules``, ``__pycache__``, ``.venv``
    and ``venv`` are not descended into. Keywords are derived from each
    file's name and relative folder only; file contents are never read.

    Args:
        root_path: Folder to scan.
        log: Callable taking one message string; used for progress reports.

    Returns:
        A dict with the keys ``root``, ``generatedAt``, ``fileCount``,
        ``folderCount``, ``totalSize``, ``totalSizeText``, ``topKeywords``,
        ``topFolders``, ``errors`` (at most 80 entries) and ``files`` (one
        dict per indexed file, sorted by relative path).

    Files and directories that cannot be processed are reported in
    ``errors`` instead of aborting the scan.
    """
    root = Path(root_path)
    skipped_dirs = {".git", "node_modules", "__pycache__", ".venv", "venv"}
    files = []
    folder_counter = Counter()
    total_size = 0
    errors = []
    all_keywords = Counter()

    def record_walk_error(walk_error):
        # os.walk would otherwise skip unreadable directories without a trace.
        errors.append(f"{walk_error.filename}: {walk_error.strerror}")

    paths = []
    for current, dirnames, filenames in os.walk(root, onerror=record_walk_error):
        # In-place assignment is what makes os.walk prune these directories.
        dirnames[:] = [d for d in dirnames if d not in skipped_dirs]
        paths.extend(Path(current) / filename for filename in filenames)

    total = len(paths)
    log(f"发现 {total} 个文件，开始分析文件名...")

    for idx, path in enumerate(paths, start=1):
        try:
            stat = path.stat()
            rel = path.relative_to(root)
            # Use one normalised, forward-slash folder label for both the
            # per-file "folder" field and the folder counter: the index page
            # filters with an exact string comparison between the two.
            parent_text = str(rel.parent)
            folder = parent_text.replace("\\", "/") if parent_text != "." else "根目录"
            name_text = " ".join([path.stem, path.suffix.lower(), parent_text])
            keywords = top_keywords(name_text)
            entry = {
                "name": path.name,
                "relativePath": str(rel).replace("\\", "/"),
                "folder": folder,
                "absolutePath": str(path),
                "fileUrl": file_url(path),
                "folderUrl": file_url(path.parent),
                "extension": path.suffix.lower() or "(无扩展名)",
                "size": stat.st_size,
                "sizeText": human_size(stat.st_size),
                "modified": _dt.datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M"),
                "keywords": keywords,
                "searchText": " ".join([path.name, str(rel), " ".join(keywords)]).lower(),
            }
        except Exception:
            errors.append(f"{path}: {traceback.format_exc(limit=1).strip()}")
        else:
            # Totals are updated only once the entry is complete, so the
            # summary numbers always describe exactly the files listed.
            files.append(entry)
            folder_counter[folder] += 1
            total_size += stat.st_size
            all_keywords.update(keywords)
        if idx % 100 == 0 or idx == total:
            log(f"已分析 {idx}/{total} 个文件")

    files.sort(key=lambda item: item["relativePath"].lower())
    return {
        "root": str(root),
        "generatedAt": _dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "fileCount": len(files),
        "folderCount": len(folder_counter),
        "totalSize": total_size,
        "totalSizeText": human_size(total_size),
        "topKeywords": [k for k, _ in all_keywords.most_common(30)],
        "topFolders": folder_counter.most_common(30),
        "errors": errors[:80],
        "files": files,
    }


# Self-contained index page. The two double-underscore placeholders (page
# title and inline JSON data) are filled in by write_html_index().
#
# Notes on the embedded script:
# - The "copy path" button carries the path in an HTML-escaped data attribute
#   and is handled by a delegated click listener. Building an inline onclick
#   string with encodeURIComponent is unsafe because that function leaves the
#   apostrophe unescaped, so a path such as "Bob's files" breaks the handler.
# - highlight() matches against the raw text and escapes each piece
#   afterwards, so a search term can never match inside an HTML entity.
HTML_TEMPLATE = r"""<!doctype html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>文件夹索引 - __TITLE__</title>
<style>
:root{color-scheme:light;--bg:#f6f7f9;--panel:#ffffff;--ink:#18202b;--muted:#667085;--line:#d9dee7;--accent:#0f766e;--accent-2:#334155;--mark:#fff3a3}
*{box-sizing:border-box}body{margin:0;background:var(--bg);color:var(--ink);font:14px/1.55 "Microsoft YaHei UI","Segoe UI",Arial,sans-serif}
header{position:sticky;top:0;z-index:2;background:rgba(246,247,249,.96);border-bottom:1px solid var(--line);backdrop-filter:blur(8px)}
.wrap{max-width:1180px;margin:0 auto;padding:18px 18px}
.top{display:grid;grid-template-columns:1fr auto;gap:16px;align-items:end}
h1{margin:0 0 4px;font-size:24px;letter-spacing:0}.root{color:var(--muted);word-break:break-all}
.stats{display:flex;gap:10px;flex-wrap:wrap;justify-content:flex-end}.stat{background:var(--panel);border:1px solid var(--line);border-radius:8px;padding:8px 10px;min-width:96px}.stat b{display:block;font-size:18px}
.toolbar{display:grid;grid-template-columns:1fr 180px 160px;gap:10px;margin-top:14px}
input,select{width:100%;height:40px;border:1px solid var(--line);border-radius:7px;background:#fff;color:var(--ink);padding:0 12px;font:inherit}
main.wrap{padding-top:16px}.chips{display:flex;gap:8px;flex-wrap:wrap;margin-bottom:14px}.chip{border:1px solid var(--line);background:#fff;border-radius:999px;padding:5px 10px;color:var(--accent-2);cursor:pointer}.chip:hover{border-color:var(--accent)}
.layout{display:grid;grid-template-columns:260px 1fr;gap:16px}.side{align-self:start;position:sticky;top:138px;background:#fff;border:1px solid var(--line);border-radius:8px;padding:12px;max-height:calc(100vh - 160px);overflow:auto}.side h2{font-size:14px;margin:0 0 8px}.folder{display:flex;justify-content:space-between;gap:8px;padding:6px;border-radius:6px;cursor:pointer}.folder:hover{background:#eef7f6}.folder span:first-child{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
.results-head{display:flex;justify-content:space-between;align-items:center;margin-bottom:10px;color:var(--muted)}
.file{background:#fff;border:1px solid var(--line);border-radius:8px;padding:12px;margin-bottom:10px}.file-title{display:flex;justify-content:space-between;gap:12px}.file-title a{font-size:16px;font-weight:700;color:var(--accent);text-decoration:none;word-break:break-all}.meta{color:var(--muted);font-size:12px;margin-top:3px;word-break:break-all}.keywords{display:flex;gap:6px;flex-wrap:wrap;margin-top:9px}.kw{background:#edf7f5;color:#0f5f58;border-radius:999px;padding:2px 7px;font-size:12px}.actions{display:flex;gap:8px;flex-shrink:0}.actions button,.actions a{height:30px;border:1px solid var(--line);border-radius:6px;background:#fff;color:#344054;text-decoration:none;padding:4px 8px;cursor:pointer;font:inherit;font-size:12px;white-space:nowrap}.actions button:hover,.actions a:hover{border-color:var(--accent);color:var(--accent)}mark{background:var(--mark);padding:0 1px}.empty{padding:34px;text-align:center;color:var(--muted);background:#fff;border:1px solid var(--line);border-radius:8px}
.errors{margin-top:16px;color:#7f1d1d;background:#fff7f7;border:1px solid #fecaca;border-radius:8px;padding:12px}.errors summary{cursor:pointer;font-weight:700}
@media (max-width:820px){.top,.toolbar,.layout{grid-template-columns:1fr}.stats{justify-content:flex-start}.side{position:static;max-height:220px}.file-title{display:block}.actions{margin-top:10px;flex-wrap:wrap}}
</style>
</head>
<body>
<header><div class="wrap">
<div class="top">
<div><h1>文件夹索引</h1><div class="root" id="root"></div></div>
<div class="stats"><div class="stat"><b id="fileCount"></b>文件</div><div class="stat"><b id="folderCount"></b>文件夹</div><div class="stat"><b id="totalSize"></b>总大小</div></div>
</div>
<div class="toolbar">
<input id="q" type="search" placeholder="搜索文件名、路径或关键词">
<select id="ext"><option value="">全部类型</option></select>
<select id="sort"><option value="path">按路径</option><option value="modified">按修改时间</option><option value="size">按大小</option><option value="name">按名称</option></select>
</div>
</div></header>
<main class="wrap">
<div class="chips" id="chips"></div>
<div class="layout">
<aside class="side"><h2>目录</h2><div id="folders"></div></aside>
<section>
<div class="results-head"><span id="resultCount"></span><span id="generatedAt"></span></div>
<div id="results"></div>
<details class="errors" id="errorsBox" hidden><summary>部分文件未能索引</summary><ul id="errors"></ul></details>
</section>
</div>
</main>
<script id="index-data" type="application/json">__DATA__</script>
<script>
const data = JSON.parse(document.getElementById("index-data").textContent);
const q = document.getElementById("q"), ext = document.getElementById("ext"), sort = document.getElementById("sort");
const results = document.getElementById("results");
let folderFilter = "";
document.getElementById("root").textContent = data.root;
document.getElementById("fileCount").textContent = data.fileCount;
document.getElementById("folderCount").textContent = data.folderCount;
document.getElementById("totalSize").textContent = data.totalSizeText;
document.getElementById("generatedAt").textContent = "生成于 " + data.generatedAt;
const exts = [...new Set(data.files.map(f=>f.extension))].sort();
for (const e of exts) ext.insertAdjacentHTML("beforeend", `<option value="${escapeHtml(e)}">${escapeHtml(e)}</option>`);
document.getElementById("chips").innerHTML = data.topKeywords.slice(0,18).map(k=>`<button class="chip" data-k="${escapeHtml(k)}">${escapeHtml(k)}</button>`).join("");
document.getElementById("chips").addEventListener("click", ev => { if(ev.target.dataset.k){ q.value = ev.target.dataset.k; render(); }});
document.getElementById("folders").innerHTML = `<div class="folder" data-folder=""><span>全部目录</span><b>${data.fileCount}</b></div>` + data.topFolders.map(([f,c])=>`<div class="folder" data-folder="${escapeHtml(f)}"><span title="${escapeHtml(f)}">${escapeHtml(f)}</span><b>${c}</b></div>`).join("");
document.getElementById("folders").addEventListener("click", ev => { const row = ev.target.closest(".folder"); if(row){ folderFilter = row.dataset.folder; render(); }});
results.addEventListener("click", ev => { const btn = ev.target.closest("button[data-path]"); if(btn){ copyPath(btn.dataset.path); }});
if (data.errors && data.errors.length) {
  document.getElementById("errorsBox").hidden = false;
  document.getElementById("errors").innerHTML = data.errors.map(e=>`<li>${escapeHtml(e)}</li>`).join("");
}
[q, ext, sort].forEach(el => el.addEventListener("input", render));
function escapeHtml(s){return String(s).replace(/[&<>"']/g, m=>({"&":"&amp;","<":"&lt;",">":"&gt;","\"":"&quot;","'":"&#39;"}[m]));}
function highlight(text, term){
  text = String(text || "");
  if(!term) return escapeHtml(text);
  const safe = term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(safe, "ig");
  let out = "", last = 0, m;
  while((m = re.exec(text)) !== null){
    if(!m[0].length){ re.lastIndex++; continue; }
    out += escapeHtml(text.slice(last, m.index)) + "<mark>" + escapeHtml(m[0]) + "</mark>";
    last = m.index + m[0].length;
  }
  return out + escapeHtml(text.slice(last));
}
function render(){
  const term = q.value.trim().toLowerCase();
  let list = data.files.filter(f => (!term || f.searchText.includes(term)) && (!ext.value || f.extension === ext.value) && (!folderFilter || f.folder === folderFilter));
  const mode = sort.value;
  list.sort((a,b)=>{
    if(mode === "modified") return b.modified.localeCompare(a.modified);
    if(mode === "size") return b.size - a.size;
    if(mode === "name") return a.name.localeCompare(b.name, "zh-Hans-CN");
    return a.relativePath.localeCompare(b.relativePath, "zh-Hans-CN");
  });
  document.getElementById("resultCount").textContent = `显示 ${list.length} / ${data.fileCount} 个文件`;
  if(!list.length){ results.innerHTML = `<div class="empty">没有匹配结果</div>`; return; }
  results.innerHTML = list.slice(0, 800).map(f => `
    <article class="file">
      <div class="file-title">
        <div><a href="${escapeHtml(f.fileUrl)}" title="打开文件">${highlight(f.name, term)}</a><div class="meta">${highlight(f.relativePath, term)} · ${f.sizeText} · ${f.modified}</div></div>
        <div class="actions"><a href="${escapeHtml(f.folderUrl)}">打开目录</a><button type="button" data-path="${escapeHtml(f.absolutePath)}">复制路径</button></div>
      </div>
      <div class="keywords">${f.keywords.map(k=>`<span class="kw">${highlight(k, term)}</span>`).join("")}</div>
    </article>`).join("") + (list.length > 800 ? `<div class="empty">结果较多，已显示前 800 条；继续输入关键词可缩小范围。</div>` : "");
}
function copyPath(path){ navigator.clipboard.writeText(path); }
render();
</script>
</body>
</html>
"""


def write_html_index(data, output_path):
    """Render *data* into ``HTML_TEMPLATE`` and write the page to *output_path*.

    Args:
        data: Index dict as produced by ``scan_folder``; must contain
            ``"root"`` and be JSON-serialisable.
        output_path: Destination file, written as UTF-8.
    """
    # "<" can only occur inside JSON string values, where \u003c is an
    # equivalent spelling. Escaping every "<" keeps any spelling of
    # "</script" or "<!--" in a file name from ending or corrupting the
    # inline <script> element that carries the data.
    payload = json.dumps(data, ensure_ascii=False).replace("<", "\\u003c")
    title = html.escape(Path(data["root"]).name or "root")
    substitutions = {"__TITLE__": title, "__DATA__": payload}
    # Substitute both placeholders in a single pass so that placeholder-like
    # text inside one replacement value (e.g. a folder literally named
    # "__DATA__") is never expanded again. A function replacement also avoids
    # backslash processing of the payload.
    content = re.sub(
        r"__TITLE__|__DATA__",
        lambda match: substitutions[match.group(0)],
        HTML_TEMPLATE,
    )
    Path(output_path).write_text(content, encoding="utf-8")


class App:
    """Tk window that lets the user pick a folder and generate its HTML index.

    Scanning and writing run on a background thread; every widget update
    coming from that thread is scheduled onto the Tk event loop with
    ``root.after``.
    """

    def __init__(self):
        """Create the main window, the form variables and the widgets."""
        self.root = Tk()
        self.root.title(APP_NAME)
        self.root.geometry("780x520")
        self.folder = StringVar()
        self.output = StringVar()
        # True while a worker thread is active; guards against double starts.
        self.running = False
        self.build_ui()

    def build_ui(self):
        """Lay out the folder/output rows, action buttons, progress bar and log list."""
        frame = ttk.Frame(self.root, padding=16)
        frame.pack(fill=BOTH, expand=True)
        ttk.Label(frame, text=APP_NAME, font=("", 16, "bold")).pack(anchor="w")
        ttk.Label(frame, text="选择一个文件夹，工具会递归扫描所有层级，并生成可搜索的 HTML 索引页。").pack(anchor="w", pady=(4, 14))

        row1 = ttk.Frame(frame)
        row1.pack(fill="x", pady=5)
        ttk.Label(row1, text="扫描文件夹", width=12).pack(side=LEFT)
        ttk.Entry(row1, textvariable=self.folder).pack(side=LEFT, fill="x", expand=True, padx=6)
        ttk.Button(row1, text="选择", command=self.choose_folder).pack(side=RIGHT)

        row2 = ttk.Frame(frame)
        row2.pack(fill="x", pady=5)
        ttk.Label(row2, text="输出索引页", width=12).pack(side=LEFT)
        ttk.Entry(row2, textvariable=self.output).pack(side=LEFT, fill="x", expand=True, padx=6)
        ttk.Button(row2, text="另存为", command=self.choose_output).pack(side=RIGHT)

        actions = ttk.Frame(frame)
        actions.pack(fill="x", pady=(12, 8))
        self.start_btn = ttk.Button(actions, text="开始生成索引页", command=self.start)
        self.start_btn.pack(side=LEFT)
        ttk.Button(actions, text="退出", command=self.root.destroy).pack(side=RIGHT)

        self.progress = ttk.Progressbar(frame, mode="indeterminate")
        self.progress.pack(fill="x", pady=(4, 10))

        # A single-column Treeview serves as a scrolling log list.
        self.log_box = ttk.Treeview(frame, columns=("message",), show="headings", height=14)
        self.log_box.heading("message", text="运行日志")
        self.log_box.column("message", width=720, anchor="w")
        self.log_box.pack(fill=BOTH, expand=True)

    def choose_folder(self):
        """Ask for the folder to scan and pre-fill a default output path inside it."""
        folder = filedialog.askdirectory(title="选择需要扫描的文件夹")
        if folder:
            self.folder.set(folder)
            default_name = f"{Path(folder).name or 'folder'}_index.html"
            self.output.set(str(Path(folder) / default_name))

    def choose_output(self):
        """Ask where to save the index page and store the chosen path."""
        initial = self.output.get() or "folder_index.html"
        path = filedialog.asksaveasfilename(
            title="保存索引页",
            initialfile=Path(initial).name,
            defaultextension=".html",
            filetypes=[("HTML 索引页", "*.html")],
        )
        if path:
            self.output.set(path)

    def log(self, message):
        """Append *message* to the log list; safe to call from the worker thread."""
        def append():
            self.log_box.insert("", END, values=(message,))
            self.log_box.yview_moveto(1)
        self.root.after(0, append)

    def start(self):
        """Validate the form and launch the background scan, unless one is already running."""
        if self.running:
            return
        folder = self.folder.get().strip()
        output = self.output.get().strip()
        if not folder or not Path(folder).is_dir():
            messagebox.showwarning(APP_NAME, "请先选择一个有效的扫描文件夹。")
            return
        if not output:
            output = str(Path(folder) / f"{Path(folder).name or 'folder'}_index.html")
            self.output.set(output)
        self.running = True
        self.start_btn.config(state=DISABLED)
        self.progress.start(12)
        threading.Thread(target=self.worker, args=(folder, output), daemon=True).start()

    def worker(self, folder, output):
        """Scan *folder*, write the index to *output* and report the outcome.

        Runs on the background thread started by ``start``. Success and
        failure dialogs are scheduled onto the Tk event loop, and ``finish``
        is always scheduled afterwards to restore the UI.
        """
        try:
            self.log(f"扫描目录：{folder}")
            data = scan_folder(folder, self.log)
            self.log("正在写入索引页...")
            write_html_index(data, output)
            self.log(f"完成：{output}")
            self.root.after(0, lambda: messagebox.showinfo(APP_NAME, f"索引页已生成：\n{output}"))
            try:
                # os.startfile exists only on Windows; opening the result is
                # a convenience, so a failure here must not count as a failed run.
                os.startfile(output)
            except Exception as open_error:
                self.log(f"无法自动打开索引页：{open_error}")
        except Exception as exc:
            # Build the text now: Python unbinds ``exc`` when this except
            # block ends, which happens before the scheduled callback runs,
            # so a lambda that referenced ``exc`` directly would fail.
            detail = str(exc) or repr(exc)
            self.log(detail)
            self.root.after(0, lambda: messagebox.showerror(APP_NAME, f"生成失败：\n{detail}"))
        finally:
            self.root.after(0, self.finish)

    def finish(self):
        """Return the UI to its idle state after a run ends."""
        self.running = False
        self.progress.stop()
        self.start_btn.config(state=NORMAL)

    def run(self):
        """Enter the Tk main loop; returns when the window is closed."""
        self.root.mainloop()


# Launch the GUI only when the file is executed as a script, not on import.
if __name__ == "__main__":
    App().run()
