First Commit

8edf8925 · zhengzifu · 8edf8925 · 8edf8925 · 8edf8925 · 8edf8925
Commit 8edf8925 authored Mar 28, 2025 by zhengzifu
71 changed files
--- a/.gitignore
+++ b/.gitignore
+__pycache__
+.DS_Store
+.venv
+.vscode
+outputs
+outputs-qwen
+weights
+model.safetensors
+001-H-LLM
+build
+src-Optimize_HN
+*.egg-info
+*.egg
+*.so
--- a/.python-version
+++ b/.python-version
+3.11
--- a/archived/generate_HN.py
+++ b/archived/generate_HN.py
+# %%
+import sys
+import numpy as np
+from pyrilog import (
+    VerilogGenerator,
+    ModuleBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_assign,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from config import CFG
+from tqdm import tqdm
+import pickle
+import json
+
+# Root output directory; also reused as the prefix of generated file and module names.
+path_dir = "Optimized_HN"
+
+
+def calculate_WW(matrix: np.array, value_range):
+    """Compute the worst-case per-row occurrence count of every candidate value.
+
+    For each entry of ``value_range`` every row of ``matrix`` is scanned and
+    the elements lying within 0.01 of that entry are counted; the largest
+    row count is kept.
+
+    Returns:
+        A list with one integer per entry of ``value_range``, in the same order.
+    """
+
+    def count_hits(row, target):
+        # Tolerant comparison (within 0.01) rather than exact equality.
+        return sum(1 for x in row if abs(x - target) <= 0.01)
+
+    return [
+        max([count_hits(row, target) for row in matrix]) for target in value_range
+    ]
+
+
+def find_index(arr, target, epsilon=1e-3):
+    """Locate the entry of ``arr`` that equals ``target`` up to ``epsilon``.
+
+    Returns:
+        Index of the first entry whose distance to ``target`` is minimal,
+        provided that minimal distance is strictly below ``epsilon``.
+
+    Raises:
+        ValueError: If no entry lies within ``epsilon`` of ``target``.
+    """
+    distances = np.abs(np.array(arr) - target)  # element-wise distance to target
+    closest = np.min(distances)
+    if not closest < epsilon:
+        # Even the nearest entry is too far away to count as a match.
+        raise ValueError("No match found")
+    # First position at which the minimal distance occurs.
+    return np.where(distances == closest)[0][0]
+
+
+# %%
+def generate_module(
+    matrix,
+    module_name="HN",
+    H=16,
+    L=5,
+    value_range=[-1, 1],
+    WW=[8, 8],
+    CUR_VN=0,
+    weights_file_name="",
+):
+    """Build the HN routing module from precomputed node and colouring files.
+
+    Reads ``output/<weights_file_name>/<weights_file_name>_vn_<CUR_VN>.json``
+    (key ``"node"``) and, for every entry ``i`` of that node list, the matching
+    ``..._vn_<CUR_VN>_value_<value_range[i]>.json`` file (key ``"color"``).
+    Each colour list assigns input ``j`` to a mux port (``-1`` = unassigned);
+    inputs sharing a port are stacked on successive rows of ``HN_out_<i>``
+    and every unused slot is tied to ``CST_LOW``.
+
+    Args:
+        matrix: Accepted for call compatibility; not read here.
+        module_name: Name of the emitted module.
+        H: Value of the ``H`` module parameter.
+        L: Value of the ``L`` module parameter.
+        value_range: Values whose colouring files are looked up by position.
+            The default list is shared between calls but never mutated.
+        WW: Accepted for call compatibility; not read here.
+        CUR_VN: Index used in the JSON file names.
+        weights_file_name: Sub-directory and file-name prefix of the JSON inputs.
+
+    Returns:
+        The ``ModuleBlock`` that was populated.
+
+    Raises:
+        ValueError: If a colour list has no assigned (non ``-1``) entry.
+    """
+    from collections import Counter
+
+    result_dir = os.path.join("output", weights_file_name)
+    with ModuleBlock(f"{module_name}") as module:
+        add_parameter("H", H)
+        add_parameter("L", L)
+
+        add_input("HN_in", "H")
+        add_input("CST_LOW")
+        node_file = os.path.join(result_dir, f"{weights_file_name}_vn_{CUR_VN}.json")
+        # Context managers close the JSON files promptly; the handles used to
+        # be left open until garbage collection.
+        with open(node_file) as f:
+            node = json.load(f)["node"]
+        # Internal wiring, one output bus per entry of the node list.
+        for i, hn_in_layers in enumerate(node):
+            color_file = os.path.join(
+                result_dir,
+                f"{weights_file_name}_vn_{CUR_VN}_value_{value_range[i]}.json",
+            )
+            with open(color_file) as f:
+                mux_port = json.load(f)["color"]
+            max_mux_port = max(mux_port) + 1
+            add_parameter(f"WW_{i}", max_mux_port)
+            # Rows needed = how often the most used port occurs (-1 is ignored).
+            port_usage = Counter(mux_port)
+            port_usage.pop(-1, None)
+            max_L = max(port_usage.values())
+            add_output(
+                f"HN_out_{i}",
+                f"WW_{i}",
+                f"{max_L}",
+            )
+            # First index: how many times the port has been used so far;
+            # second index: the assigned colour, i.e. the mux port.
+            hn_out = [[-1 for _ in range(max_mux_port)] for _ in range(max_L)]
+            used_mux_port = [0 for _ in range(max_mux_port)]
+            for j, _ in enumerate(hn_in_layers):
+                port = mux_port[j]
+                if port == -1:
+                    continue
+                hn_out[used_mux_port[port]][port] = j
+                used_mux_port[port] += 1
+            for j in range(max_L):
+                for k in range(max_mux_port):
+                    if hn_out[j][k] == -1:
+                        add_assign(f"HN_out_{i}", [j, k], "CST_LOW", [])
+                    else:
+                        add_assign(f"HN_out_{i}", [j, k], "HN_in", [hn_out[j][k]])
+
+    return module
+
+
+# %%
+def process_task(i, weights_file_name, matrix, H, L):
+    """Generate the module for matrix ``i`` and write it to its ``.sv`` file.
+
+    The file is written to
+    ``<path_dir>/<weights_file_name>/<path_dir>_tp_<weights_file_name>_vc_<i>.sv``.
+
+    Returns:
+        ``i`` on success, ``None`` when any step failed (the error is printed
+        rather than raised).
+    """
+    try:
+        WW = calculate_WW(matrix, CFG.value_range)
+        file_dir = os.path.join(path_dir, weights_file_name)
+        os.makedirs(file_dir, exist_ok=True)
+        module_name = f"{path_dir}_tp_{weights_file_name}_vc_{i}"
+        file_name = os.path.join(file_dir, f"{module_name}.sv")
+        # Render before opening the file so that a generation failure does not
+        # leave an empty or truncated output file behind.
+        text = generate_module(
+            matrix,
+            module_name=module_name,
+            H=H,
+            L=L,
+            value_range=CFG.value_range,
+            WW=WW,
+            CUR_VN=i,
+            weights_file_name=weights_file_name,
+        ).generate()
+        with open(file_name, "w") as f:
+            f.write(text)
+        return i  # task id, returned so the caller can track progress
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+
+
+def run():
+    """Generate one ``.sv`` file per matrix of the configured weights file.
+
+    Only the entry of ``mapped_weights`` whose name equals ``CFG.run_weights``
+    is processed. Its pickled array is unpacked as ``(VN, L, H)`` and one
+    ``process_task`` job per leading index is dispatched to a process pool of
+    ``CFG.num_workers`` workers.
+    """
+    os.makedirs(path_dir, exist_ok=True)
+    for weights_file in os.listdir("mapped_weights"):
+        if weights_file != CFG.run_weights:
+            continue
+        weights_path = os.path.join("mapped_weights", weights_file)
+        weights_file_name = os.path.splitext(weights_file)[0]
+        print(f"Processing {weights_file_name}")
+        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
+        with open(weights_path, "rb") as f:
+            print(f"Loading {weights_file_name}")
+            matrixs = pickle.load(f)
+        VN, L, H = matrixs.shape
+        with ProcessPoolExecutor(max_workers=CFG.num_workers) as executor:
+            # Remember which task each future belongs to. The handler used to
+            # read `result`, which is unbound when result() raises, so the
+            # error report itself crashed with a NameError.
+            futures = {
+                executor.submit(
+                    process_task, i, weights_file_name, matrixs[i], H, L
+                ): i
+                for i in range(VN)
+            }
+            for future in tqdm(as_completed(futures), total=VN):
+                try:
+                    future.result()
+                except Exception as e:
+                    print(f"Generating {futures[future]} failed with an error: {e}")
+
+
+if __name__ == "__main__":
+    run()
+# %%
--- a/archived/generate_fsm_0327.py
+++ b/archived/generate_fsm_0327.py
--- a/archived/generate_weights.py
+++ b/archived/generate_weights.py
+import numpy as np
+import pickle
+import os
+import sys
+from config import CFG
+
+# NOTE(review): the statement below only references the name `exit` without
+# calling it, so this guard has no effect and execution continues in "run"
+# mode. Confirm whether an actual early exit was intended before changing it.
+if CFG.mode == "run":
+    exit
+
+
+def run():
+    """Create random test weights and store them as a pickle and a text dump.
+
+    The array has shape ``CFG.test_weights_shape`` with entries drawn from
+    ``CFG.value_range``. It is pickled to ``CFG.weights_dir/CFG.test_weights``
+    and written next to it as ``<stem>.txt``: one value per line, with two
+    blank lines after each row.
+    """
+    shape = CFG.test_weights_shape
+    weights = np.random.choice(CFG.value_range, shape)
+    filename_pkl = os.path.join(CFG.weights_dir, CFG.test_weights)
+    # splitext removes only the final extension, so a name that contains
+    # additional dots keeps its full stem (split(".")[0] truncated it).
+    stem = os.path.splitext(CFG.test_weights)[0]
+    filename_txt = os.path.join(CFG.weights_dir, stem + ".txt")
+    with open(filename_pkl, "wb") as f:
+        pickle.dump(weights, f)
+    with open(filename_txt, "w") as f:
+        for row in weights:
+            f.writelines(f"{val}\n" for val in row)
+            f.write("\n\n")
--- a/archived/makefile
+++ b/archived/makefile
+# Compiler and flags used to build the target below.
+CXX = g++
+CXXFLAGS = -std=c++11 -Wall -pthread
+
+TARGET = optimize_HN
+SRC = optimize_HN.cpp
+
+# Rebuild the target whenever the source file changes.
+$(TARGET): $(SRC)
+	$(CXX) $(CXXFLAGS) -o $(TARGET) $(SRC)
+
+# `clean` names no file; declaring it phony makes the rule run every time.
+.PHONY: clean
+clean:
+	rm -f $(TARGET)
--- a/archived/optimize_HN.cpp.bak
+++ b/archived/optimize_HN.cpp.bak
--- a/archived/pyrilog.py
+++ b/archived/pyrilog.py
--- a/archived/run_all.py
+++ b/archived/run_all.py
+import os
+import sys
+import pickle
+from config import CFG
+import generate_ww
+import generate_sub_wrapper
+import generate_lm
+import generate_wt_group
+import generate_wrappers
+import generate_mid_wrapper
+import generate_wallace
+import generate_fsm
+import generate_hn
+
+
+def run_generate_verilog():
+    """Run every generator in the fixed pipeline order, reporting each step."""
+    pipeline = (
+        (generate_ww, "生成WW完成"),
+        (generate_sub_wrapper, "生成Sub_wrapper完成"),
+        (generate_hn, "生成HN完成"),
+        (generate_lm, "生成Layer_mux完成"),
+        (generate_wt_group, "生成WT_group完成"),
+        (generate_wrappers, "生成Wrappers完成"),
+        (generate_mid_wrapper, "生成Mid_wrappers完成"),
+        (generate_wallace, "生成Wallace Tree完成"),
+        (generate_fsm, "生成FSM完成"),
+    )
+    for stage, done_message in pipeline:
+        stage.run()
+        print(done_message)
+
+
+def print_menu():
+    """Show the generation menu and return the user's raw answer string."""
+    menu_lines = (
+        "\n可用的生成选项：",
+        "1. 生成 WW",
+        "2. 生成 Sub_wrapper",
+        "3. 生成 HN",
+        "4. 生成 Layer_mux",
+        "5. 生成 WT_group",
+        "6. 生成 Wrappers",
+        "7. 生成 Mid_wrappers",
+        "8. 生成 Wallace Tree",
+        "9. 生成 FSM",
+        "10. 生成全部",
+        "0. 退出",
+    )
+    for line in menu_lines:
+        print(line)
+    return input("请选择要生成的模块 (0-10): ")
+
+
+def run_selected_generate(choice):
+    """Run the generator selected by the menu string ``choice``.
+
+    ``"1"`` to ``"9"`` run a single generator and print its completion
+    message, ``"10"`` runs the whole pipeline, and any other value does
+    nothing.
+    """
+    if choice == "10":
+        run_generate_verilog()
+        return
+
+    single_steps = {
+        "1": (generate_ww, "生成WW完成"),
+        "2": (generate_sub_wrapper, "生成Sub_wrapper完成"),
+        "3": (generate_hn, "生成HN完成"),
+        "4": (generate_lm, "生成Layer_mux完成"),
+        "5": (generate_wt_group, "生成WT_group完成"),
+        "6": (generate_wrappers, "生成Wrappers完成"),
+        "7": (generate_mid_wrapper, "生成Mid_wrappers完成"),
+        "8": (generate_wallace, "生成Wallace Tree完成"),
+        "9": (generate_fsm, "生成FSM完成"),
+    }
+    selected = single_steps.get(choice)
+    if selected is None:
+        return  # unknown choices are silently ignored
+    stage, done_message = selected
+    stage.run()
+    print(done_message)
+
+
+if __name__ == "__main__":
+    # Ask once how the weights are selected, then serve menu choices until "0".
+    print("请选择运行模式：")
+    print(
+        f"1. 使用 run_weights_batch（批量运行）, 当前权重文件：{CFG.run_weights_batch}"
+    )
+    print(f"2. 使用 run_weights（单次运行）, 当前权重文件：{CFG.run_weights}")
+    mode = input("请选择 (1/2): ")
+
+    if mode not in ("1", "2"):
+        print("无效的选择！")
+    else:
+        batch_mode = mode == "1"
+        while (choice := print_menu()) != "0":
+            if not batch_mode:
+                run_selected_generate(choice)
+                continue
+            # Batch mode: point the shared config at each weights file in turn.
+            for weights in CFG.run_weights_batch:
+                print(f"\n正在处理 weights: {weights}")
+                CFG.run_weights = weights
+                run_selected_generate(choice)
--- a/archived/setup.py
+++ b/archived/setup.py
+from setuptools import setup, Extension
+import pybind11
+
+# Single C++ extension compiled from optimize_HN.cpp against the pybind11 headers.
+ext_modules = [
+    Extension(
+        name="optimize_HN",
+        sources=["optimize_HN.cpp"],
+        include_dirs=[pybind11.get_include()],
+        language="c++",
+        extra_compile_args=["-std=c++11"],
+        extra_link_args=["-static-libstdc++"],
+    ),
+]
+
+setup(name="optimize_HN", version="0.1", ext_modules=ext_modules)
\ No newline at end of file
--- a/archived/simulate_origin_hn.py
+++ b/archived/simulate_origin_hn.py
+# %%
+from multiprocessing import Pool
+import pickle
+import numpy as np
+from tqdm import tqdm
+from prettytable import PrettyTable
+import os
+from concurrent.futures import ProcessPoolExecutor
+from hllm.config import CFG
+from hllm.utils import calculate_WW, find_index
+
+
+def get_bit(num, i):
+    """Return bit ``i`` of ``num``; a negative ``i`` yields 0."""
+    return 0 if i < 0 else (num >> i) & 1
+
+
+# %%
+class HN:
+    """One weight matrix together with its dimensions ``H`` and ``L``."""
+
+    def __init__(self, matrix, H, L):
+        self.matrix, self.H, self.L = matrix, H, L
+
+    def calculate(self, HN_in: np.ndarray):
+        """Mask the matrix with ``HN_in`` and tally the surviving weights per row.
+
+        Entries of ``matrix * HN_in`` whose magnitude is at most 1e-3 are
+        skipped; every other entry is looked up in ``CFG.value_range``.
+
+        Returns:
+            ``(HN_out, ans)`` where ``HN_out[i][k]`` counts the entries of
+            masked row ``i`` that match ``CFG.value_range[k]`` and ``ans[i]``
+            is the sum of those entries.
+        """
+        n_values = len(CFG.value_range)
+        HN_out = np.zeros((self.L, n_values), dtype=int)
+        ans = np.zeros(self.L)
+        for row_idx, masked_row in enumerate(self.matrix * HN_in):
+            for value in masked_row:
+                if abs(value) <= 1e-3:
+                    continue  # treated as zero: contributes nothing
+                slot = find_index(CFG.value_range, value)
+                HN_out[row_idx][slot] += 1
+                ans[row_idx] += value
+        return HN_out, ans
+
+
+class HN_GROUP:
+    """A collection of ``HN`` units, one per leading slice of the weights."""
+
+    def __init__(self, weights: np.ndarray):
+        shape = weights.shape
+        self.VN, self.L, self.H = shape
+        print(shape)
+        self.HN_GROUP = []
+        for matrix in weights:
+            self.HN_GROUP.append(HN(matrix, self.H, self.L))
+        print("HN_GROUP init done")
+
+    def calculate_single(self, hn_group, hn_in):
+        """Evaluate one unit on ``hn_in`` and return its result unchanged."""
+        return hn_group.calculate(hn_in)
+
+    def calculate(self, hn_in: np.ndarray):
+        """Evaluate every unit on ``hn_in`` in a process pool.
+
+        Returns:
+            Array stacking the second result (``ans``) of each unit, in unit
+            order; the per-value counts are discarded.
+        """
+        row_sums = []
+        with ProcessPoolExecutor(max_workers=CFG.num_workers) as executor:
+            pending = [
+                executor.submit(unit.calculate, hn_in) for unit in self.HN_GROUP
+            ]
+            # Collect in submission order so results line up with the units.
+            for future in tqdm(pending):
+                _, unit_sums = future.result()
+                row_sums.append(unit_sums)
+        return np.array(row_sums)
+
+
+def run(config: CFG):
+    """Simulate the HN group on the stored activation and print the result.
+
+    Loads ``activation.pkl`` and ``config.verify_weights`` from
+    ``config.verify_dir``, swaps the first two axes of the weights, reduces
+    the activation to its bit 7 and prints what ``HN_GROUP.calculate``
+    returns.
+    """
+    activation_name = os.path.join(config.verify_dir, "activation.pkl")
+    weights_path = os.path.join(config.verify_dir, config.verify_weights)
+    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
+    with open(activation_name, "rb") as f:
+        hn_in = pickle.load(f)
+    with open(weights_path, "rb") as f:
+        matrixs = pickle.load(f)
+    matrixs = np.transpose(matrixs, (1, 0, 2))  # swap the first two axes
+
+    hn_group = HN_GROUP(matrixs)
+
+    hn_in = get_bit(hn_in, 7)  # keep only bit 7 of the loaded activation data
+
+    # calculate() returns the per-unit sums, not the per-value counts.
+    sums = hn_group.calculate(hn_in)
+    print(sums)
--- a/archived/test_optimize_HN.py
+++ b/archived/test_optimize_HN.py
+import hllm.optimized.turbo_optimize_hn as turbo_optimize_hn
+import numpy as np
+
+
+def test_graph_coloring():
+    """Check that ``greedy_coloring`` yields a proper colouring of a random graph.
+
+    Every pair of adjacent nodes must be coloured (no ``-1``) and must not
+    share a colour.
+    """
+    # Random symmetric 0/1 adjacency matrix of size x size with a zero
+    # diagonal, plus random integer weights in {-1, 0, 1}.
+    np.random.seed(41)  # fixed seed so the run is reproducible
+    size = 1536
+    weight = np.random.randint(-1, 2, size=(size, size))
+    adj_matrix = np.random.randint(0, 2, size=(size, size))
+    adj_matrix = np.triu(adj_matrix, 1)  # keep the strict upper triangle
+    adj_matrix += adj_matrix.T  # mirror it to make the matrix symmetric
+
+    print("turbo_optimize_hn 模块位置:", turbo_optimize_hn.__file__)
+
+    # Run the graph-colouring algorithm under test.
+    colors = turbo_optimize_hn.greedy_coloring(adj_matrix, weight)
+
+    print(max(colors))
+
+    # Validate all edges at once; a Python double loop over size**2 pairs is
+    # needlessly slow for a matrix of this size.
+    color_arr = np.asarray(colors)
+    rows, cols = np.nonzero(adj_matrix == 1)
+    uncolored = (color_arr[rows] == -1) | (color_arr[cols] == -1)
+    if uncolored.any():
+        first = int(np.argmax(uncolored))
+        raise AssertionError(
+            f"adjacent nodes {rows[first]} and {cols[first]}: "
+            "at least one of them is uncoloured (-1)"
+        )
+    clashes = color_arr[rows] == color_arr[cols]
+    if clashes.any():
+        first = int(np.argmax(clashes))
+        raise AssertionError(
+            f"相邻节点 {rows[first]} 和 {cols[first]} 具有相同的颜色！"
+        )
+
+    print("测试通过！所有相邻节点都有不同的颜色")
+
+
+if __name__ == "__main__":
+    test_graph_coloring()
--- a/archived/没用的/generate_hn.ipynb
+++ b/archived/没用的/generate_hn.ipynb
+{
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from pyrilog import VerilogGenerator,ModuleBlock,add_parameter,add_input,add_output,add_assign\n",
+    "from concurrent.futures import ProcessPoolExecutor, as_completed\n",
+    "import os\n",
+    "from tqdm import tqdm\n",
+    "import pickle"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class CFG:\n",
+    "    path_dir = \"HN\"\n",
+    "    value_range = [-6, -4, -3, -2, -1.5, -1, -0.5, 0.5, 1, 1.5, 2, 3, 4, 6]\n",
+    "    WW = [8] * len(value_range)\n",
+    "\n",
+    "\n",
+    "def calculate_WW(matrix: np.array, value_range):\n",
+    "    WW = [0] * len(value_range)\n",
+    "    for i in range(len(value_range)):\n",
+    "        WW[i] = max([len([x for x in row if abs(x-value_range[i])<=0.01]) for row in matrix])\n",
+    "    return WW\n",
+    "\n",
+    "def find_index(arr, target, epsilon=1e-3):\n",
+    "    arr = np.array(arr)  # 转换为numpy数组\n",
+    "    diff = np.abs(arr - target)  # 计算差值数组\n",
+    "    min_diff = np.min(diff)  # 找到最小的差值\n",
+    "    if min_diff < epsilon:  # 如果最小差值在允许的误差范围内\n",
+    "        return np.where(diff == min_diff)[0][0]  # 返回第一个匹配的索引\n",
+    "    raise ValueError('No match found')  # 如果没有找到匹配项，则引发异常"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_verilog_code(\n",
+    "    matrix,\n",
+    "    HN_id=0,\n",
+    "    H=16,\n",
+    "    L=5,\n",
+    "    value_range=[-1, 1],\n",
+    "    WW=[8,8],\n",
+    "):\n",
+    "    with VerilogGenerator() as generator:\n",
+    "        with ModuleBlock(f\"HN_{HN_id}\"):\n",
+    "            add_parameter(\"H\", H)\n",
+    "            add_parameter(\"L\", L)\n",
+    "            for i in range(len(value_range)):\n",
+    "                add_parameter(f\"WW_{i}\", WW[i])\n",
+    "            add_input(name=\"HN_in\", width=\"H\")\n",
+    "            for i in range(len(value_range)):\n",
+    "                add_output(\n",
+    "                    name=f\"HN_out_{i}\",\n",
+    "                    width=f\"WW_{i}\",\n",
+    "                    height=\"L\",\n",
+    "                )\n",
+    "            # 内部连线\n",
+    "            for i, layer in enumerate(matrix):\n",
+    "                weight_cnt = [0] * len(value_range)\n",
+    "                for j, weight in enumerate(layer):\n",
+    "                    if abs(weight)<1e-3:\n",
+    "                        continue\n",
+    "                    try:\n",
+    "                        index=find_index(value_range, weight)\n",
+    "                    except ValueError:\n",
+    "                        print(f\"weight {weight} not found\")\n",
+    "                        continue\n",
+    "                    add_assign(\n",
+    "                        f\"HN_out_{index}\",\n",
+    "                        [i, weight_cnt[index]],\n",
+    "                        \"HN_in\",\n",
+    "                        [j],\n",
+    "                    )\n",
+    "                    weight_cnt[index] += 1\n",
+    "                for i in range(len(weight_cnt)):\n",
+    "                    while weight_cnt[i] < WW[i]:\n",
+    "                        add_assign(f\"HN_out_{i}\", [i, weight_cnt[i]], \"0\", [])\n",
+    "                        weight_cnt[i] += 1\n",
+    "    return generator.generate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|          | 0/1536 [00:00<?, ?it/s]\n"
+     ]
+    },
+    {
+     "ename": "NameError",
+     "evalue": "name 'result' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mBrokenProcessPool\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[24], line 36\u001b[0m\n\u001b[0;32m     34\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m     35\u001b[0m     \u001b[38;5;66;03m# print(1)\u001b[39;00m\n\u001b[1;32m---> 36\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     37\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
+      "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\_base.py:449\u001b[0m, in \u001b[0;36mFuture.result\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m    448\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m--> 449\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__get_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    451\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_condition\u001b[38;5;241m.\u001b[39mwait(timeout)\n",
+      "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\_base.py:401\u001b[0m, in \u001b[0;36mFuture.__get_result\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    400\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 401\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\n\u001b[0;32m    402\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m    403\u001b[0m     \u001b[38;5;66;03m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n",
+      "\u001b[1;31mBrokenProcessPool\u001b[0m: A process in the process pool was terminated abruptly while the future was running or pending.",
+      "\nDuring handling of the above exception, another exception occurred:\n",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[24], line 38\u001b[0m\n\u001b[0;32m     36\u001b[0m     result \u001b[38;5;241m=\u001b[39m future\u001b[38;5;241m.\u001b[39mresult()\n\u001b[0;32m     37\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m---> 38\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGenerating \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mresult\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m failed with an error: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'result' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "def process_task(i, matrix, H, L):\n",
+    "    try:\n",
+    "        WW = calculate_WW(matrix, CFG.value_range)\n",
+    "        file_name = os.path.join(CFG.path_dir, f\"HN_{i}.sv\")\n",
+    "        with open(file_name, \"w\") as f:\n",
+    "            f.write(\n",
+    "                generate_verilog_code(\n",
+    "                    matrix,\n",
+    "                    HN_id=i,\n",
+    "                    H=H,\n",
+    "                    L=L,\n",
+    "                    value_range=CFG.value_range,\n",
+    "                    WW=WW,\n",
+    "                )\n",
+    "            )\n",
+    "        return i  # 返回任务ID以显示进度\n",
+    "    except Exception as e:\n",
+    "        print(f\"Generating {i} failed with an error: {e}\")\n",
+    "        return None\n",
+    "\n",
+    "\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    os.makedirs(CFG.path_dir, exist_ok=True)\n",
+    "    with open(r\"C:\\Users\\night\\Documents\\Codes\\H-LLM\\weights\\q_proj.pkl\", \"rb\") as f:\n",
+    "        matrixs = pickle.load(f)\n",
+    "        matrixs = np.transpose(matrixs, (1, 0, 2))\n",
+    "    VN, L, H = matrixs.shape\n",
+    "    with ProcessPoolExecutor(max_workers=8) as executor:\n",
+    "        futures = [\n",
+    "            executor.submit(process_task, i, matrixs[i], H, L) for i in range(VN)\n",
+    "        ]\n",
+    "        for future in tqdm(as_completed(futures), total=VN):\n",
+    "            try:\n",
+    "                # print(1)\n",
+    "                result = future.result()\n",
+    "            except Exception as e:\n",
+    "                print(f\"Generating {result} failed with an error: {e}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/archived/没用的/generate_lm.ipynb
+++ b/archived/没用的/generate_lm.ipynb
--- a/archived/没用的/generate_sw.ipynb
+++ b/archived/没用的/generate_sw.ipynb
+{
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from pyrilog import (\n",
+    "    VerilogGenerator,\n",
+    "    ModuleBlock,\n",
+    "    add_parameter,\n",
+    "    add_input,\n",
+    "    add_output,\n",
+    "    add_assign,\n",
+    "    add_wire,\n",
+    "    add_instance,\n",
+    ")\n",
+    "from concurrent.futures import ProcessPoolExecutor, as_completed\n",
+    "import os\n",
+    "from tqdm import tqdm\n",
+    "import pickle"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class CFG:\n",
+    "    path_dir = \"Sub_wrapper\"\n",
+    "    weights_dir = \"../001-H-LLM/weights\"\n",
+    "    num_workers = 16\n",
+    "    value_range = [-6, -4, -3, -2, -1.5, -1, -0.5, 0.5, 1, 1.5, 2, 3, 4, 6]\n",
+    "\n",
+    "\n",
+    "def calculate_WW(matrix: np.array, value_range):\n",
+    "    WW = [0] * len(value_range)\n",
+    "    for i in range(len(value_range)):\n",
+    "        WW[i] = max(\n",
+    "            [len([x for x in row if abs(x - value_range[i]) <= 0.01]) for row in matrix]\n",
+    "        )\n",
+    "    return WW\n",
+    "\n",
+    "\n",
+    "def find_index(arr, target, epsilon=1e-3):\n",
+    "    arr = np.array(arr)  # 转换为numpy数组\n",
+    "    diff = np.abs(arr - target)  # 计算差值数组\n",
+    "    min_diff = np.min(diff)  # 找到最小的差值\n",
+    "    if min_diff < epsilon:  # 如果最小差值在允许的误差范围内\n",
+    "        return np.where(diff == min_diff)[0][0]  # 返回第一个匹配的索引\n",
+    "    raise ValueError(\"No match found\")  # 如果没有找到匹配项，则引发异常"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_module(\n",
+    "    matrix,\n",
+    "    module_name_suffix=\"\",\n",
+    "    H=16,\n",
+    "    L=5,\n",
+    "    value_range=[-1, 1],\n",
+    "    WW=[8, 8],\n",
+    "):\n",
+    "    with ModuleBlock(f\"{CFG.path_dir}_{module_name_suffix}\") as module:\n",
+    "        # 参数\n",
+    "        add_parameter(\"H\", H)\n",
+    "        add_parameter(\"L\", L)\n",
+    "        for i in range(len(value_range)):\n",
+    "            add_parameter(f\"WW_{i}\", WW[i])\n",
+    "        # 输入输出\n",
+    "        add_input(\"clk\")\n",
+    "        add_input(\"tre_rstn\")\n",
+    "        add_input(\"valid\")\n",
+    "        add_input(\"LM_sel\", \"L\")\n",
+    "        add_input(\"SW_in\",\"H\")\n",
+    "        for i in range(len(value_range)):\n",
+    "            add_output(\n",
+    "                name=f\"WT_{i}_out_S\",\n",
+    "            )\n",
+    "            add_output(\n",
+    "                name=f\"WT_{i}_out_C\",\n",
+    "            )\n",
+    "        # 内部连线\n",
+    "        for i in range(len(value_range)):\n",
+    "            add_wire(\n",
+    "                name=f\"HN_out_{i}\",\n",
+    "                width=f\"WW_{i}\",\n",
+    "                height=\"L\"\n",
+    "            )\n",
+    "            add_wire(\n",
+    "                name=f\"LM_out_{i}\",\n",
+    "                width=f\"WW_{i}\",\n",
+    "            )\n",
+    "            \n",
+    "        # 实例化HN\n",
+    "        hn_params = {\n",
+    "            \"H\": H,\n",
+    "            \"L\": L,\n",
+    "        }\n",
+    "        for i in range(len(value_range)):\n",
+    "            hn_params[f\"WW_{i}\"] = f\"WW_{i}\"\n",
+    "        hn_ports = {\n",
+    "            \"HN_in\": \"SW_in\",\n",
+    "        }\n",
+    "        for i in range(len(value_range)):\n",
+    "            hn_ports[f\"HN_out_{i}\"] = f\"HN_out_{i}\"\n",
+    "        add_instance(\"HN\"+module_name_suffix, \"hn\", hn_params, hn_ports)\n",
+    "        \n",
+    "        # 实例化LM\n",
+    "        lm_params = {\n",
+    "            \"L\": L,\n",
+    "        }\n",
+    "        for i in range(len(value_range)):\n",
+    "            lm_params[f\"WW_{i}\"] = f\"WW_{i}\"\n",
+    "        lm_ports = {\n",
+    "            \"LM_sel\": \"LM_sel\",\n",
+    "        }\n",
+    "        for i in range(len(value_range)):\n",
+    "            lm_ports[f\"LM_in_{i}\"] = f\"HN_out_{i}\"\n",
+    "            lm_ports[f\"LM_out_{i}\"] = f\"LM_out_{i}\"\n",
+    "        add_instance(\"Layer_mux\"+module_name_suffix, \"layer_mux\", lm_params, lm_ports)\n",
+    "\n",
+    "        # 实例化WT\n",
+    "        wt_params = {}\n",
+    "        for i in range(len(value_range)):\n",
+    "            wt_params[f\"WW_{i}\"] = f\"WW_{i}\"\n",
+    "        wt_ports = {\"clk\":\"clk\",\"tre_rstn\":\"tre_rstn\",\"valid\":\"valid\"}\n",
+    "        for i in range(len(value_range)):\n",
+    "            wt_ports[f\"WT_{i}_in\"] = f\"LM_out_{i}\"\n",
+    "            wt_ports[f\"WT_{i}_out_S\"] = f\"WT_{i}_out_S\"\n",
+    "            wt_ports[f\"WT_{i}_out_C\"] = f\"WT_{i}_out_C\"\n",
+    "        add_instance(\"WT_group\"+module_name_suffix, \"wt_group\", wt_params, wt_ports)\n",
+    "    return module"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_task(i, weights_file_name, matrix, H, L):\n",
+    "    try:\n",
+    "        WW = calculate_WW(matrix, CFG.value_range)\n",
+    "        file_dir = os.path.join(CFG.path_dir, weights_file_name)\n",
+    "        os.makedirs(file_dir, exist_ok=True)\n",
+    "        file_name = os.path.join(\n",
+    "            file_dir, f\"{CFG.path_dir}_tp_{weights_file_name}_vc_{i}.sv\"\n",
+    "        )\n",
+    "        with open(file_name, \"w\") as f:\n",
+    "            f.write(\n",
+    "                generate_module(\n",
+    "                    matrix,\n",
+    "                    module_name_suffix=f\"_tp_{weights_file_name}_vc_{i}\",\n",
+    "                    H=H,\n",
+    "                    L=L,\n",
+    "                    value_range=CFG.value_range,\n",
+    "                    WW=WW,\n",
+    "                ).generate()\n",
+    "            )\n",
+    "        return i  # 返回任务ID以显示进度\n",
+    "    except Exception as e:\n",
+    "        print(f\"Generating {i} failed with an error: {e}\")\n",
+    "        return None\n",
+    "\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    os.makedirs(CFG.path_dir, exist_ok=True)\n",
+    "    for weights_file in os.listdir(CFG.weights_dir):\n",
+    "        if weights_file != \"k.pkl\":\n",
+    "            continue\n",
+    "        weights_path = os.path.join(CFG.weights_dir, weights_file)\n",
+    "        weights_file_name = os.path.splitext(weights_file)[0]\n",
+    "        print(f\"Processing {weights_file_name}\")\n",
+    "        with open(weights_path, \"rb\") as f:\n",
+    "            print(f\"Loading {weights_file_name}\")\n",
+    "            matrixs = pickle.load(f)\n",
+    "            matrixs = np.transpose(matrixs, (1, 0, 2))\n",
+    "        VN, L, H = matrixs.shape\n",
+    "        for i in tqdm(range(VN)):\n",
+    "            process_task(i, weights_file_name, matrixs[i], H, L)\n",
+    "        # with ProcessPoolExecutor(max_workers=CFG.num_workers) as executor:\n",
+    "        #     futures = [\n",
+    "        #         executor.submit(process_task, i, weights_file_name, matrixs[i], H, L)\n",
+    "        #         for i in range(VN)\n",
+    "        #     ]\n",
+    "        #     for future in tqdm(as_completed(futures), total=VN):\n",
+    "        #         try:\n",
+    "        #             result = future.result()\n",
+    "        #         except Exception as e:\n",
+    "        #             print(f\"Generating {result} failed with an error: {e}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/archived/没用的/generate_tfsm.py
+++ b/archived/没用的/generate_tfsm.py
+import numpy as np
+import numpy as np
+import Pyrilog.Pyrilog as pl
+import os
+
+
+class CFG:
+    """Constants used when generating the TFSM module."""
+
+    path_dir = "TFSM"
+    H = 16
+    L = 5
+    VN = 5
+    WP = 4
+    AP = 8
+    SCW = 11
+    SCWB = 4
+    TTW = 16
+    # Every integer from -8 to 7 except 0.
+    value_range = [*range(-8, 0), *range(1, 8)]
+    # Maps each value to a sign-tagged label such as "neg_3" or "pos_3".
+    value_dict = {
+        v: ("pos" if v > 0 else "neg") + f"_{abs(v)}" for v in value_range
+    }
+    # One entry (8) per value in value_range.
+    WW = [8 for _ in value_range]
+
+
+def generate_verilog_code(
+    path_dir,
+    file_id=0,
+    H=16,
+    L=5,
+    VN=5,
+    WP=4,
+    AP=8,
+    SCW=11,
+    SCWB=4,
+    TTW=16,
+    value_range=[-1, 1],
+    value_dict={-1: "neg_1", 1: "pos_1"},
+    WW={-1: 8, 1: 8},
+):
+    file_name = os.path.join(path_dir, f"TFSM_{file_id}.sv")
+    generator = pl.VerilogGenerator()
+    module = pl.ModuleBlock(f"TFSM_{file_id}")
+
+    # 增加参数
+    module.add_parameter("H", str(H))
+    module.add_parameter("L", str(L))
+    module.add_parameter("VN", str(VN))
+    module.add_parameter("WP", str(WP))
+    module.add_parameter("AP", str(AP))
+    module.add_parameter("SCW", str(SCW))
+    module.add_parameter("SCWB", str(SCWB))
+    module.add_parameter("TTW", str(TTW))
+    for i in value_range:
+        module.add_parameter(f"WW_{value_dict[i]}", str(WW[i]))
+
+    # 增加输入输出
+    module.add_input("clk")
+    module.add_input("tree_rstn")
+    module.add_input("valid")
+    module.add_input("fsm_rstn")
+    module.add_input("LM_sel", width="L")
+    module.add_input("Top_in", width="AP")
+    module.add_output("WT_result_acc", pl.VAR_TYPE.REG, "TTW", "VN")
+    module.add_output("result_valid", pl.VAR_TYPE.REG)
+
+    module.add_reg("TM_sel", "AP")
+    module.add_wire("TM_out", "H")
+
+    # 实例化 Top_mux
+    Top_mux_params = {"H": "H", "AP": "AP"}
+    Top_mux_ports = {"TM_sel": "TM_sel", "TM_in": "TM_in", "TM_out": "TM_out"}
+    module.add_instance("Top_mux", "top_mux", Top_mux_params, Top_mux_ports)
+
+    for i in value_range:
+        module.add_wire(f"WT_{value_dict[i]}_out_S", height="VN")
+        module.add_wire(f"WT_{value_dict[i]}_out_C", height="VN")
+
+    module.add_reg("tree_rstn")
+    module.add_reg("mac_rstn")
+    for i in value_range:
+        module.add_reg(f"final_S_{value_dict[i]}", "SCW", "VN")
+        module.add_reg(f"final_C_{value_dict[i]}", "SCW", "VN")
+
+    module.add_reg("MAC_in_1", "SCW+2", "VN")
+    module.add_reg("MAC_in_2", "WP", "VN")
+    module.add_wire("MAC_out", "TTW+1", "VN")
+
+    for i in value_range:
+        module.add_reg(f"WT_result_{value_dict[i]}", "SCW+2", "VN")
+
+    module.add_reg("idx", "SCWB+1")
+    module.add_reg("state", "3")
+
+    module.add_genvar("j")
+    module.add_integer("i")
+
+    with pl.GenerateBlock(module) as generate_block:
+        with pl.ForBlock(
+            generate_block, "j=0", "j<VN", "j=j+1", "inst_SW+loop"
+        ) as for_block:
+            SW_params = {"H": "H", "L": "L"}
+            for i in value_range:
+                SW_params[f"WW_{value_dict[i]}"] = f"WW_{value_dict[i]}"
+            SW_ports = {
+                "clk": "clk",
+                "tree_rstn": "tree_rstn",
+                "valid": "valid",
+                "LM_sel": "LM_sel",
+                "SW_in": "TM_out",
+            }
+            for i in value_range:
+                SW_ports[f"WT_out_{value_dict[i]}_S"] = f"WT_{value_dict[i]}_out_S[j]"
+                SW_ports[f"WT_out_{value_dict[i]}_C"] = f"WT_{value_dict[i]}_out_C[j]"
+            for_block.add_instance(f"SW_{file_id}", "sub_wrapper", SW_params, SW_ports)
+            MAC_params = {"W_1": "SCW", "W_2": "WP", "W_0": "TTW"}
+            MAC_ports = {
+                "clk": "clk",
+                "tree_rstn": "tree_rstn",
+                "MAC_in_1": "MAC_in_1[j]",
+                "MAC_in_2": "MAC_in_2[j]",
+                "MAC_out": "MAC_out[j]",
+            }
+            for_block.add_instance("MAC", "mac", MAC_params, MAC_ports)
+
+    with pl.AlwaysBlock(module, "posedge clk or negedge fsm_rstn") as always_block:
+        with pl.IfBlock(always_block, "!fsm_rstn") as if_block:
+            if_block = pl.IfBlock("!fsm_rstn")
+            if_block.add_body("state <= 0;")
+            if_block.add_body("idx <= 0;")
+            if_block.add_body("tree_rstn <= 0;")
+            if_block.add_body("mac_rstn <= 0;")
+            if_block.add_body("result_valid <= 0;")
+            if_block.add_body("TM_sel <= 8'b00000000;")
+            with pl.ForBlock(if_block, "i=0", "i<VN", "i=i+1") as for_block:
+                for_block.add_body("MAC_in_1[i] <= 0;")
+                for_block.add_body("MAC_in_2[i] <= 0;")
+        with pl.ElseBlock(always_block) as else_block:
+            with pl.IfBlock(else_block, "state == 0") as if_block:
+                if_block.add_body("idx <= 0;")
+                if_block.add_body("tree_rstn <= 0;")
+                if_block.add_body("result_valid <= 0;")
+                with pl.ForBlock(if_block, "i=0", "i<VN", "i=i+1") as for_block:
+                    for i in value_range:
+                        for_block.add_body(f"final_S_{value_dict[i]}[i] <= 0;")
+                        for_block.add_body(f"final_C_{value_dict[i]}[i] <= 0;")
+                        for_block.add_body(f"WT_result_{value_dict[i]}[i] <= 0;")
+                    for_block.add_body(f"WT_result_acc[i] <= 0;")
+                # 写不动了，基本就是一行一行翻译的verilog代码
+                with pl.IfBlock(if_block, "valid == 1") as if_if_block:
+                    if_if_block.add_body("state <= 1;")
+            with pl.IfBlock(else_block, "state == 1") as if_block:
+                with pl.IfBlock(if_block, "valid == 1") as if_if_block:
+                    if_if_block.add_body("tree_rstn <= 1;")
+                    if_if_block.add_body("state <= 2;")
+
+    generator.add_module(module)
+    generator.generate(file_name)
+
+
+if __name__ == "__main__":
+    os.makedirs(CFG.path_dir, exist_ok=True)
+    record_path = os.path.join(CFG.path_dir, "records.txt")
+    record_f = open(record_path, "w")
+    record_f.write(f"H={CFG.H}, L={CFG.L}, value_range={CFG.value_range}\n\n")
+    for i in range(CFG.VN):
+        generate_verilog_code(
+            CFG.path_dir,
+            file_id=i,
+            H=CFG.H,
+            L=CFG.L,
+            value_range=CFG.value_range,
+            value_dict=CFG.value_dict,
+            WW=CFG.WW,
+        )
--- a/archived/没用的/generate_wt.ipynb
+++ b/archived/没用的/generate_wt.ipynb
--- a/archived/没用的/quant/loadnpy.py
+++ b/archived/没用的/quant/loadnpy.py
+import numpy as np
+import numpy as np
+import pickle
+import matplotlib.pyplot as plt
+
+# Script: load a pickled weight matrix, min-max quantize it to the int4 range
+# and report/plot how often each int4 value occurs per group.
+length = 3840
+file_path = f"weights-fp32-{length}.pkl"
+
+
+# Load the weight matrix
+# NOTE(review): pickle.load executes arbitrary code from the file - only use
+# on trusted files.
+with open(file_path, "rb") as f:
+    weights = pickle.load(f)
+
+# Check the dimensions of weights
+print(f"weight matrix shape: {weights.shape}")
+# # new_shape = (2304, 288)  # 576 groups, 1152 elements per group
+# # weights = weights.reshape(new_shape)
+
+# Determine the scaling needed to use the int4 range (-8 to 7)
+max_value = np.max(weights)
+min_value = np.min(weights)
+
+# Normalize the weights into the int4 range
+normalized_weights = (weights - min_value) / (max_value - min_value) * 15 - 8
+normalized_weights = np.round(normalized_weights)
+
+# Clamp the values so they stay inside the int4 range
+quantized_weights = np.clip(normalized_weights, -8, 7).T
+
+# Count how often each int4 value occurs in each group
+int4_values = np.arange(-8, 8)
+frequency_counts = np.zeros((length, len(int4_values)))
+
+
+# Bin edges at -8.5, -7.5, ..., 7.5 give one bin per integer in [-8, 7].
+for i in range(length):
+    frequency_counts[i, :] = np.histogram(
+        quantized_weights[i], bins=np.arange(-8.5, 8.5)
+    )[0]
+
+# Standard deviation and range (max - min) of each int4 value's count across
+# all groups
+std_devs_per_value = np.std(frequency_counts, axis=0)
+ranges_per_value = np.ptp(frequency_counts, axis=0)
+
+# Print the results
+print("Standard deviations per int4 value:", std_devs_per_value)
+print("Ranges per int4 value:", ranges_per_value)
+
+# Maximum count of each int4 value across all groups
+max_frequencies_per_value = np.max(frequency_counts, axis=0)
+
+# Sum of the per-value maximum counts
+total_max_frequency_sum = np.sum(max_frequencies_per_value)
+
+# Print the maximum count of each value and their sum
+print("Maximum frequency per int4 value:", max_frequencies_per_value)
+print("Sum of maximum frequencies:", total_max_frequency_sum)
+
+# Plot the standard deviations and the ranges
+plt.figure(figsize=(12, 6))
+plt.subplot(1, 2, 1)
+plt.bar(int4_values, std_devs_per_value, color="blue")
+plt.title("Standard Deviation of Frequency per int4 Value")
+plt.xlabel("int4 Value")
+plt.ylabel("Standard Deviation")
+
+plt.subplot(1, 2, 2)
+plt.bar(int4_values, ranges_per_value, color="red")
+plt.title("Range of Frequency per int4 Value")
+plt.xlabel("int4 Value")
+plt.ylabel("Range")
+
+plt.tight_layout()
+plt.show()
--- a/archived/没用的/quant/loadpt.ipynb
+++ b/archived/没用的/quant/loadpt.ipynb
+{
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'utils_quant'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 6\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtqdm\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m tqdm\n\u001b[0;32m----> 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mutils_quant\u001b[39;00m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpickle\u001b[39;00m\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'utils_quant'"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import safetensors\n",
+    "import torch\n",
+    "from tqdm import tqdm\n",
+    "import utils_quant\n",
+    "import pickle\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tensors_1536=[]\n",
+    "tensors_3840=[]\n",
+    "file_path=\"/lustre/S/huangdi/open_for_out/models/aimo-progress-prize-trained-models/Code-Math-QA-Proof-quant-per-head-fp4-0913/model.safetensors\"\n",
+    "with safetensors.safe_open(file_path,framework=\"pt\") as f:\n",
+    "    for i,key in enumerate(tqdm(f.keys())):\n",
+    "        # print(key,f.get_tensor(key).shape)\n",
+    "        if i>10:\n",
+    "            break\n",
+    "        tensor=f.get_tensor(key)\n",
+    "        if tensor.ndim==2:\n",
+    "            if len(tensor[0])==1536:\n",
+    "                tensors_1536.extend(tensor.float().tolist())\n",
+    "            else:\n",
+    "                tensors_3840.extend(tensor.float().tolist())\n",
+    "        else:\n",
+    "            if len(tensor)==1536:\n",
+    "                tensors_1536.append(tensor.float().tolist())\n",
+    "            else:\n",
+    "                tensors_3840.append(tensor.float().tolist())\n",
+    "tensors_1536=np.array(tensors_1536)\n",
+    "tensors_3840=np.array(tensors_3840)\n",
+    "# tensors=np.array(tensors,dtype=np.float32)\n",
+    "# display(tensors_fp32[:5])\n",
+    "with open(\"weights-fp32-1536-small.pkl\",\"wb\") as f:\n",
+    "    pickle.dump(tensors_1536, f)\n",
+    "with open(\"weights-fp32-3840-small.pkl\",\"wb\") as f:\n",
+    "    pickle.dump(tensors_3840, f)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/archived/没用的/simulate_hn.ipynb
+++ b/archived/没用的/simulate_hn.ipynb
+{
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm\n",
+    "from prettytable import PrettyTable\n",
+    "from config import CFG\n",
+    "import os\n",
+    "from multiprocessing import Pool"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def calculate_WW(matrix: np.array, value_range):\n",
+    "    WW = [0] * len(value_range)\n",
+    "    for i in range(len(value_range)):\n",
+    "        WW[i] = max(\n",
+    "            [len([x for x in row if abs(x - value_range[i]) <= 0.01]) for row in matrix]\n",
+    "        )\n",
+    "    return WW\n",
+    "\n",
+    "def find_index(arr, target, epsilon=1e-3):\n",
+    "    arr = np.array(arr)  # 转换为numpy数组\n",
+    "    diff = np.abs(arr - target)  # 计算差值数组\n",
+    "    min_diff = np.min(diff)  # 找到最小的差值\n",
+    "    if min_diff < epsilon:  # 如果最小差值在允许的误差范围内\n",
+    "        return np.where(diff == min_diff)[0][0]  # 返回第一个匹配的索引\n",
+    "    raise ValueError(\"No match found\")  # 如果没有找到匹配项，则引发异常\n",
+    "\n",
+    "#返回第i位\n",
+    "def get_bit(num, i):\n",
+    "    if i < 0:\n",
+    "        return 0\n",
+    "    return (num >> i) & 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class HN:\n",
+    "    def __init__(self, matrix,H,L):\n",
+    "        self.matrix = matrix\n",
+    "        self.H=H\n",
+    "        self.L=L\n",
+    "    \n",
+    "    # def find_index(self, value):\n",
+    "    #     return np.searchsorted(CFG.value_range, value)\n",
+    "    \n",
+    "    def calculate(self,HN_in:np.ndarray):\n",
+    "        HN_out = np.zeros((self.L, len(CFG.value_range)), dtype=int)\n",
+    "        ans=np.zeros(self.L)\n",
+    "        matrix_masked = self.matrix * HN_in\n",
+    "        for i, layer in enumerate(matrix_masked):\n",
+    "            for j, value in enumerate(layer):\n",
+    "                if abs(value)<=1e-3:\n",
+    "                    continue\n",
+    "                index=find_index(CFG.value_range,value)\n",
+    "                HN_out[i][index]+=1\n",
+    "                ans[i]+=value\n",
+    "            # indices=list(map(self.find_index,layer))\n",
+    "            # np.add.at(HN_out[i],indices,1)\n",
+    "        return HN_out,ans\n",
+    "    \n",
+    "class HN_GROUP:\n",
+    "    def __init__(self,weights:np.ndarray):\n",
+    "        self.VN,self.L,self.H=weights.shape\n",
+    "        print(weights.shape)\n",
+    "        self.HN_GROUP=[HN(matrix,self.H,self.L) for matrix in weights]\n",
+    "        print(\"HN_GROUP init done\")\n",
+    "        \n",
+    "        \n",
+    "    def calculate_single(args):\n",
+    "        hn, hn_in = args\n",
+    "        return hn.calculate(hn_in)\n",
+    "    \n",
+    "    def calculate(self,hn_in:np.ndarray,layer:int):\n",
+    "        hn_out=[None]*self.VN\n",
+    "        ans=[None]*self.VN\n",
+    "        with Pool() as pool:\n",
+    "            args = [(self.HN_GROUP[i], hn_in) for i in range(self.VN)]\n",
+    "            results = list(tqdm(pool.imap(self.calculate_single, args), total=self.VN))\n",
+    "        for i, result in enumerate(results):\n",
+    "            hn_out[i], ans[i] = result\n",
+    "        ans = [x[layer] for x in ans]\n",
+    "        return ans\n",
+    "        "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(512, 52, 1536)\n",
+      "HN_GROUP init done\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|          | 0/512 [00:00<?, ?it/s]"
+     ]
+    }
+   ],
+   "source": [
+    "if CFG.mode == \"test\":\n",
+    "    weights_name = CFG.test_weights\n",
+    "elif CFG.mode == \"run\":\n",
+    "    weights_name = CFG.run_weigths\n",
+    "else:\n",
+    "    raise ValueError(\"Invalid mode\")\n",
+    "activation_name=os.path.join(CFG.activation_dir,\"activation.pkl\")\n",
+    "result_name=os.path.join(CFG.results_dir,\"result.txt\")\n",
+    "with open(activation_name, \"rb\") as f:\n",
+    "    hn_in=pickle.load(f)\n",
+    "weights_path = os.path.join(CFG.weights_dir, weights_name)\n",
+    "with open(weights_path, \"rb\") as f:\n",
+    "    matrixs = pickle.load(f)\n",
+    "    matrixs = np.transpose(matrixs, (1, 0, 2))\n",
+    "\n",
+    "hn_group=HN_GROUP(matrixs)\n",
+    "\n",
+    "hn_in=get_bit(hn_in,7)\n",
+    "\n",
+    "hn_group.calculate(hn_in,0)\n",
+    "# for matrix in matrixs:\n",
+    "#         table=PrettyTable()\n",
+    "#         table.field_names=[str(f) for f in CFG.value_range]\n",
+    "#         print(\"--------------------------------\")\n",
+    "#         print(\"输入\")\n",
+    "#         print(hn_in)\n",
+    "#         print(\"权重\")\n",
+    "#         print(matrix)\n",
+    "#         print(\"结果-单独\")\n",
+    "#         hn_out,ans=hn.calculate(hn_in)\n",
+    "#         table.add_rows(hn_out)\n",
+    "#         print(table)\n",
+    "#         print(\"结果-总和\")\n",
+    "#         print(ans)\n",
+    "#         print()\n",
+    "#         break\n",
+    "        \n",
+    "    \n",
+    "    "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/hllm/__init__.py
+++ b/hllm/__init__.py
--- a/hllm/config.py
+++ b/hllm/config.py
+import os
+import sys
+import pickle
+
+
+def update_weights_shape(weights):
+    weights_file = os.path.join(CFG.weights_dir, weights)
+    with open(weights_file) as f:
+        weights = pickle.load(f)
+        shape = weights.shape
+    return shape
+
+
+class CFG:
+    def __init__(self):
+        self.mode = "run"  # "test" or "run"
+
+        self.run_weights_batch = [
+            "down.pkl",
+            "up.pkl",
+            "gate.pkl",
+            "k.pkl",
+            "o.pkl",
+            "v.pkl",
+            "q.pkl",
+        ]
+        self.run_weights = "down.pkl"  # 用于赋值
+        self.safetensors = "model.safetensors"
+
+        self.weights_dir = "001-H-LLM/qwen"
+        self.mapped_weights_dir = "001-H-LLM/qwen/mapped_weights"
+
+        self.verify_generate_activation_on_exist = False
+
+        self.num_workers = 64
+        self.value_range = [-6, -4, -3, -2, -1.5, -1, -0.5, 0.5, 1, 1.5, 2, 3, 4, 6]
+        self.python_path = sys.executable
+
+        self.group_number = 32
+
+        self.output_dir = "outputs"
+
+        os.makedirs(self.weights_dir, exist_ok=True)
+        os.makedirs(self.output_dir, exist_ok=True)
--- a/hllm/eda/__init__.py
+++ b/hllm/eda/__init__.py
--- a/hllm/eda/eda_quant.ipynb
+++ b/hllm/eda/eda_quant.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pickle\n",
+    "from tqdm import tqdm\n",
+    "import matplotlib.pyplot as plt\n",
+    "from pathlib import Path\n",
+    "from prettytable import PrettyTable\n",
+    "from  copy import deepcopy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def eda_weights(weights):\n",
+    "    fp4_values = np.array([-6, -4, -3, -2, -1.5, -1, -0.5, 0.5, 1, 1.5, 2, 3, 4, 6])\n",
+    "    weights=weights.reshape(weights.size//weights.shape[-1],weights.shape[-1])\n",
+    "    frequency_counts = np.zeros((len(weights), len(fp4_values)))\n",
+    "    bins = np.concatenate((fp4_values - 0.01, [fp4_values[-1] + 0.01]))\n",
+    "    for i in tqdm(range(len(weights))):\n",
+    "        frequency_counts[i]=np.histogram(weights[i],bins=bins)[0]\n",
+    "        # 计算每个int4取值在所有组中的标准差和极差\n",
+    "    # display(frequency_counts)\n",
+    "    # std_devs_per_value = np.std(frequency_counts, axis=0)\n",
+    "    # ranges_per_value = np.ptp(frequency_counts, axis=0)\n",
+    "    # 打印结果\n",
+    "\n",
+    "    # print(\"Standard deviations per int4 value:\", std_devs_per_value)\n",
+    "    # print(\"Ranges per int4 value:\", ranges_per_value)\n",
+    "    # 计算每个int4取值在所有组中的最大频次\n",
+    "    max_frequencies_per_value = np.max(frequency_counts, axis=0)\n",
+    "    # 计算所有取值的最大频次的总和\n",
+    "    total_max_frequency_sum = np.sum(max_frequencies_per_value)\n",
+    "    # 打印每个取值的最大频次和总和\n",
+    "    # print(\"Maximum frequency per int4 value:\", max_frequencies_per_value)\n",
+    "    print(\"Sum of maximum frequencies:\", total_max_frequency_sum)\n",
+    "    \n",
+    "    table=PrettyTable()\n",
+    "    table.field_names=[\"type\"]+[str(f) for f in fp4_values]\n",
+    "    # table.add_row([\"std\"]+[str(round(f,3)) for f in std_devs_per_value])\n",
+    "    # table.add_row([\"range\"]+[str(round(f,3)) for f in ranges_per_value])\n",
+    "    table.add_row([\"max\"]+[str(round(f,3)) for f in max_frequencies_per_value])\n",
+    "    print(table)\n",
+    "    # 绘制标准差和极差的图\n",
+    "    # plt.figure(figsize=(12, 6))\n",
+    "    # plt.subplot(1, 2, 1)\n",
+    "    # plt.bar(fp4_values, std_devs_per_value, color=\"blue\")\n",
+    "    # plt.title(\"Standard Deviation of Frequency per int4 Value\")\n",
+    "    # plt.xlabel(\"int4 Value\")\n",
+    "    # plt.ylabel(\"Standard Deviation\")\n",
+    "\n",
+    "    # plt.subplot(1, 2, 2)\n",
+    "    # plt.bar(fp4_values, ranges_per_value, color=\"red\")\n",
+    "    # plt.title(\"Range of Frequency per int4 Value\")\n",
+    "    # plt.xlabel(\"int4 Value\")\n",
+    "    # plt.ylabel(\"Range\")\n",
+    "\n",
+    "    # plt.tight_layout()\n",
+    "    # plt.show()\n",
+    "\n",
+    "def eda_weights_52(weights):\n",
+    "    fp4_values = np.array([-6, -4, -3, -2, -1.5, -1, -0.5,0.5, 1, 1.5, 2, 3, 4, 6])\n",
+    "    weights=np.transpose(weights,(1,0,2))#512,52,1536\n",
+    "    frequency_counts = np.zeros((len(weights), 52,len(fp4_values)))\n",
+    "    bins = np.concatenate((fp4_values - 0.01, [fp4_values[-1] + 0.01]))\n",
+    "    for i in tqdm(range(len(weights))):\n",
+    "        for j in range(52):\n",
+    "            frequency_counts[i,j]=np.histogram(weights[i,j],bins=bins)[0]\n",
+    "        # 计算每个int4取值在所有组中的标准差和极差 512,14\n",
+    "    frequency_counts=np.max(frequency_counts,axis=1)\n",
+    "    # std_devs_per_value = np.std(frequency_counts, axis=0)\n",
+    "    # ranges_per_value = np.ptp(frequency_counts, axis=0)\n",
+    "    # 打印结果\n",
+    "\n",
+    "    # print(\"Standard deviations per int4 value:\", std_devs_per_value)\n",
+    "    # print(\"Ranges per int4 value:\", ranges_per_value)\n",
+    "    # 计算每个int4取值在所有组中的最大频次\n",
+    "    mean_frequencies_per_value = np.mean(frequency_counts, axis=0)\n",
+    "    # 计算所有取值的最大频次的总和\n",
+    "    total_mean_frequency_sum = np.sum(mean_frequencies_per_value)\n",
+    "    # 打印每个取值的最大频次和总和\n",
+    "    # print(\"Maximum frequency per int4 value:\", max_frequencies_per_value)\n",
+    "    print(\"Sum of mean frequencies:\", total_mean_frequency_sum)\n",
+    "    \n",
+    "    table=PrettyTable()\n",
+    "\n",
+    "    table.field_names=[\"type\"]+[str(f) for f in fp4_values]\n",
+    "    # table.add_row([\"std\"]+[str(round(f,3)) for f in std_devs_per_value])\n",
+    "    # table.add_row([\"range\"]+[str(round(f,3)) for f in ranges_per_value])\n",
+    "    table.add_row([\"mean\"]+[str(round(f,3)) for f in mean_frequencies_per_value])\n",
+    "    print(table)\n",
+    "    # 绘制标准差和极差的图\n",
+    "    # plt.figure(figsize=(12, 6))\n",
+    "    # plt.subplot(1, 2, 1)\n",
+    "    # plt.bar(fp4_values, std_devs_per_value, color=\"blue\")\n",
+    "    # plt.title(\"Standard Deviation of Frequency per int4 Value\")\n",
+    "    # plt.xlabel(\"int4 Value\")\n",
+    "    # plt.ylabel(\"Standard Deviation\")\n",
+    "\n",
+    "    # plt.subplot(1, 2, 2)\n",
+    "    # plt.bar(fp4_values, ranges_per_value, color=\"red\")\n",
+    "    # plt.title(\"Range of Frequency per int4 Value\")\n",
+    "    # plt.xlabel(\"int4 Value\")\n",
+    "    # plt.ylabel(\"Range\")\n",
+    "\n",
+    "    # plt.tight_layout()\n",
+    "    # plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path = Path(\"weights\")\n",
+    "for file_path in path.rglob('*'):\n",
+    "    if file_path.is_file() and \"proj.pkl\"in str(file_path):\n",
+    "        with open(file_path, \"rb\") as f :\n",
+    "            print(f\"Reading file {file_path}\")\n",
+    "            weights = pickle.load(f)\n",
+    "            print(f\"weight matrix shape: {weights.shape}\")\n",
+    "            eda_weights(deepcopy(weights))\n",
+    "            eda_weights_52(deepcopy(weights))\n",
+    "            print()\n",
+    "            "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/hllm/eda/generate_quant_weights.ipynb
+++ b/hllm/eda/generate_quant_weights.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import safetensors\n",
+    "import torch\n",
+    "from tqdm import tqdm\n",
+    "from utils_quant import quant_and_dequant\n",
+    "import pickle\n",
+    "from hllm.config import CFG\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 837/837 [04:13<00:00,  3.30it/s]\n",
+      "100%|██████████| 11/11 [02:13<00:00, 12.17s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "weights = {\n",
+    "    \"input_layernorm\": [],\n",
+    "    \"down_proj\": [],\n",
+    "    \"gate_proj\": [],\n",
+    "    \"up_proj\": [],\n",
+    "    \"post_attention_layernorm\": [],\n",
+    "    \"k_proj\": [],\n",
+    "    \"o_proj\": [],\n",
+    "    \"q_proj\": [],\n",
+    "    \"v_proj\": [],\n",
+    "    \"embed_tokens\": [],\n",
+    "    \"model.norm\": [],\n",
+    "}\n",
+    "\n",
+    "\n",
+    "# 列表中是否有字符串的子串\n",
+    "def is_substring_in_list(substring, string_list):\n",
+    "    return any(s in substring for s in string_list)\n",
+    "\n",
+    "\n",
+    "ignored_weights = [\n",
+    "    \"embed_tokens.weight\",\n",
+    "    \"post_attention_layernorm.weight\",\n",
+    "    \"activation_quant\",\n",
+    "    \"input_layernorm.weight\",\n",
+    "]\n",
+    "\n",
+    "file_path = \"../001-H-LLM/weights1026/model.safetensors\"\n",
+    "with safetensors.safe_open(file_path, framework=\"pt\") as f:\n",
+    "    for i, key in enumerate(tqdm(f.keys())):\n",
+    "        if is_substring_in_list(key, ignored_weights):\n",
+    "            continue\n",
+    "        tensor = f.get_tensor(key)\n",
+    "        # print(key,tensor.shape)\n",
+    "        # if i>10:\n",
+    "        #     break\n",
+    "        tensor = quant_and_dequant(tensor, 4).tolist()\n",
+    "        for k in weights.keys():\n",
+    "            if k in key:\n",
+    "                weights[k].append(tensor)\n",
+    "\n",
+    "for key in tqdm(weights.keys()):\n",
+    "    weights[key] = np.array(weights[key])\n",
+    "    file_path = f\"../001-H-LLM/weights1026/{key}.pkl\"\n",
+    "    with open(file_path, \"wb\") as f:\n",
+    "        pickle.dump(weights[key], f)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/hllm/eda/generate_quant_weights.py
+++ b/hllm/eda/generate_quant_weights.py
+import os
+import numpy as np
+import safetensors
+import pickle
+from tqdm import tqdm
+from hllm.eda.utils_quant import quant_and_dequant
+from hllm.config import CFG
+
+
+# %%
+# Maps a tensor-name fragment searched for in the checkpoint keys to the
+# short name used for the output pickle ("<short>.pkl").
+name_dict = {
+    "down_proj.weight": "down",
+    "gate_proj.weight": "gate",
+    "up_proj.weight": "up",
+    "k_proj.weight": "k",
+    "o_proj.weight": "o",
+    "q_proj.weight": "q",
+    "v_proj.weight": "v",
+    "model.norm.weight": "norm",
+}
+# Per-short-name storage for the collected tensors; filled in by run().
+weights = {v: [] for v in name_dict.values()}
+
+
+# 列表中是否有字符串的子串
+def is_substring_in_list(substring, string_list):
+    return any(s in substring for s in string_list)
+
+
+def run(config: CFG):
+    file_path = os.path.join(config.weights_dir, config.safetensors)
+    with safetensors.safe_open(file_path, framework="pt") as f:
+        for i, key in enumerate(tqdm(f.keys())):
+            if not is_substring_in_list(key, name_dict.keys()):
+                continue
+            tensor = f.get_tensor(key)
+            tensor = quant_and_dequant(tensor, 4)
+            for k in name_dict.keys():
+                if k in key:
+                    weights[name_dict[k]].append(tensor.tolist())
+
+    for k in weights.keys():
+        weights[k] = np.array(weights[k])
+        file_path = os.path.join(config.weights_dir, f"{k}.pkl")
+        with open(file_path, "wb") as f:
+            pickle.dump(weights[k], f)
--- a/hllm/eda/mapping_weights.py
+++ b/hllm/eda/mapping_weights.py
+import numpy as np
+import os
+from hllm.config import CFG
+import pickle
+from tqdm import tqdm
+
+
+def mapping_weights(weights, value_range):
+    new_weights = np.full_like(weights, -1, dtype=int)
+    for i in range(len(value_range)):
+        new_weights[abs(weights - value_range[i]) <= 0.01] = i
+    return new_weights
+
+
+def run(config: CFG):
+    print("Start mapping weights")
+    path_dir = os.path.join(config.mapped_weights_dir)
+    os.makedirs(path_dir, exist_ok=True)
+    value_range = config.value_range
+    for file in os.listdir(config.weights_dir):
+        if file in config.run_weights_batch:
+            with open(os.path.join(config.weights_dir, file), "rb") as f:
+                print(f"Loading {file}")
+                matrixs = pickle.load(f)
+                matrixs = np.transpose(matrixs, (1, 0, 2))
+                VN, L, H = matrixs.shape
+                print(VN, L, H)
+                new_weights = mapping_weights(matrixs, value_range)
+                new_weights = np.transpose(new_weights, (1, 0, 2))
+                with open(os.path.join(path_dir, file), "wb") as f:
+                    pickle.dump(new_weights, f)
+    print("Mapped weights at", path_dir)
--- a/hllm/eda/utils_quant.py
+++ b/hllm/eda/utils_quant.py
+import math
+import torch
+from torch import nn
+
+
+def weight_quant(weight, num_bits=1):
+    dtype = weight.dtype
+    weight = weight.float()
+    Qn = -(2 ** (num_bits - 1))
+    Qp = 2 ** (num_bits - 1) - 1
+    s = Qp / weight.abs().mean().clamp(min=1e-5)
+    result = (weight * s).round().clamp(Qn, Qp) / s
+    return result.type(dtype)
+
+
+def activation_quant(x, num_bits=8):
+    dtype = x.dtype
+    x = x.float()
+    Qn = -(2 ** (num_bits - 1))
+    Qp = 2 ** (num_bits - 1) - 1
+    s = Qp / x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
+    result = (x * s).round().clamp(Qn, Qp) / s
+    return result.type(dtype)
+
+
+def get_scale_f32(src_amax, dst_max):
+    S = (src_amax.float()) / dst_max
+    qscale = 1 / S
+    dqscale = S
+    return qscale, dqscale
+
+
+def round_to_FP4(input):
+    dst_max = 6.0
+    emax = 2
+    emin = 0
+    p = 2
+    part = 2 - 2 ** (1 - p)
+    ab = torch.where(
+        torch.isinf(input) + torch.isnan(input), torch.ones_like(input) * dst_max, input
+    )
+    ab = torch.where(ab > dst_max, torch.ones_like(ab) * dst_max, ab)
+    ab = torch.where(ab < 2.0 ** (emin) * 2 ** (-p), torch.zeros_like(ab), ab)
+    E = torch.where(
+        ab < 2 ** (emin),
+        torch.ones_like(ab) * (emin),
+        torch.floor(torch.log2(ab.float())),
+    )
+    P = torch.round(ab * 2 ** (-E) * 2 ** (p - 1)) / 2 ** (p - 1)
+    data = 2**E * P
+    return data
+
+
+def quant_and_dequant(data, num_bits):
+    sign = torch.sign(data)
+    abs_data = torch.abs(data).float()
+    amax, index = torch.max(
+        abs_data, -1, True
+    )  # 这个示例是做的per-channel量化，即对于(M,K)的矩阵，有M个量化参数(M个amax)
+    qscale, dqscale = get_scale_f32(amax, 6.0)
+    quant_data = round_to_FP4(abs_data * qscale)
+    dequant_data = (quant_data * dqscale * sign).to(data.dtype)
+    return sign * quant_data
+    return dequant_data
+
+
+class CLMLinear(nn.Linear):
+    def __init__(self, *kargs, weight_bits=1, input_bits=8, **kwargs):
+        super(CLMLinear, self).__init__(*kargs, **kwargs)
+        """
+        RMSNorm is placed outside BitLinear
+        """
+        self.weight_bits = weight_bits
+        self.input_bits = input_bits
+
+    def forward(self, input):
+        quant_input = (
+            input + (activation_quant(input, self.input_bits) - input).detach()
+        )
+        quant_weight = (
+            self.weight
+            + (quant_and_dequant(self.weight, self.weight_bits) - self.weight).detach()
+        )
+
+        out = nn.functional.linear(quant_input, quant_weight)
+        if not self.bias is None:
+            out += self.bias.view(1, -1).expand_as(out)
+
+        return out
--- a/hllm/log.py
+++ b/hllm/log.py
+import os
+from hllm.config import CFG
+
+
+class TCL_dependency:
+    def __init__(self, config: CFG, name: str, file_name: str, weights_file_name: str,use_weights: bool = True):
+        self.config = config
+        self.name = name
+        self.file_name = file_name
+        self.weights_file_name = weights_file_name
+        self.use_weights = use_weights
+
+    def __str__(self):
+        if self.use_weights:
+            path = os.path.join(
+                self.config.output_dir,
+                self.name,
+                self.weights_file_name,
+                self.file_name,
+            )
+        else:
+            path = os.path.join(
+                self.config.output_dir,
+                self.name,
+                self.file_name,
+            )
+        path = os.path.abspath(path)
+        return f"{path}\n"
+
+
+class TCL:
+    def __init__(self, config: CFG, weights_file_name: str):
+        self.config = config
+        self.dependencies = []
+        self.vlist = ""
+        self.weights_file_name = weights_file_name
+    def add_dependency(self, name: str, file_name: str,use_weights: bool = True):
+        self.dependencies.append(TCL_dependency(self.config, name, file_name, self.weights_file_name,use_weights))
+
+    def set_vlist(self, vlist: str):
+        self.vlist = vlist
+
+    def generate(self):
+        tcl = ""
+        for dependency in self.dependencies:
+            tcl += f"{dependency}\n"
+        unique_lines = sorted(set(tcl.strip().split("\n")))
+        result = "\n".join(unique_lines).strip()
+        result = f'set {self.vlist} "\n' + result + '\n"'
+
+        return result
--- a/hllm/optimized/.gitignore
+++ b/hllm/optimized/.gitignore
+mapped_weights
+output
+Optimized_HN
+Optimized_HN_mux
+WT_group
+Optimized_mux
+Mux_wrapper
+Mux
+build
+optimize_HN.egg-info
+Sub_wrapper
+dist
--- a/hllm/optimized/__init__.py
+++ b/hllm/optimized/__init__.py
--- a/hllm/optimized/generate_fsm.py
+++ b/hllm/optimized/generate_fsm.py
--- a/hllm/optimized/generate_info.py
+++ b/hllm/optimized/generate_info.py
+import os
+import json
+import pickle
+import numpy as np
+from tqdm import tqdm
+from multiprocessing import Pool
+from hllm.config import CFG
+from hllm.optimized.turbo_optimize_hn import generate_color_graph, greedy_coloring
+
+
+def process_weight(args):
+    index, weight, L, H, value_range, weights_file_name, path_dir = args
+    node, graph = generate_color_graph(L, H, value_range, weight)
+    os.makedirs(path_dir, exist_ok=True)
+    node_file = os.path.join(path_dir, f"info_tp_{weights_file_name}_vc_{index}.json")
+    with open(node_file, "w") as f:
+        json.dump({"node": node}, f)
+
+    max_color = []
+    for i in range(len(value_range)):
+        colors = greedy_coloring(graph[i], node[i])
+        color_file = os.path.join(
+            path_dir, f"info_tp_{weights_file_name}_vc_{index}_value_{i}.json"
+        )
+        max_color.append(max(colors) + 1)
+        with open(color_file, "w") as f:
+            json.dump({"color": colors}, f)
+
+    max_color_file = os.path.join(
+        path_dir, f"info_tp_{weights_file_name}_vc_{index}_ww.json"
+    )
+    with open(max_color_file, "w") as f:
+        json.dump({"ww": max_color}, f)
+
+
+def run(name: str, config: CFG):
+    weights_file = os.path.join(config.mapped_weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file_name}")
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+        matrixs = np.transpose(matrixs, (1, 0, 2))
+    VN, L, H = matrixs.shape
+    # print(VN, L, H)
+    path_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print("Generating color graph")
+
+    args = [
+        (i, weight, L, H, config.value_range, weights_file_name, path_dir)
+        for i, weight in enumerate(matrixs)
+    ]
+    with Pool(config.num_workers) as pool:
+        list(tqdm(pool.imap(process_weight, args), total=VN))
+    print("Generating color graph at", path_dir)
--- a/hllm/optimized/generate_layer_mux.py
+++ b/hllm/optimized/generate_layer_mux.py
+# %%
+import sys
+import numpy as np
+from pyrilog import (
+    VerilogGenerator,
+    ModuleBlock,
+    GenerateBlock,
+    ForBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_genvar,
+    add_assign,
+    add_wire,
+    add_body,
+    add_instance,
+    add_newline,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+from hllm.utils import calculate_WW
+
+
+# %%
+def generate_module(
+    module_name,
+    L,
+    value_range,
+    WW,
+):
+    """Build the pyrilog module for a layer mux named ``module_name``.
+
+    For every index ``i`` of ``value_range`` the module gets a parameter
+    ``WW_i`` (from ``WW[i]``), an input ``LM_in_i``, an output ``LM_out_i`` and
+    two internal wires. Three generate blocks then mask the inputs with
+    ``LM_sel``, transpose the masked wires, and OR-reduce each transposed row
+    into the output.
+
+    Returns the ``ModuleBlock`` object; callers invoke ``.generate()`` on it.
+    """
+    # Select width: number of bits needed to count up to L.
+    L_width = int(np.ceil(np.log2(L)))
+    with ModuleBlock(module_name) as module:
+        # Parameters
+        add_parameter("L", L)
+        for i in range(len(value_range)):
+            add_parameter(f"WW_{i}", WW[i])
+        # Inputs and outputs
+        add_input("LM_sel", L_width)
+        for i in range(len(value_range)):
+            add_input(f"LM_in_{i}", f"WW_{i}", "L")
+            add_output(f"LM_out_{i}", f"WW_{i}")
+        # Internal wires
+        for i in range(len(value_range)):
+            add_wire(name=f"LM_in_{i}_masked", width=f"WW_{i}", height="L")
+            add_wire(name=f"LM_in_{i}_masked_T", width="L", height=f"WW_{i}")
+        add_newline()
+        # LM_select_loop: AND every input row with its replicated select bit.
+        # NOTE(review): LM_sel is declared L_width bits wide but is indexed by
+        # i, which runs up to L-1 -- confirm the intended select encoding.
+        add_genvar("i")
+        with GenerateBlock():
+            with ForBlock("i=0", "i<L", "i=i+1", "LM_select_loop"):
+                for j in range(len(value_range)):
+                    add_body(
+                        f"assign LM_in_{j}_masked[i]=LM_in_{j}[i] & {{WW_{j}{{LM_sel[i]}}}};",
+                    )
+        add_newline()
+        # LM_transpose_loop_out: copy masked[k][j] into masked_T[j][k].
+        add_genvar("j")
+        add_genvar("k")
+        with GenerateBlock():
+            with ForBlock("k=0", "k<L", "k=k+1", "LM_transpose_loop_out"):
+                for i in range(len(value_range)):
+                    with ForBlock(
+                        "j=0", f"j<WW_{i}", "j=j+1", f"LM_transpose_loop_in_{i}"
+                    ):
+                        add_assign(
+                            f"LM_in_{i}_masked_T",
+                            ["j", "k"],
+                            f"LM_in_{i}_masked",
+                            ["k", "j"],
+                        )
+        add_newline()
+        # LM_reduce_or_loop: OR-reduce each transposed row into one output bit.
+        add_genvar("m")
+        with GenerateBlock():
+            for i in range(len(value_range)):
+                with ForBlock("m=0", f"m<WW_{i}", "m=m+1", f"LM_reduce_or_loop_{i}"):
+                    add_body(f"assign LM_out_{i}[m] = |(LM_in_{i}_masked_T[m]);")
+
+    return module
+
+
+# %%
+def process_task(i, name, weights_file_name, matrix, H, L, config: CFG):
+    try:
+        WW = calculate_WW(matrix, config.value_range)
+        file_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(file_dir, exist_ok=True)
+        file_name = os.path.join(file_dir, f"{name}_tp_{weights_file_name}_vc_{i}.sv")
+        module_name = f"{name}_tp_{weights_file_name}_vc_{i}"
+        with open(file_name, "w") as f:
+            f.write(
+                generate_module(
+                    module_name=module_name,
+                    L=L,
+                    value_range=config.value_range,
+                    WW=WW,
+                ).generate()
+            )
+        return i  # 返回任务ID以显示进度
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+
+
+def run(name: str, config: CFG):
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file}")
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+        matrixs = np.transpose(matrixs, (1, 0, 2))
+    VN, L, H = matrixs.shape
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        futures = [
+            executor.submit(
+                process_task, i, name, weights_file_name, matrixs[i], H, L, config
+            )
+            for i in range(VN)
+        ]
+        for future in tqdm(as_completed(futures), total=VN):
+            try:
+                result = future.result()
+            except Exception as e:
+                print(f"Generating {result} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print("Files generated in", file_dir)
--- a/hllm/optimized/generate_mid_wrapper.py
+++ b/hllm/optimized/generate_mid_wrapper.py
+# %%
+import json
+import sys
+import numpy as np
+from pyrilog import (
+    VerilogGenerator,
+    ModuleBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_assign,
+    add_wire,
+    add_instance,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+from hllm.log import TCL
+from hllm.utils import calculate_WW
+
+
+# %%
+def generate_module(
+    cur_GP=0,
+    module_name="",
+    H=16,
+    L=5,
+    VN=512,
+    value_range=[-1, 1],
+    weights_file_name=None,
+    config: CFG = None,
+    ww_list=None,
+):
+    tcl = TCL(config, weights_file_name)
+    tcl.set_vlist(f"VLIST_tp_{weights_file_name}_gp_{cur_GP}")
+    L_width = int(np.ceil(np.log2(L)))
+    with ModuleBlock(module_name) as module:
+        GN = config.group_number
+        GP = int(VN / GN)
+        # 参数
+        add_parameter("H", H)
+        add_parameter("L", L)
+        add_parameter("VN", GN)
+        # 输入输出
+        add_input("clk")
+        add_input("tree_rstn")
+        add_input("valid")
+        add_input("CST_LOW")
+        add_input("LM_sel", L_width)
+        add_input("SW_in", "H")
+        for i in range(len(value_range)):
+            add_output(name=f"WT_{i}_out_S", height="VN")
+            add_output(name=f"WT_{i}_out_C", height="VN")
+
+        # 内部连线
+        for i in range(GN):
+            sw_ports = {
+                "clk": "clk",
+                "tree_rstn": "tree_rstn",
+                "valid": "valid",
+                "CST_LOW": "CST_LOW",
+                "SW_in": "SW_in",
+                "LM_sel": "LM_sel",
+            }
+            for j in range(len(value_range)):
+                sw_ports[f"WT_{j}_out_S"] = f"WT_{j}_out_S[{i}]"
+                sw_ports[f"WT_{j}_out_C"] = f"WT_{j}_out_C[{i}]"
+            add_instance(
+                f"Sub_wrapper_tp_{weights_file_name}_vc_{cur_GP*GN+i}",
+                f"Sub_wrapper_{cur_GP*GN+i}",
+                None,
+                sw_ports,
+            )
+            tcl.add_dependency(
+                f"Sub_wrapper",
+                f"Sub_wrapper_tp_{weights_file_name}_vc_{cur_GP*GN+i}.sv",
+            )
+            tcl.add_dependency(
+                f"WT_group",
+                f"WT_group_tp_{weights_file_name}_vc_{cur_GP*GN+i}.sv",
+            )
+            tcl.add_dependency(
+                f"Mid_wrapper",
+                f"Mid_wrapper_tp_{weights_file_name}_gp_{cur_GP}.sv",
+            )
+            tcl.add_dependency(
+                f"Layer_mux",
+                f"Layer_mux_tp_{weights_file_name}_vc_{cur_GP*GN+i}.sv",
+            )
+            tcl.add_dependency(
+                f"FSM",
+                f"FSM_tp_{weights_file_name}_gp_{cur_GP}.sv",
+            )
+            for j in range(len(value_range)):
+                tcl.add_dependency(
+                    f"Mux_wrapper",
+                    f"Mux_wrapper_tp_{weights_file_name}_vc_{cur_GP*GN+i}_value_{j}.sv",
+                )
+                tcl.add_dependency(
+                    f"Mux",
+                    f"Mux_tp_{weights_file_name}_vc_{cur_GP*GN+i}_value_{j}.sv",
+                )
+            for line in ww_list[cur_GP * GN + i]:
+                tcl.add_dependency(
+                    f"SerialWallaceTree",
+                    f"SerialWallaceTree{line}Input.v",
+                    use_weights=False,
+                )
+
+    return module, tcl
+
+
+# %%
+def process_task(i, name, weights_file_name, ww_list, H, L, VN, config: CFG):
+    try:
+        file_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(file_dir, exist_ok=True)
+        file_name = os.path.join(file_dir, f"{name}_tp_{weights_file_name}_gp_{i}.sv")
+        file_name_tcl = os.path.join(
+            file_dir, f"{name}_tp_{weights_file_name}_gp_{i}.tcl"
+        )
+        module_name = f"{name}_tp_{weights_file_name}_gp_{i}"
+        module, tcl = generate_module(
+            i,
+            module_name=module_name,
+            H=H,
+            L=L,
+            VN=VN,
+            value_range=config.value_range,
+            weights_file_name=weights_file_name,
+            config=config,
+            ww_list=ww_list,
+        )
+        with open(file_name, "w") as f:
+            f.write(module.generate())
+        with open(file_name_tcl, "w") as f:
+            f.write(tcl.generate())
+        return i  # 返回任务ID以显示进度
+    except Exception as e:
+        print(
+            f"Generating {i} failed with an error at line {sys.exc_info()[2].tb_lineno}: {e}"
+        )
+        return None
+
+
+def run(name: str, config: CFG):
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file}")
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+        matrixs = np.transpose(matrixs, (1, 0, 2))
+    VN, L, H = matrixs.shape
+    GP = int(VN / config.group_number)
+
+    ww_list = []
+    ww_files = [
+        os.path.join(
+            config.output_dir,
+            "info",
+            weights_file_name,
+            f"info_tp_{weights_file_name}_vc_{i}_ww.json",
+        )
+        for i in range(VN)
+    ]
+    for ww_file in ww_files:
+        with open(ww_file, "r") as f:
+            ww = json.load(f)
+            ww_list.append(ww["ww"])
+
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        futures = [
+            executor.submit(
+                process_task, i, name, weights_file_name, ww_list, H, L, VN, config
+            )
+            for i in range(GP)
+        ]
+        for future in tqdm(as_completed(futures), total=GP):
+            try:
+                result = future.result()
+            except Exception as e:
+                print(f"Generating {result} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print("Files generated in", file_dir)
--- a/hllm/optimized/generate_mux.py
+++ b/hllm/optimized/generate_mux.py
+# %%
+import sys
+import numpy as np
+
+from pyrilog import (
+    ModuleBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_assign,
+    add_body,
+    add_newline,
+    add_reg,
+    add_wire,
+    AlwaysBlock,
+    IfBlock,
+    ElseBlock,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+import json
+
+
+# %%
+def generate_module(
+    matrix,
+    module_name="mux",
+    H=16,
+    L=5,
+    value_range=[-1, 1],
+    WW=[8, 8],
+    CUR_VN=0,
+    CUR_VALUE_INDEX=0,
+    CUR_CNT=0,
+    node=None,
+    color=None,
+    weights_file_name="",
+):
+    with ModuleBlock(f"{module_name}") as module:
+        add_input("in", H)
+        L_width = int(np.ceil(np.log2(L)))
+        add_input("sel", L_width)
+        add_output("out")
+
+        layer_to_in_map = {}
+        for i, hn_in_layers in enumerate(node):
+            if color[i] == -1 or color[i] != CUR_CNT:
+                continue
+            for j, hn_in_layer in enumerate(hn_in_layers):
+                layer_to_in_map[hn_in_layer] = i
+        add_reg("par_out", L)
+
+        with AlwaysBlock("*"):
+            for i in range(L):
+                with IfBlock(f"sel == {L_width}'b{i:0{L_width}b}"):
+                    if i in layer_to_in_map:
+                        add_body(f"par_out[{i}]=in[{layer_to_in_map[i]}];")
+                    else:
+                        add_body(f"par_out[{i}]=0;")
+                with ElseBlock():
+                    add_body(f"par_out[{i}]=0;")
+        add_assign("out", [], " | ".join([f"par_out[{i}]" for i in range(L)]), [])
+
+    return module
+
+
+# %%
+def process_task(i, name, weights_file_name, matrix, H, L, config: CFG):
+    try:
+        file_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(file_dir, exist_ok=True)
+
+        node_file = os.path.join(
+            config.output_dir,
+            "info",
+            weights_file_name,
+            f"info_tp_{weights_file_name}_vc_{i}.json",
+        )
+        node = json.load(open(node_file))["node"]
+
+        for j in range(len(config.value_range)):
+            color_file = os.path.join(
+                config.output_dir,
+                "info",
+                weights_file_name,
+                f"info_tp_{weights_file_name}_vc_{i}_value_{j}.json",
+            )
+            color = json.load(open(color_file))["color"]
+            max_mux_port = max(color) + 1
+            text = ""
+            for k in range(max_mux_port):
+                text += generate_module(
+                    matrix,
+                    module_name=f"{name}_tp_{weights_file_name}_vc_{i}_value_{j}_color_{k}",
+                    H=H,
+                    L=L,
+                    value_range=config.value_range,
+                    node=node[j],
+                    color=color,
+                    CUR_VN=i,
+                    CUR_VALUE_INDEX=j,
+                    CUR_CNT=k,
+                    weights_file_name=weights_file_name,
+                ).generate()
+                text += "\n"
+            file_name = os.path.join(
+                file_dir,
+                f"{name}_tp_{weights_file_name}_vc_{i}_value_{j}.sv",
+            )
+            with open(file_name, "w") as f:
+                f.write(text)
+        return i  # 返回任务ID以显示进度
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+
+
+def run(name: str, config: CFG):
+    weights_file = os.path.join(config.mapped_weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file_name}")
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+        matrixs = np.transpose(matrixs, (1, 0, 2))
+    VN, L, H = matrixs.shape
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        futures = [
+            executor.submit(
+                process_task, i, name, weights_file_name, matrixs[i], H, L, config
+            )
+            for i in range(VN)
+        ]
+        for future in tqdm(as_completed(futures), total=VN):
+            try:
+                result = future.result()
+            except Exception as e:
+                print(f"Generating {result} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print(f"Generated {name} at {file_dir}")
+
+
+# %%
--- a/hllm/optimized/generate_mux_wrapper.py
+++ b/hllm/optimized/generate_mux_wrapper.py
+# %%
+import sys
+import numpy as np
+
+from pyrilog import (
+    ModuleBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_assign,
+    add_body,
+    add_newline,
+    add_instance,
+    add_reg,
+    add_wire,
+    AlwaysBlock,
+    IfBlock,
+    ElseBlock,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+import json
+
+
+# %%
+def generate_module(
+    module_name="Mux_wrapper",
+    H=16,
+    L=5,
+    value_range=[-1, 1],
+    CUR_VN=0,
+    CUR_VALUE_INDEX=0,
+    max_mux_port=0,
+    node=None,
+    color=None,
+    weights_file_name="",
+    config: CFG = None,
+    name="",
+):
+    with ModuleBlock(f"{module_name}") as module:
+        add_input("in", H)
+        L_width = int(np.ceil(np.log2(L)))
+        add_input("sel", L_width)
+        add_output("out", max_mux_port)
+
+        for i in range(max_mux_port):
+            add_instance(
+                module_name=f"Mux_tp_{weights_file_name}_vc_{CUR_VN}_value_{CUR_VALUE_INDEX}_color_{i}",
+                instance_name=f"Mux_{i}",
+                parameters={},
+                ports={"in": "in", "sel": "sel", "out": f"out[{i}]"},
+            )
+    return module
+
+
+# %%
+def process_task(i, name, weights_file_name, H, L, config: CFG):
+    try:
+        file_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(file_dir, exist_ok=True)
+
+        node_file = os.path.join(
+            config.output_dir,
+            "info",
+            weights_file_name,
+            f"info_tp_{weights_file_name}_vc_{i}.json",
+        )
+        node = json.load(open(node_file))["node"]
+        for j in range(len(config.value_range)):
+            color_file = os.path.join(
+                config.output_dir,
+                "info",
+                weights_file_name,
+                f"info_tp_{weights_file_name}_vc_{i}_value_{j}.json",
+            )
+            color = json.load(open(color_file))["color"]
+            max_mux_port = max(color) + 1
+            text = generate_module(
+                module_name=f"{name}_tp_{weights_file_name}_vc_{i}_value_{j}",
+                H=H,
+                L=L,
+                value_range=config.value_range,
+                node=node[j],
+                color=color,
+                CUR_VN=i,
+                CUR_VALUE_INDEX=j,
+                max_mux_port=max_mux_port,
+                weights_file_name=weights_file_name,
+                config=config,
+                name=name,
+            ).generate()
+            file_name = os.path.join(
+                file_dir,
+                f"{name}_tp_{weights_file_name}_vc_{i}_value_{j}.sv",
+            )
+            with open(file_name, "w") as f:
+                f.write(text)
+        return i  # 返回任务ID以显示进度
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+
+
+def run(name: str, config: CFG):
+    os.makedirs(config.output_dir, exist_ok=True)
+    weights_file = os.path.join(config.mapped_weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file}")
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+        matrixs = np.transpose(matrixs, (1, 0, 2))
+    VN, L, H = matrixs.shape
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        futures = [
+            executor.submit(process_task, i, name, weights_file_name, H, L, config)
+            for i in range(VN)
+        ]
+        for future in tqdm(as_completed(futures), total=VN):
+            try:
+                result = future.result()
+            except Exception as e:
+                print(f"Generating {result} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print(f"Generated {name} at {file_dir}")
--- a/hllm/optimized/generate_sub_wrapper.py
+++ b/hllm/optimized/generate_sub_wrapper.py
+# %%
+import sys
+import numpy as np
+from pyrilog import (
+    VerilogGenerator,
+    ModuleBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_assign,
+    add_wire,
+    add_instance,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+import json
+
+
+# %%
+def generate_module(
+    matrix,
+    module_name="",
+    weights_file_name="",
+    H=16,
+    L=5,
+    VN_index=1,
+    value_range=[-1, 1],
+    WW=[8, 8],
+    name="",
+    config: CFG = None,
+):
+    L_width = int(np.ceil(np.log2(L)))
+    module_name_suffix = f"_tp_{weights_file_name}_vc_{VN_index}"
+    with ModuleBlock(f"{module_name}") as module:
+        # 参数
+        add_parameter("H", H)
+        add_parameter("L", L)
+        for i in range(len(value_range)):
+            add_parameter(f"WW_{i}", WW[i])
+        # 输入输出
+        add_input("clk")
+        add_input("tree_rstn")
+        add_input("valid")
+        add_input("CST_LOW")
+        add_input("LM_sel", L_width)
+        add_input("SW_in", "H")
+        for i in range(len(value_range)):
+            add_output(
+                name=f"WT_{i}_out_S",
+            )
+            add_output(
+                name=f"WT_{i}_out_C",
+            )
+        for i in range(len(value_range)):
+            add_wire(
+                name=f"LM_out_{i}",
+                width=f"WW_{i}",
+            )
+
+        # 实例化mux_wrapper
+        for j in range(len(value_range)):
+            mw_params = {}
+            mw_ports = {"in": "SW_in", "sel": "LM_sel", "out": f"LM_out_{j}"}
+            add_instance(
+                f"Mux_wrapper_tp_{weights_file_name}_vc_{VN_index}_value_{j}",
+                f"Mux_wrapper_{j}",
+                mw_params,
+                mw_ports,
+            )
+
+        # 实例化WT
+        wt_params = {}
+        for i in range(len(value_range)):
+            wt_params[f"WW_{i}"] = f"WW_{i}"
+        wt_ports = {"clk": "clk", "tree_rstn": "tree_rstn", "valid": "valid"}
+        for i in range(len(value_range)):
+            wt_ports[f"WT_{i}_in"] = f"LM_out_{i}"
+            wt_ports[f"WT_{i}_out_S"] = f"WT_{i}_out_S"
+            wt_ports[f"WT_{i}_out_C"] = f"WT_{i}_out_C"
+        add_instance("WT_group" + module_name_suffix, "WT_group", wt_params, wt_ports)
+    return module
+
+
+# %%
+def process_task(i, name, weights_file_name, matrix, H, L, config: CFG):
+    try:
+        WW = [0] * len(config.value_range)
+        for j in range(len(config.value_range)):
+            color_file = os.path.join(
+                config.output_dir,
+                "info",
+                weights_file_name,
+                f"info_tp_{weights_file_name}_vc_{i}_value_{j}.json",
+            )
+            color = json.load(open(color_file))["color"]
+            WW[j] = max(color) + 1
+        file_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(file_dir, exist_ok=True)
+        file_name = os.path.join(file_dir, f"{name}_tp_{weights_file_name}_vc_{i}.sv")
+        with open(file_name, "w") as f:
+            module_name = f"{name}_tp_{weights_file_name}_vc_{i}"
+            f.write(
+                generate_module(
+                    matrix,
+                    module_name=module_name,
+                    H=H,
+                    weights_file_name=weights_file_name,
+                    L=L,
+                    VN_index=i,
+                    value_range=config.value_range,
+                    WW=WW,
+                    name=name,
+                    config=config,
+                ).generate()
+            )
+        return i  # 返回任务ID以显示进度
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+
+
+def run(name, config: CFG):
+    weights_file = os.path.join(config.mapped_weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file_name}")
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+        matrixs = np.transpose(matrixs, (1, 0, 2))
+        VN, L, H = matrixs.shape
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        futures = [
+            executor.submit(
+                process_task, i, name, weights_file_name, matrixs[i], H, L, config
+            )
+            for i in range(VN)
+        ]
+        for future in tqdm(as_completed(futures), total=VN):
+            try:
+                result = future.result()
+            except Exception as e:
+                print(f"Generating {result} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print(f"Generated {name} at {file_dir}")
--- a/hllm/optimized/generate_wrappers.py
+++ b/hllm/optimized/generate_wrappers.py
+# %%
+import sys
+import numpy as np
+from pyrilog import (
+    VerilogGenerator,
+    ModuleBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_assign,
+    add_wire,
+    add_instance,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+
+from hllm.utils import calculate_WW
+
+
+# %%
+def generate_module(
+    module_name,
+    H=16,
+    L=5,
+    VN=512,
+    value_range=[-1, 1],
+    weights_file_name=None,
+    config: CFG = None,
+):
+    L_width = int(np.ceil(np.log2(L)))
+    with ModuleBlock(module_name) as module:
+        GN = config.group_number
+        GP = int(VN / GN)
+        # 参数
+        add_parameter("H", H)
+        add_parameter("L", L)
+        add_parameter("VN", VN)
+        # 输入输出
+        add_input("clk")
+        add_input("tree_rstn")
+        add_input("valid")
+        add_input("CST_LOW")
+        add_input("LM_sel", L_width)
+        add_input("SW_in", "H")
+        for i in range(len(value_range)):
+            add_output(name=f"WT_{i}_out_S", height="VN")
+            add_output(name=f"WT_{i}_out_C", height="VN")
+
+        # 内部连线
+        for i in range(GP):
+            sw_ports = {
+                "clk": "clk",
+                "tree_rstn": "tree_rstn",
+                "valid": "valid",
+                "CST_LOW": "CST_LOW",
+                "SW_in": "SW_in",
+                "LM_sel": "LM_sel",
+            }
+            for j in range(len(value_range)):
+                sw_ports[f"WT_{j}_out_S"] = f"WT_{j}_out_S[{i*GN+GN-1}:{i*GN}]"
+                sw_ports[f"WT_{j}_out_C"] = f"WT_{j}_out_C[{i*GN+GN-1}:{i*GN}]"
+            add_instance(
+                f"Mid_wrapper_tp_{weights_file_name}_gp_{i}",
+                f"Mid_wrapper_{i}",
+                None,
+                sw_ports,
+            )
+    return module
+
+
+# %%
+def process_task(i, name, weights_file_name, H, L, VN, config: CFG):
+    try:
+        file_dir = os.path.join(config.output_dir, name)
+        os.makedirs(file_dir, exist_ok=True)
+        file_name = os.path.join(file_dir, f"{name}_tp_{weights_file_name}.sv")
+        module_name = f"{name}_tp_{weights_file_name}"
+        with open(file_name, "w") as f:
+            f.write(
+                generate_module(
+                    module_name,
+                    H=H,
+                    L=L,
+                    VN=VN,
+                    value_range=config.value_range,
+                    weights_file_name=weights_file_name,
+                    config=config,
+                ).generate()
+            )
+        return i  # 返回任务ID以显示进度
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+
+
+def run(name: str, config: CFG):
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file}")
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+        matrixs = np.transpose(matrixs, (1, 0, 2))
+        VN, L, H = matrixs.shape
+        with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+            futures = [
+                executor.submit(
+                    process_task,
+                    i,
+                    name,
+                    weights_file_name,
+                    H,
+                    L,
+                    VN,
+                    config,
+                )
+                for i in range(1)
+            ]
+            for future in tqdm(as_completed(futures), total=1):
+                try:
+                    result = future.result()
+                except Exception as e:
+                    print(f"Generating {result} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print("Files generated in", file_dir)
--- a/hllm/optimized/generate_wt_group.py
+++ b/hllm/optimized/generate_wt_group.py
+# %%
+import sys
+import numpy as np
+
+from pyrilog import (
+    ModuleBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_assign,
+    add_body,
+    add_newline,
+    add_instance,
+    add_reg,
+    add_wire,
+    AlwaysBlock,
+    IfBlock,
+    ElseBlock,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+import json
+
+
+# %%
+def generate_module(
+    module_name="mux_wrapper",
+    H=16,
+    L=5,
+    value_range=[-1, 1],
+    CUR_VN=0,
+    weights_file_name="",
+    config: CFG = None,
+    name="",
+):
+    with ModuleBlock(f"{module_name}") as module:
+        ww = []
+        for i in range(len(value_range)):
+            color_file = os.path.join(
+                config.output_dir,
+                "info",
+                weights_file_name,
+                f"info_tp_{weights_file_name}_vc_{CUR_VN}_value_{i}.json",
+            )
+            color = json.load(open(color_file))["color"]
+            max_mux_port = max(color) + 1
+            ww.append(max_mux_port)
+        for i in range(len(value_range)):
+            add_parameter(f"WW_{i}", ww[i])
+        add_input("clk")
+        add_input("tree_rstn")
+        add_input("valid")
+        for i in range(len(value_range)):
+            add_input(f"WT_{i}_in", f"WW_{i}")
+            add_output(
+                f"WT_{i}_out_S",
+            )
+            add_output(
+                f"WT_{i}_out_C",
+            )
+        # 内部连线华莱士树
+        for i in range(len(value_range)):
+            wallace_name = f"SerialWallaceTree{ww[i]}Input"
+            wallace_port = {
+                "clk": "clk",
+                "rstn": "tree_rstn",
+                "valid": "valid",
+                "addends": f"WT_{i}_in",
+                "out_S": f"WT_{i}_out_S",
+                "out_Cout": f"WT_{i}_out_C",
+            }
+            add_instance(wallace_name, f"serial_wallace_tree_{i}", {}, wallace_port)
+
+    return module
+
+
+# %%
+def process_task(i, name, weights_file_name, H, L, config: CFG):
+    try:
+        file_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(file_dir, exist_ok=True)
+
+        file_name = os.path.join(
+            file_dir,
+            f"{name}_tp_{weights_file_name}_vc_{i}.sv",
+        )
+        with open(file_name, "w") as f:
+            text = generate_module(
+                module_name=f"{name}_tp_{weights_file_name}_vc_{i}",
+                H=H,
+                L=L,
+                value_range=config.value_range,
+                CUR_VN=i,
+                weights_file_name=weights_file_name,
+                config=config,
+                name=name,
+            ).generate()
+            f.write(text)
+        return i  # 返回任务ID以显示进度
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+
+
+def run(name: str, config: CFG):
+    os.makedirs(config.output_dir, exist_ok=True)
+    weights_file = os.path.join(config.mapped_weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file_name}")
+    with open(weights_file, "rb") as f:
+        print(f"Loading {weights_file_name}")
+        matrixs = pickle.load(f)
+        VN, L, H = matrixs.shape
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        futures = [
+            executor.submit(process_task, i, name, weights_file_name, H, L, config)
+            for i in range(VN)
+        ]
+        for future in tqdm(as_completed(futures), total=VN):
+            try:
+                result = future.result()
+            except Exception as e:
+                print(f"Generating {result} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print(f"Generated {name} at {file_dir}")
+
+
+# %%
--- a/hllm/optimized/turbo_optimize_hn.cpp
+++ b/hllm/optimized/turbo_optimize_hn.cpp
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <set>
+#include <stdexcept>
+#include <vector>
+
+using namespace std;
+namespace py = pybind11;
+
+// Greedy graph coloring over an adjacency matrix.
+//
+// adjMatrix: square matrix; a non-zero entry [a][b] marks a and b as adjacent.
+// layer:     one list per node; a node whose list is empty is skipped and
+//            keeps color -1.
+// Returns one color per node: the smallest non-negative color not used by an
+// already-colored neighbor, or -1 for skipped nodes. Nodes are processed in
+// index order.
+// Throws std::invalid_argument if adjMatrix is not square or if layer has
+// fewer entries than adjMatrix has nodes.
+vector<int> greedyGraphColoring(const vector<vector<int>> &adjMatrix, const vector<vector<int>> &layer)
+{
+  const size_t n = adjMatrix.size();
+  vector<int> colors(n, -1); // -1 means "not colored"
+
+  // Validate the inputs before indexing into them.
+  for (const auto &row : adjMatrix)
+  {
+    if (row.size() != n)
+    {
+      throw invalid_argument("邻接矩阵必须是方阵");
+    }
+  }
+  if (layer.size() < n)
+  {
+    throw invalid_argument("layer must contain one entry per node of adj_matrix");
+  }
+
+  for (size_t node = 0; node < n; ++node)
+  {
+    // Nodes without any layer stay uncolored.
+    if (layer[node].empty())
+    {
+      continue;
+    }
+
+    // Collect the colors already taken by adjacent nodes.
+    set<int> usedColors;
+    for (size_t neighbor = 0; neighbor < n; ++neighbor)
+    {
+      if (adjMatrix[node][neighbor] && colors[neighbor] != -1)
+      {
+        usedColors.insert(colors[neighbor]);
+      }
+    }
+
+    // Pick the smallest color that no neighbor uses.
+    int color = 0;
+    while (usedColors.count(color) != 0)
+    {
+      ++color;
+    }
+    colors[node] = color;
+  }
+
+  return colors;
+}
+
+// Returns true when node1 and node2 share at least one element.
+//
+// The lookup table is sized from the actual value range of node1 instead of a
+// fixed 100-entry array, so values >= 100 or negative values no longer index
+// out of bounds. Memory use is proportional to (max - min + 1) of node1.
+bool hasIntersection(const vector<int> &node1, const vector<int> &node2)
+{
+  if (node1.empty() || node2.empty())
+  {
+    return false;
+  }
+
+  // Find the value range covered by node1.
+  int lo = node1.front();
+  int hi = node1.front();
+  for (int elem : node1)
+  {
+    if (elem < lo)
+      lo = elem;
+    if (elem > hi)
+      hi = elem;
+  }
+
+  // Widen before subtracting so extreme int values cannot overflow.
+  const size_t span = static_cast<size_t>(static_cast<long long>(hi) - lo) + 1;
+  vector<char> bucket(span, 0);
+  for (int elem : node1)
+  {
+    bucket[static_cast<size_t>(static_cast<long long>(elem) - lo)] = 1;
+  }
+
+  // Anything outside [lo, hi] cannot be in node1, so skip the lookup.
+  for (int elem : node2)
+  {
+    if (elem >= lo && elem <= hi &&
+        bucket[static_cast<size_t>(static_cast<long long>(elem) - lo)])
+    {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Builds, for every value index, the per-column layer lists and the conflict
+// graph between columns.
+//
+// L, W:        number of rows (layers) and columns of `matrix`.
+// value_range: only its size is used; it bounds the valid matrix entries.
+// matrix:      L*W integers read in row-major order; an entry is either -1
+//              (ignored) or an index into value_range.
+//              NOTE(review): the buffer is read as C-contiguous; confirm that
+//              callers never pass a strided view.
+//
+// Returns (node, graph):
+//   node[v][c]     = ascending list of rows r where matrix[r][c] == v
+//   graph[v][a][b] = 1 when node[v][a] and node[v][b] share a row, else 0
+//
+// Throws std::invalid_argument when L or W is negative or the buffer does not
+// hold exactly L*W elements, and std::out_of_range when an entry is neither
+// -1 nor a valid index into value_range.
+tuple<vector<vector<vector<int>>>, vector<vector<vector<int>>>>
+generateColorGraph(int L, int W, const vector<double> &value_range,
+                   py::array_t<int> &matrix)
+{
+  if (L < 0 || W < 0)
+  {
+    throw invalid_argument("L and W must be non-negative");
+  }
+
+  auto buf = matrix.request();
+  if (static_cast<long long>(buf.size) != static_cast<long long>(L) * W)
+  {
+    throw invalid_argument("matrix must contain exactly L * W elements");
+  }
+  const int *ptr = static_cast<const int *>(buf.ptr);
+
+  const size_t valueCount = value_range.size();
+  vector<vector<vector<int>>> node(valueCount, vector<vector<int>>(W));
+  vector<vector<vector<int>>> graph(valueCount,
+                                    vector<vector<int>>(W, vector<int>(W, 0)));
+
+  // Build node: walk every column top to bottom, reading the row-major buffer
+  // directly (no transposed copy needed).
+  for (int col = 0; col < W; col++)
+  {
+    for (int row = 0; row < L; row++)
+    {
+      const int val = ptr[static_cast<size_t>(row) * W + col];
+      if (val == -1)
+      {
+        continue;
+      }
+      if (val < 0 || static_cast<size_t>(val) >= valueCount)
+      {
+        throw out_of_range("matrix entry is neither -1 nor a valid index into value_range");
+      }
+      node[val][col].push_back(row);
+    }
+  }
+
+  // Build graph: intersection is symmetric, so evaluate each pair once and
+  // mirror the result.
+  for (size_t v = 0; v < valueCount; v++)
+  {
+    for (int j = 0; j < W; j++)
+    {
+      if (node[v][j].empty())
+      {
+        continue;
+      }
+      for (int k = j; k < W; k++)
+      {
+        if (node[v][k].empty())
+        {
+          continue;
+        }
+        const int hit = hasIntersection(node[v][j], node[v][k]) ? 1 : 0;
+        graph[v][j][k] = hit;
+        graph[v][k][j] = hit;
+      }
+    }
+  }
+
+  return make_tuple(node, graph);
+}
+
+// Python module definition: exposes the two functions above as
+// turbo_optimize_hn.greedy_coloring and turbo_optimize_hn.generate_color_graph.
+PYBIND11_MODULE(turbo_optimize_hn, m)
+{
+  m.doc() = "图着色贪心算法模块";
+
+  // greedy_coloring(adj_matrix, layer)
+  m.def("greedy_coloring",
+        &greedyGraphColoring,
+        "基于贪心算法的图着色实现",
+        py::arg("adj_matrix"),
+        py::arg("layer"));
+
+  // generate_color_graph(L, W, value_range, matrix)
+  m.def("generate_color_graph",
+        &generateColorGraph,
+        "生成颜色图",
+        py::arg("L"),
+        py::arg("W"),
+        py::arg("value_range"),
+        py::arg("matrix"));
+}
\ No newline at end of file
--- a/hllm/origin/.python-version
+++ b/hllm/origin/.python-version
+3.11
--- a/hllm/origin/__init__.py
+++ b/hllm/origin/__init__.py
--- a/hllm/origin/generate_fsm.py
+++ b/hllm/origin/generate_fsm.py
--- a/hllm/origin/generate_hn.py
+++ b/hllm/origin/generate_hn.py
+# %%
+import sys
+import numpy as np
+from pyrilog import ModuleBlock, add_parameter, add_input, add_output, add_assign
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+from hllm.utils import calculate_WW, find_index
+
+
+# %%
+def generate_module(
+    matrix,
+    module_name="HN",
+    H=16,
+    L=5,
+    value_range=[-1, 1],
+    WW=[8, 8],
+):
+    # with VerilogGenerator() as generator:
+    with ModuleBlock(f"{module_name}") as module:
+        add_parameter("H", H)
+        add_parameter("L", L)
+        for i in range(len(value_range)):
+            add_parameter(f"WW_{i}", WW[i])
+
+        add_input("HN_in", "H")
+        add_input("CST_LOW")
+        for i in range(len(value_range)):
+            add_output(
+                f"HN_out_{i}",
+                f"WW_{i}",
+                "L",
+            )
+        # 内部连线
+        for i, layer in enumerate(matrix):
+            weight_cnt = [0] * len(value_range)
+            for j, weight in enumerate(layer):
+                # 跳0
+                if abs(weight) < 1e-3:
+                    continue
+                try:
+                    index = find_index(value_range, weight)
+                except ValueError:
+                    print(f"weight {weight} not found")
+                    continue
+                add_assign(
+                    f"HN_out_{index}",
+                    [i, weight_cnt[index]],
+                    "HN_in",
+                    [j],
+                )
+                weight_cnt[index] += 1
+            for j in range(len(weight_cnt)):
+                while weight_cnt[j] < WW[j]:
+                    add_assign(f"HN_out_{j}", [i, weight_cnt[j]], "CST_LOW", [])
+                    weight_cnt[j] += 1
+    return module
+
+
+# %%
+def process_task(i, name, weights_file_name, matrix, H, L, config: CFG):
+    try:
+        WW = calculate_WW(matrix, config.value_range)
+        file_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(file_dir, exist_ok=True)
+        file_name = os.path.join(file_dir, f"{name}_tp_{weights_file_name}_vc_{i}.sv")
+        module_name = f"{name}_tp_{weights_file_name}_vc_{i}"
+        with open(file_name, "w") as f:
+            text = generate_module(
+                matrix,
+                module_name=module_name,
+                H=H,
+                L=L,
+                value_range=config.value_range,
+                WW=WW,
+            ).generate()
+            f.write(text)
+        return i  # 返回任务ID以显示进度
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+
+
+def run(name: str, config: CFG):
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file}")
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+        matrixs = np.transpose(matrixs, (1, 0, 2))
+    VN, L, H = matrixs.shape
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        futures = [
+            executor.submit(
+                process_task, i, name, weights_file_name, matrixs[i], H, L, config
+            )
+            for i in range(VN)
+        ]
+        for future in tqdm(as_completed(futures), total=VN):
+            try:
+                result = future.result()
+            except Exception as e:
+                print(f"Generating {result} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print("Files generated in", file_dir)
--- a/hllm/origin/generate_layer_mux.py
+++ b/hllm/origin/generate_layer_mux.py
+# %%
+import sys
+import numpy as np
+from pyrilog import (
+    VerilogGenerator,
+    ModuleBlock,
+    GenerateBlock,
+    ForBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_genvar,
+    add_assign,
+    add_wire,
+    add_body,
+    add_instance,
+    add_newline,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+from hllm.utils import calculate_WW
+
+
+# %%
+def generate_module(
+    module_name,
+    L,
+    value_range,
+    WW,
+):
+    with ModuleBlock(module_name) as module:
+        # 参数
+        add_parameter("L", L)
+        for i in range(len(value_range)):
+            add_parameter(f"WW_{i}", WW[i])
+        # 输入输出
+        add_input("LM_sel", "L")
+        for i in range(len(value_range)):
+            add_input(f"LM_in_{i}", f"WW_{i}", "L")
+            add_output(f"LM_out_{i}", f"WW_{i}")
+        # 内部连线
+        for i in range(len(value_range)):
+            add_wire(name=f"LM_in_{i}_masked", width=f"WW_{i}", height="L")
+            add_wire(name=f"LM_in_{i}_masked_T", width="L", height=f"WW_{i}")
+        add_newline()
+        # LM_select_loop
+        add_genvar("i")
+        with GenerateBlock():
+            with ForBlock("i=0", "i<L", "i=i+1", "LM_select_loop"):
+                for j in range(len(value_range)):
+                    add_body(
+                        f"assign LM_in_{j}_masked[i]=LM_in_{j}[i] & {{WW_{j}{{LM_sel[i]}}}};",
+                    )
+        add_newline()
+        # LM_transpose_loop_out
+        add_genvar("j")
+        add_genvar("k")
+        with GenerateBlock():
+            with ForBlock("k=0", "k<L", "k=k+1", "LM_transpose_loop_out"):
+                for i in range(len(value_range)):
+                    with ForBlock(
+                        "j=0", f"j<WW_{i}", "j=j+1", f"LM_transpose_loop_in_{i}"
+                    ):
+                        add_assign(
+                            f"LM_in_{i}_masked_T",
+                            ["j", "k"],
+                            f"LM_in_{i}_masked",
+                            ["k", "j"],
+                        )
+        add_newline()
+        # LM_reduce_or_loop
+        add_genvar("m")
+        with GenerateBlock():
+            for i in range(len(value_range)):
+                with ForBlock("m=0", f"m<WW_{i}", "m=m+1", f"LM_reduce_or_loop_{i}"):
+                    add_body(f"assign LM_out_{i}[m] = |(LM_in_{i}_masked_T[m]);")
+
+    return module
+
+
+# %%
+def process_task(i, name, weights_file_name, matrix, H, L, config: CFG):
+    try:
+        WW = calculate_WW(matrix, config.value_range)
+        file_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(file_dir, exist_ok=True)
+        file_name = os.path.join(file_dir, f"{name}_tp_{weights_file_name}_vc_{i}.sv")
+        module_name = f"{name}_tp_{weights_file_name}_vc_{i}"
+        with open(file_name, "w") as f:
+            f.write(
+                generate_module(
+                    module_name=module_name,
+                    L=L,
+                    value_range=config.value_range,
+                    WW=WW,
+                ).generate()
+            )
+        return i  # 返回任务ID以显示进度
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+
+
+def run(name: str, config: CFG):
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file}")
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+        matrixs = np.transpose(matrixs, (1, 0, 2))
+    VN, L, H = matrixs.shape
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        futures = [
+            executor.submit(
+                process_task, i, name, weights_file_name, matrixs[i], H, L, config
+            )
+            for i in range(VN)
+        ]
+        for future in tqdm(as_completed(futures), total=VN):
+            try:
+                result = future.result()
+            except Exception as e:
+                print(f"Generating {result} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print("Files generated in", file_dir)
--- a/hllm/origin/generate_mid_wrapper.py
+++ b/hllm/origin/generate_mid_wrapper.py
+# %%
+import sys
+import numpy as np
+from pyrilog import (
+    VerilogGenerator,
+    ModuleBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_assign,
+    add_wire,
+    add_instance,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+from hllm.log import TCL, TCL_dependency
+from hllm.utils import calculate_WW
+
+
+# %%
+def generate_module(
+    cur_GP=0,
+    module_name="",
+    H=16,
+    L=5,
+    VN=512,
+    value_range=[-1, 1],
+    weights_file_name=None,
+    config: CFG = None,
+    ww_list=None,
+):
+    tcl = TCL(config, weights_file_name)
+    tcl.set_vlist(f"VLIST_tp_{weights_file_name}_gp_{cur_GP}")
+    with ModuleBlock(module_name) as module:
+        GN = config.group_number
+        GP = int(VN / GN)
+        # 参数
+        add_parameter("H", H)
+        add_parameter("L", L)
+        add_parameter("VN", GN)
+        # 输入输出
+        add_input("clk")
+        add_input("tree_rstn")
+        add_input("valid")
+        add_input("CST_LOW")
+        add_input("LM_sel", "L")
+        add_input("SW_in", "H")
+        for i in range(len(value_range)):
+            add_output(name=f"WT_{i}_out_S", height="VN")
+            add_output(name=f"WT_{i}_out_C", height="VN")
+
+        # 内部连线
+        for i in range(GN):
+            sw_ports = {
+                "clk": "clk",
+                "tree_rstn": "tree_rstn",
+                "valid": "valid",
+                "CST_LOW": "CST_LOW",
+                "SW_in": "SW_in",
+                "LM_sel": "LM_sel",
+            }
+            for j in range(len(value_range)):
+                sw_ports[f"WT_{j}_out_S"] = f"WT_{j}_out_S[{i}]"
+                sw_ports[f"WT_{j}_out_C"] = f"WT_{j}_out_C[{i}]"
+            add_instance(
+                f"Sub_wrapper_tp_{weights_file_name}_vc_{cur_GP*GN+i}",
+                f"Sub_wrapper_{cur_GP*GN+i}",
+                None,
+                sw_ports,
+            )
+            tcl.add_dependency(
+                f"HN",
+                f"HN_tp_{weights_file_name}_vc_{cur_GP*GN+i}.sv",
+            )
+            tcl.add_dependency(
+                f"WT_group",
+                f"WT_group_tp_{weights_file_name}_vc_{cur_GP*GN+i}.sv",
+            )
+            tcl.add_dependency(
+                f"Layer_mux",
+                f"Layer_mux_tp_{weights_file_name}_vc_{cur_GP*GN+i}.sv",
+            )
+            tcl.add_dependency(
+                f"Sub_wrapper",
+                f"Sub_wrapper_tp_{weights_file_name}_vc_{cur_GP*GN+i}.sv",
+            )
+            tcl.add_dependency(
+                f"Mid_wrapper",
+                f"Mid_wrapper_tp_{weights_file_name}_gp_{cur_GP}.sv",
+            )
+            tcl.add_dependency(
+                f"FSM",
+                f"FSM_tp_{weights_file_name}_gp_{cur_GP}.sv",
+            )
+            for line in ww_list[cur_GP * GN + i]:
+                tcl.add_dependency(
+                    f"SerialWallaceTree",
+                    f"SerialWallaceTree{line}Input.v",
+                    use_weights=False,
+                )
+    return module, tcl
+
+
+# %%
+def process_task(i, name, weights_file_name, ww_list, H, L, VN, config: CFG):
+    try:
+        file_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(file_dir, exist_ok=True)
+        file_name = os.path.join(file_dir, f"{name}_tp_{weights_file_name}_gp_{i}.sv")
+        file_name_tcl = os.path.join(
+            file_dir, f"{name}_tp_{weights_file_name}_gp_{i}.tcl"
+        )
+        module_name = f"{name}_tp_{weights_file_name}_gp_{i}"
+        module, tcl = generate_module(
+            i,
+            module_name=module_name,
+            H=H,
+            L=L,
+            VN=VN,
+            value_range=config.value_range,
+            weights_file_name=weights_file_name,
+            config=config,
+            ww_list=ww_list,
+        )
+        with open(file_name, "w") as f:
+            f.write(module.generate())
+        with open(file_name_tcl, "w") as f:
+            f.write(tcl.generate())
+        return i  # 返回任务ID以显示进度
+    except Exception as e:
+        print(
+            f"Generating {i} failed with an error at line {sys.exc_info()[2].tb_lineno}: {e}"
+        )
+        return None
+
+
+def run(name: str, config: CFG):
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file}")
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+        matrixs = np.transpose(matrixs, (1, 0, 2))
+    VN, L, H = matrixs.shape
+    GP = int(VN / config.group_number)
+
+    ww_list = []
+    ww_files = [
+        os.path.join(
+            config.output_dir,
+            "WW",
+            weights_file_name,
+            f"WW_tp_{weights_file_name}_vc_{i}.txt",
+        )
+        for i in range(VN)
+    ]
+    for ww_file in ww_files:
+        with open(ww_file, "r") as f:
+            ww = []
+            for line in f:
+                ww.append(int(line.strip()))
+            ww_list.append(ww)
+
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        futures = [
+            executor.submit(
+                process_task, i, name, weights_file_name, ww_list, H, L, VN, config
+            )
+            for i in range(GP)
+        ]
+        for future in tqdm(as_completed(futures), total=GP):
+            try:
+                result = future.result()
+            except Exception as e:
+                print(f"Generating {result} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print("Files generated in", file_dir)
--- a/hllm/origin/generate_sub_wrapper.py
+++ b/hllm/origin/generate_sub_wrapper.py
+# %%
+import numpy as np
+from pyrilog import (
+    ModuleBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_assign,
+    add_wire,
+    add_instance,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+from hllm.utils import calculate_WW
+
+
+# %%
+def generate_module(
+    matrix,
+    module_name,
+    H=16,
+    L=5,
+    value_range=[-1, 1],
+    WW=[8, 8],
+    weights_file_name=None,
+    config: CFG = None,
+    VN_index=1,
+):
+    module_name_suffix = f"_tp_{weights_file_name}_vc_{VN_index}"
+    with ModuleBlock(module_name) as module:
+        # 参数
+        add_parameter("H", H)
+        add_parameter("L", L)
+        for i in range(len(value_range)):
+            add_parameter(f"WW_{i}", WW[i])
+        # 输入输出
+        add_input("clk")
+        add_input("tree_rstn")
+        add_input("valid")
+        add_input("CST_LOW")
+        add_input("LM_sel", "L")
+        add_input("SW_in", "H")
+        for i in range(len(value_range)):
+            add_output(
+                name=f"WT_{i}_out_S",
+            )
+            add_output(
+                name=f"WT_{i}_out_C",
+            )
+        # 内部连线
+        # add_wire("CST_LOW")
+        # add_assign("CST_LOW", [], 0, [])
+        for i in range(len(value_range)):
+            add_wire(name=f"HN_out_{i}", width=f"WW_{i}", height="L")
+            add_wire(
+                name=f"LM_out_{i}",
+                width=f"WW_{i}",
+            )
+
+        # 实例化HN
+        hn_params = {
+            "H": "H",
+            "L": "L",
+        }
+        for i in range(len(value_range)):
+            hn_params[f"WW_{i}"] = f"WW_{i}"
+        hn_ports = {
+            "HN_in": "SW_in",
+            "CST_LOW": "CST_LOW",
+        }
+        for i in range(len(value_range)):
+            hn_ports[f"HN_out_{i}"] = f"HN_out_{i}"
+        add_instance("HN" + module_name_suffix, "HN", hn_params, hn_ports)
+
+        # 实例化LM
+        lm_params = {
+            "L": L,
+        }
+        for i in range(len(value_range)):
+            lm_params[f"WW_{i}"] = f"WW_{i}"
+        lm_ports = {
+            "LM_sel": "LM_sel",
+        }
+        for i in range(len(value_range)):
+            lm_ports[f"LM_in_{i}"] = f"HN_out_{i}"
+            lm_ports[f"LM_out_{i}"] = f"LM_out_{i}"
+        add_instance("Layer_mux" + module_name_suffix, "Layer_mux", lm_params, lm_ports)
+
+        # 实例化WT
+        wt_params = {}
+        for i in range(len(value_range)):
+            wt_params[f"WW_{i}"] = f"WW_{i}"
+        wt_ports = {"clk": "clk", "tree_rstn": "tree_rstn", "valid": "valid"}
+        for i in range(len(value_range)):
+            wt_ports[f"WT_{i}_in"] = f"LM_out_{i}"
+            wt_ports[f"WT_{i}_out_S"] = f"WT_{i}_out_S"
+            wt_ports[f"WT_{i}_out_C"] = f"WT_{i}_out_C"
+        add_instance("WT_group" + module_name_suffix, "WT_group", wt_params, wt_ports)
+    return module
+
+
+# %%
+def process_task(i, name, weights_file_name, matrix, H, L, config):
+    try:
+        WW = calculate_WW(matrix, config.value_range)
+        file_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(file_dir, exist_ok=True)
+        file_name = os.path.join(file_dir, f"{name}_tp_{weights_file_name}_vc_{i}.sv")
+        module_name = f"{name}_tp_{weights_file_name}_vc_{i}"
+        with open(file_name, "w") as f:
+            f.write(
+                generate_module(
+                    matrix,
+                    module_name=module_name,
+                    H=H,
+                    L=L,
+                    value_range=config.value_range,
+                    WW=WW,
+                    weights_file_name=weights_file_name,
+                    config=config,
+                    VN_index=i,
+                ).generate()
+            )
+        return i  # 返回任务ID以显示进度
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+
+
+def run(name: str, config: CFG):
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file}")
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+        matrixs = np.transpose(matrixs, (1, 0, 2))
+    VN, L, H = matrixs.shape
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        futures = [
+            executor.submit(
+                process_task, i, name, weights_file_name, matrixs[i], H, L, config
+            )
+            for i in range(VN)
+        ]
+        for future in tqdm(as_completed(futures), total=VN):
+            try:
+                result = future.result()
+            except Exception as e:
+                print(f"Generating {result} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name)
+    print("Files generated in", file_dir)
--- a/hllm/origin/generate_wallace.py
+++ b/hllm/origin/generate_wallace.py
+import argparse
+import sys
+import math
+import os
+
+from tqdm import tqdm
+
+from hllm.config import CFG
+
+
+def gen_fulladder():
+    code = """module FullAdder(
+    input A,    // First input bit
+    input B,    // Second input bit
+    input Cin,  // Carry input bit
+    output S,   // Sum output bit
+    output Cout // Carry output bit
+);
+
+assign S = A ^ B ^ Cin;
+assign Cout = (A & B) | (B & Cin) | (A & Cin);
+
+endmodule
+
+"""
+    return code
+
+
+def gen_wallace_tree_config(num_addends):
+    full_adder_list = []
+    remainder_list = []
+    total_input_list = []
+    while num_addends > 2:
+        full_adders_used = num_addends // 3
+        remaining_addends = num_addends % 3
+
+        full_adder_list.append(full_adders_used)
+        remainder_list.append(remaining_addends)
+        total_input_list.append(num_addends)
+
+        num_addends = full_adders_used * 2 + remaining_addends
+
+    return full_adder_list, remainder_list, total_input_list
+
+
+def gen_wallacetree(num_addends, full_adder_list, remainder_list, total_input_list):
+    """Return the Verilog source of a ``WallaceTree{num_addends}Input`` module.
+
+    The three lists are walked in lockstep, one entry per tree level ``i``:
+
+    * ``full_adder_list[i]``  -- width of the ``FullAdder`` instance array
+      ``L{i}_adders``.
+    * ``remainder_list[i]``   -- number of top bits of ``L{i}_all_inputs`` that
+      are sliced into ``L{i}_remainder`` and concatenated into the next
+      level's inputs.
+    * ``total_input_list[i]`` -- declared width of ``L{i}_all_inputs``.
+
+    Every level except the last exposes ``L{i}_Cout`` as a module output and
+    takes ``L{i+1}_Cin`` as a module input; the last level drives
+    ``final_Cout`` / ``final_S`` directly.
+    """
+    code = ""
+
+    # Port list: one Cout output / Cin input pair per non-final level.
+    cout_cin_code = ""
+    for i, full_adder_count in enumerate(full_adder_list):
+        if i != len(full_adder_list) - 1:  # the final bit we will manually manage
+            cout_cin_code += f"    output [{full_adder_count} - 1 : 0] L{i}_Cout,\n"
+            cout_cin_code += f"    input [{full_adder_count} - 1 : 0] L{i+1}_Cin,\n"
+
+    module_head_code = f"""module WallaceTree{num_addends}Input(
+    input [{num_addends} - 1 : 0] addends,
+{cout_cin_code}
+    output final_Cout,
+    output final_S
+);
+"""
+    code += module_head_code
+
+    # Body: one block of wires plus a FullAdder instance array per level.
+    for i, (full_adder_count, remainder_count, total_input_count) in enumerate(
+        zip(full_adder_list, remainder_list, total_input_list)
+    ):
+        code += f"    wire [{total_input_count} - 1 : 0] L{i}_all_inputs;\n"
+        if i == 0:
+            # Level 0 consumes the module's addends unchanged.
+            code += f"    assign L{i}_all_inputs = addends;\n"
+        else:
+            # Later levels take the previous sums, this level's carry-ins and,
+            # if the previous level had any, its leftover (remainder) bits.
+            last_remainder_count = remainder_list[i - 1]
+            if last_remainder_count == 0:
+                concat_code = f"{{L{i-1}_S, L{i}_Cin}}"
+            else:
+                concat_code = f"{{L{i-1}_S, L{i}_Cin, L{i-1}_remainder}}"
+
+            code += f"    assign L{i}_all_inputs = {concat_code};\n"
+
+        if remainder_count != 0:
+            # The remainder is the top `remainder_count` bits of this level's inputs.
+            code += f"    wire [{remainder_count} - 1 : 0] L{i}_remainder;\n"
+            code += f"    assign L{i}_remainder = L{i}_all_inputs[{total_input_count} - 1 : {total_input_count} - {remainder_count}];\n"
+
+        if i != len(full_adder_list) - 1:  # otherwise directly assign to output pin
+            code += f"    wire [{full_adder_count} - 1 : 0] L{i}_S;\n"
+
+        # The last level is wired straight to the module's final outputs.
+        cout_code = f"L{i}_Cout" if i != len(full_adder_list) - 1 else "final_Cout"
+        S_code = f"L{i}_S" if i != len(full_adder_list) - 1 else "final_S"
+        code += f"""\
+    FullAdder L{i}_adders [{full_adder_count} - 1 : 0](
+        .A(L{i}_all_inputs[{full_adder_count} * 3 - 1 : {full_adder_count} * 2]),
+        .B(L{i}_all_inputs[{full_adder_count} * 2 - 1 : {full_adder_count}]),
+        .Cin(L{i}_all_inputs[{full_adder_count} - 1 : 0]),
+        .Cout({cout_code}),
+        .S({S_code})
+    );
+
+
+"""
+
+    code += "endmodule\n\n"
+
+    return code
+
+
+def gen_serialwallacetree(num_addends, full_adder_list):
+    """Return the Verilog source of a ``SerialWallaceTree{num_addends}Input`` module.
+
+    The generated module instantiates ``WallaceTree{num_addends}Input`` and, for
+    every level except the last, places a register between ``L{i}_Cout`` and
+    ``L{i+1}_Cin``.  ``out_S`` / ``out_Cout`` are the tree's final outputs ANDed
+    with ``valid``.  The registers are cleared while ``rstn`` is low and
+    otherwise load ``L{i}_Cout`` masked with ``valid`` on each rising clock edge.
+
+    ``full_adder_list[i]`` is used as the bit width of level ``i``'s carry
+    signals.
+    """
+    code = ""
+
+    code += f"""module SerialWallaceTree{num_addends}Input(
+    input clk,
+    input rstn,
+    input valid,
+    
+    input [{num_addends} - 1 : 0] addends,
+
+    output out_S,
+    output out_Cout
+);
+
+"""
+
+    # Declarations: carry wires and the register that links level i to level i+1.
+    for i, full_adder_count in enumerate(full_adder_list):
+        if i != len(full_adder_list) - 1:
+            code += f"    wire [{full_adder_count} - 1 : 0] L{i}_Cout;\n"
+            code += f"    wire [{full_adder_count} - 1 : 0] L{i+1}_Cin;\n"
+            code += f"    reg [{full_adder_count} - 1 : 0] L{i}_Cout_L{i+1}_Cin_reg;\n"
+            code += f"    assign  L{i+1}_Cin = L{i}_Cout_L{i+1}_Cin_reg;\n\n"
+
+    # Named port connections for the carry pins of the inner WallaceTree instance.
+    cin_cout_assign_code = ""
+    for i, full_adder_count in enumerate(full_adder_list):
+        if i != len(full_adder_list) - 1:
+            cin_cout_assign_code += f"        .L{i}_Cout(L{i}_Cout),\n"
+            cin_cout_assign_code += f"        .L{i+1}_Cin(L{i+1}_Cin),\n"
+
+    # Outputs are gated with `valid`.
+    code += "    wire final_S, final_Cout;\n"
+    code += "    assign out_S = final_S & valid;\n"
+    code += "    assign out_Cout = final_Cout & valid;\n"
+    code += f"""\
+    WallaceTree{num_addends}Input u_WallaceTree{num_addends}Input(
+        .addends(addends),
+
+{cin_cout_assign_code}
+        .final_S(final_S),
+        .final_Cout(final_Cout)
+    );
+"""
+
+    # Statements for the two branches of the register always-block.
+    reset_code = ""
+    reg_assign_code = ""
+    for i, full_adder_count in enumerate(full_adder_list):
+        if i != len(full_adder_list) - 1:
+            reset_code += (
+                f"            L{i}_Cout_L{i+1}_Cin_reg <= {full_adder_count}'b0;\n"
+            )
+            # `valid` is replicated to the carry width and ANDed bitwise with Cout.
+            reg_assign_code += f"            L{i}_Cout_L{i+1}_Cin_reg <= L{i}_Cout&{{{full_adder_count}{{valid}}}};\n"
+
+    code += f"""\
+    always @ (posedge clk or negedge rstn) begin
+        if (!rstn) begin
+{reset_code}
+        end
+        else begin
+{reg_assign_code}
+        end
+    end
+"""
+
+    code += "endmodule\n\n"
+
+    return code
+
+
+def run(name: str, config: CFG):
+    """Write ``SerialWallaceTree{i}Input.v`` for every input count 1..3999.
+
+    Each file holds the combinational ``WallaceTree{i}Input`` module followed
+    by its ``SerialWallaceTree{i}Input`` wrapper.  The shared ``FullAdder``
+    module is emitted only into the ``i == 1`` file.
+
+    Args:
+        name: Sub-directory of ``config.output_dir`` that receives the files.
+        config: Run configuration; only ``output_dir`` is read here.
+    """
+    file_dir = os.path.join(config.output_dir, name)
+    # Loop-invariant work: create the target directory and render the
+    # FullAdder source once instead of on each of the 3999 iterations.
+    # NOTE(review): assumes gen_fulladder() has no side effects -- confirm.
+    os.makedirs(file_dir, exist_ok=True)
+    full_adder_code = gen_fulladder()
+
+    for i in tqdm(range(1, 4000)):
+        # Per-level adder / remainder / input counts for an i-input tree.
+        full_adder_list, remainder_list, total_input_list = gen_wallace_tree_config(i)
+
+        wallace_tree_code = gen_wallacetree(
+            i, full_adder_list, remainder_list, total_input_list
+        )
+        serial_wallace_tree_code = gen_serialwallacetree(i, full_adder_list)
+
+        output_code = wallace_tree_code + serial_wallace_tree_code
+        if i == 1:
+            # Only the first file carries the FullAdder definition.
+            output_code = full_adder_code + output_code
+
+        output_filename = os.path.join(file_dir, f"SerialWallaceTree{i}Input.v")
+        with open(output_filename, "w") as file:
+            file.write(output_code)
+    print("Files generated in", file_dir)
--- a/hllm/origin/generate_wrappers.py
+++ b/hllm/origin/generate_wrappers.py
+# %%
+import sys
+import numpy as np
+from pyrilog import (
+    VerilogGenerator,
+    ModuleBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_assign,
+    add_wire,
+    add_instance,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+
+from hllm.utils import calculate_WW
+
+
+# %%
+def generate_module(
+    module_name,
+    H=16,
+    L=5,
+    VN=512,
+    value_range=(-1, 1),
+    weights_file_name=None,
+    config: CFG = None,
+):
+    """Build the top-level wrapper module that instantiates every mid wrapper.
+
+    The module declares ``H``, ``L`` and ``VN`` parameters, the shared control
+    inputs, and one ``WT_{j}_out_S`` / ``WT_{j}_out_C`` output pair (``VN`` wide)
+    per entry of ``value_range``.  It then instantiates ``VN // group_number``
+    modules named ``Mid_wrapper_tp_{weights_file_name}_gp_{i}``, each wired to a
+    ``group_number``-wide slice of every output.
+
+    Args:
+        module_name: Name of the generated module.
+        H, L, VN: Values of the module parameters of the same names.
+        value_range: Sequence whose length sets the number of output pairs.
+        weights_file_name: Embedded in the instantiated module names.
+        config: Must provide ``group_number``.
+
+    Returns:
+        The ``ModuleBlock`` describing the module.
+
+    Raises:
+        ValueError: If ``config`` is ``None``.
+    """
+    if config is None:
+        # The None default cannot work: group_number is read unconditionally.
+        raise ValueError("generate_module requires a config with group_number")
+    GN = config.group_number
+    # Integer division; when VN is not a multiple of GN the top VN % GN output
+    # bits are not connected to any instance.
+    GP = VN // GN
+    value_count = len(value_range)
+
+    # Connections shared by every mid-wrapper instance.
+    shared_ports = {
+        "clk": "clk",
+        "tree_rstn": "tree_rstn",
+        "valid": "valid",
+        "CST_LOW": "CST_LOW",
+        "SW_in": "SW_in",
+        "LM_sel": "LM_sel",
+    }
+
+    with ModuleBlock(module_name) as module:
+        # Parameters
+        add_parameter("H", H)
+        add_parameter("L", L)
+        add_parameter("VN", VN)
+        # Inputs and outputs
+        add_input("clk")
+        add_input("tree_rstn")
+        add_input("valid")
+        add_input("CST_LOW")
+        add_input("LM_sel", "L")
+        add_input("SW_in", "H")
+        for j in range(value_count):
+            add_output(name=f"WT_{j}_out_S", height="VN")
+            add_output(name=f"WT_{j}_out_C", height="VN")
+
+        # Internal wiring: one mid-wrapper instance per group of GN outputs.
+        for i in range(GP):
+            sw_ports = dict(shared_ports)
+            low = i * GN
+            high = low + GN - 1
+            for j in range(value_count):
+                sw_ports[f"WT_{j}_out_S"] = f"WT_{j}_out_S[{high}:{low}]"
+                sw_ports[f"WT_{j}_out_C"] = f"WT_{j}_out_C[{high}:{low}]"
+            add_instance(
+                f"Mid_wrapper_tp_{weights_file_name}_gp_{i}",
+                f"Mid_wrapper_{i}",
+                None,
+                sw_ports,
+            )
+    return module
+
+
+# %%
+def process_task(i, name, weights_file_name, H, L, VN, config: CFG):
+    """Render the top-level wrapper module and write it to a ``.sv`` file.
+
+    The file is written to ``<config.output_dir>/<name>/`` and named after the
+    module, ``<name>_tp_<weights_file_name>``.
+
+    Returns:
+        ``i`` on success; on any exception the error is printed and ``None``
+        is returned.
+    """
+    try:
+        target_dir = os.path.join(config.output_dir, name)
+        os.makedirs(target_dir, exist_ok=True)
+        module_name = f"{name}_tp_{weights_file_name}"
+        target_path = os.path.join(target_dir, module_name + ".sv")
+        with open(target_path, "w") as sv_file:
+            # Rendering happens after the file has been opened.
+            module = generate_module(
+                module_name,
+                H=H,
+                L=L,
+                VN=VN,
+                value_range=config.value_range,
+                weights_file_name=weights_file_name,
+                config=config,
+            )
+            sv_file.write(module.generate())
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+    # Hand the task id back so the caller can track progress.
+    return i
+
+
+def run(name: str, config: CFG):
+    """Generate the single top-level wrapper file for the configured weights.
+
+    The weights pickle is loaded only to obtain its dimensions: after swapping
+    the first two axes its shape is read as ``(VN, L, H)``.  One
+    ``process_task`` job is then submitted to a process pool.
+
+    Args:
+        name: Output sub-directory and module-name prefix.
+        config: Provides ``weights_dir``, ``run_weights``, ``num_workers`` and
+            ``output_dir``.
+    """
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file}")
+    # NOTE: pickle.load can execute arbitrary code; only use trusted weight files.
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+    # The file is closed before the pool starts; only the shape is needed.
+    VN, L, H = np.transpose(matrixs, (1, 0, 2)).shape
+
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        # Map each future to its task id so a failure can be reported by id.
+        # (The previous code printed `result`, which is unbound when
+        # future.result() raises, turning any failure into a NameError.)
+        futures = {
+            executor.submit(
+                process_task,
+                i,
+                name,
+                weights_file_name,
+                H,
+                L,
+                VN,
+                config,
+            ): i
+            for i in range(1)
+        }
+        for future in tqdm(as_completed(futures), total=len(futures)):
+            task_id = futures[future]
+            try:
+                future.result()
+            except Exception as e:
+                print(f"Generating {task_id} failed with an error: {e}")
+    # process_task writes directly into <output_dir>/<name>.
+    file_dir = os.path.join(config.output_dir, name)
+    print("Files generated in", file_dir)
--- a/hllm/origin/generate_wt_group.py
+++ b/hllm/origin/generate_wt_group.py
+# %%
+import sys
+import numpy as np
+from pyrilog import (
+    VerilogGenerator,
+    ModuleBlock,
+    GenerateBlock,
+    ForBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_genvar,
+    add_assign,
+    add_wire,
+    add_body,
+    add_instance,
+    add_newline,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from tqdm import tqdm
+from hllm.config import CFG
+import pickle
+from hllm.utils import calculate_WW, find_index
+
+
+def generate_module(
+    matrix,
+    module_name="",
+    H=16,
+    L=5,
+    value_range=[-1, 1],
+    WW=[8, 8],
+):
+    """Build a module holding one serial Wallace tree per ``value_range`` entry.
+
+    For each index ``k`` the module gets a ``WW_{k}`` parameter (value
+    ``WW[k]``), a ``WT_{k}_in`` input of that width, ``WT_{k}_out_S`` /
+    ``WT_{k}_out_C`` outputs, and an instance of
+    ``SerialWallaceTree{WW[k]}Input`` connecting them.  ``matrix``, ``H`` and
+    ``L`` are accepted but not used.
+
+    Returns:
+        The ``ModuleBlock`` describing the module.
+    """
+    value_indices = range(len(value_range))
+    with ModuleBlock(f"{module_name}") as module:
+        # Parameters: one input width per value.
+        for k in value_indices:
+            add_parameter(f"WW_{k}", WW[k])
+
+        # Inputs and outputs.
+        for control in ("clk", "tree_rstn", "valid"):
+            add_input(control)
+        for k in value_indices:
+            add_input(f"WT_{k}_in", f"WW_{k}")
+            add_output(f"WT_{k}_out_S")
+            add_output(f"WT_{k}_out_C")
+
+        # Internal wiring: the Wallace tree instances.
+        for k in value_indices:
+            tree_module = f"SerialWallaceTree{WW[k]}Input"
+            tree_ports = dict(
+                clk="clk",
+                rstn="tree_rstn",
+                valid="valid",
+                addends=f"WT_{k}_in",
+                out_S=f"WT_{k}_out_S",
+                out_Cout=f"WT_{k}_out_C",
+            )
+            add_instance(tree_module, f"serial_wallace_tree_{k}", {}, tree_ports)
+
+    return module
+
+
+def process_task(i, name, weights_file_name, matrix, H, L, config: CFG):
+    """Write the Wallace-tree group module for vector ``i`` to a ``.sv`` file.
+
+    Tree widths come from ``calculate_WW(matrix, config.value_range)``.  The
+    file is ``<config.output_dir>/<name>/<weights_file_name>/<module>.sv`` with
+    module name ``<name>_tp_<weights_file_name>_vc_<i>``.
+
+    Returns:
+        ``i`` on success; on any exception the error is printed and ``None``
+        is returned.
+    """
+    try:
+        widths = calculate_WW(matrix, config.value_range)
+        target_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(target_dir, exist_ok=True)
+        module_name = f"{name}_tp_{weights_file_name}_vc_{i}"
+        target_path = os.path.join(target_dir, module_name + ".sv")
+        with open(target_path, "w") as sv_file:
+            # Rendering happens after the file has been opened.
+            module = generate_module(
+                matrix,
+                module_name=module_name,
+                H=H,
+                L=L,
+                value_range=config.value_range,
+                WW=widths,
+            )
+            sv_file.write(module.generate())
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+    # Hand the task id back so the caller can track progress.
+    return i
+
+
+def run(name: str, config: CFG):
+    """Generate one Wallace-tree group module per vector of the weights file.
+
+    The pickled weights have their first two axes swapped and the resulting
+    shape is read as ``(VN, L, H)``; slice ``matrixs[i]`` is handed to
+    ``process_task`` for each ``i`` in ``range(VN)`` via a process pool.
+
+    Args:
+        name: Output sub-directory and module-name prefix.
+        config: Provides ``weights_dir``, ``run_weights``, ``num_workers`` and
+            ``output_dir``.
+    """
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file}")
+    # NOTE: pickle.load can execute arbitrary code; only use trusted weight files.
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+    # The file is closed before the pool starts.
+    matrixs = np.transpose(matrixs, (1, 0, 2))
+    VN, L, H = matrixs.shape
+
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        # Map each future to its task id so a failure can be reported by id.
+        # (The previous code printed `result`, which is unbound when
+        # future.result() raises, turning any failure into a NameError.)
+        futures = {
+            executor.submit(
+                process_task, i, name, weights_file_name, matrixs[i], H, L, config
+            ): i
+            for i in range(VN)
+        }
+        for future in tqdm(as_completed(futures), total=len(futures)):
+            task_id = futures[future]
+            try:
+                future.result()
+            except Exception as e:
+                print(f"Generating {task_id} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print("Files generated in", file_dir)
--- a/hllm/origin/generate_ww.py
+++ b/hllm/origin/generate_ww.py
+import sys
+import numpy as np
+from pyrilog import (
+    VerilogGenerator,
+    ModuleBlock,
+    add_parameter,
+    add_input,
+    add_output,
+    add_assign,
+    add_wire,
+    add_instance,
+)
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import os
+from hllm.config import CFG
+from tqdm import tqdm
+import pickle
+from hllm.log import TCL
+from hllm.utils import calculate_WW
+
+
+def process_task(i, name, weights_file_name, matrix, H, L, config: CFG = None):
+    """Write the ``calculate_WW`` counts for vector ``i`` to a text file.
+
+    The file is
+    ``<config.output_dir>/<name>/<weights_file_name>/<name>_tp_<weights_file_name>_vc_<i>.txt``
+    and holds one count per line, in ``config.value_range`` order.  ``H`` and
+    ``L`` are accepted but not used.
+
+    Returns:
+        ``i`` on success; on any exception the error is printed and ``None``
+        is returned.
+    """
+    try:
+        counts = calculate_WW(matrix, config.value_range)
+        target_dir = os.path.join(config.output_dir, name, weights_file_name)
+        os.makedirs(target_dir, exist_ok=True)
+        target_path = os.path.join(
+            target_dir, f"{name}_tp_{weights_file_name}_vc_{i}.txt"
+        )
+        with open(target_path, "w") as txt_file:
+            txt_file.writelines(str(count) + "\n" for count in counts)
+    except Exception as e:
+        print(f"Generating {i} failed with an error: {e}")
+        return None
+    # Hand the task id back so the caller can track progress.
+    return i
+
+
+def run(name: str, config: CFG):
+    """Write one ``calculate_WW`` count file per vector of the weights file.
+
+    The pickled weights have their first two axes swapped and the resulting
+    shape is read as ``(VN, L, H)``; slice ``matrixs[i]`` is handed to
+    ``process_task`` for each ``i`` in ``range(VN)`` via a process pool.
+
+    Args:
+        name: Output sub-directory and file-name prefix.
+        config: Provides ``weights_dir``, ``run_weights``, ``num_workers`` and
+            ``output_dir``.
+    """
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    print(f"Processing {weights_file}")
+    # NOTE: pickle.load can execute arbitrary code; only use trusted weight files.
+    with open(weights_file, "rb") as f:
+        matrixs = pickle.load(f)
+    matrixs = np.transpose(matrixs, (1, 0, 2))
+    VN, L, H = matrixs.shape
+
+    with ProcessPoolExecutor(max_workers=config.num_workers) as executor:
+        # Map each future to its task id so a failure can be reported by id.
+        # (The previous code printed `result`, which is unbound when
+        # future.result() raises, turning any failure into a NameError.)
+        futures = {
+            executor.submit(
+                process_task, i, name, weights_file_name, matrixs[i], H, L, config
+            ): i
+            for i in range(VN)
+        }
+        for future in tqdm(as_completed(futures), total=len(futures)):
+            task_id = futures[future]
+            try:
+                future.result()
+            except Exception as e:
+                print(f"Generating {task_id} failed with an error: {e}")
+    file_dir = os.path.join(config.output_dir, name, weights_file_name)
+    print("Files generated in", file_dir)
--- a/hllm/utils.py
+++ b/hllm/utils.py
+import numpy as np
+import os
+import pickle
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from tqdm import tqdm
+from hllm.config import CFG
+
+
+def calculate_WW(matrix: np.array, value_range):
+    """Return, per value, the largest number of matches found in any one row.
+
+    An entry ``x`` matches ``value`` when ``abs(x - value) <= 0.01``.
+
+    Args:
+        matrix: Iterable of rows (e.g. a 2-D array); it is traversed once per
+            value, so it must be re-iterable.
+        value_range: Values to count.
+
+    Returns:
+        A list with one count per entry of ``value_range``, in the same order.
+        A matrix with no rows yields ``0`` for every value instead of raising.
+    """
+    return [
+        max(
+            (sum(1 for x in row if abs(x - value) <= 0.01) for row in matrix),
+            default=0,  # no rows: nothing to count
+        )
+        for value in value_range
+    ]
+
+
+def find_index(arr, target, epsilon=1e-3):
+    """Return the index of the entry of ``arr`` closest to ``target``.
+
+    Args:
+        arr: Array-like of numbers.
+        target: Value to look for.
+        epsilon: The closest entry must differ from ``target`` by strictly
+            less than this amount.
+
+    Returns:
+        Index (along the first axis) of the first entry with the smallest
+        absolute difference to ``target``.
+
+    Raises:
+        ValueError: If ``arr`` is empty or no entry is within ``epsilon``.
+    """
+    arr = np.asarray(arr)
+    if arr.size == 0:
+        # np.min/np.argmin raise an opaque reduction error on empty input.
+        raise ValueError("No match found")
+    diff = np.abs(arr - target)
+    # argmin returns the first minimum in flattened order: one pass instead
+    # of a min() followed by an equality search.
+    flat_best = np.argmin(diff)
+    if diff.flat[flat_best] < epsilon:
+        return np.unravel_index(flat_best, diff.shape)[0]
+    raise ValueError("No match found")
--- a/hllm/verify/__init__.py
+++ b/hllm/verify/__init__.py
--- a/hllm/verify/generate_activation.py
+++ b/hllm/verify/generate_activation.py
+import sys
+import numpy as np
+import pickle
+import os
+from tqdm import tqdm
+from hllm.config import CFG
+
+
+def to_8bit_binary(val):
+    """Format ``val`` as an 8-character binary string.
+
+    Negative values have ``1 << 8`` added before formatting (two's-complement
+    form); non-negative values are zero-padded to eight digits.
+    """
+    encoded = val if val >= 0 else (1 << 8) + val
+    return f"{encoded:08b}"
+
+
+def run(config: CFG):
+    """Create a random activation vector for the configured weights file.
+
+    Output goes to ``<config.output_dir>/<weights file stem>/`` as
+    ``activation.pkl`` (pickled array), ``activation.txt`` (one decimal value
+    per line) and ``activation-bin.txt`` (one 8-bit binary string per line);
+    both text files end with an extra blank line.  If ``activation.pkl``
+    already exists and ``config.verify_generate_activation_on_exist`` is
+    false, nothing is regenerated.
+    """
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    file_dir = os.path.join(config.output_dir, weights_file_name)
+    filename_pkl, filename_txt, filename_bin_txt = (
+        os.path.join(file_dir, leaf)
+        for leaf in ("activation.pkl", "activation.txt", "activation-bin.txt")
+    )
+
+    already_generated = os.path.exists(filename_pkl)
+    if already_generated and not config.verify_generate_activation_on_exist:
+        print(f"Activation file {filename_pkl} already exists")
+        return
+
+    print(f"Generating activation for {weights_file_name} in {file_dir}")
+    os.makedirs(file_dir, exist_ok=True)
+    with open(weights_file, mode="rb") as f:
+        weights = pickle.load(f)
+        # The vector length is the size of the weights' third axis.
+        length = weights.shape[2]
+    # One row of integers drawn from [-128, 128).
+    activation = np.random.randint(-128, 128, (1, length))
+    values = activation[0]
+
+    with open(filename_pkl, "wb") as f:
+        pickle.dump(activation, f)
+    with open(filename_txt, "w") as f:
+        f.writelines(f"{val}\n" for val in values)
+        f.write("\n")
+    with open(filename_bin_txt, "w") as f:
+        f.writelines(f"{to_8bit_binary(val)}\n" for val in values)
+        f.write("\n")
--- a/hllm/verify/verify_activation.py
+++ b/hllm/verify/verify_activation.py
+import sys
+import numpy as np
+import pickle
+import os
+from tqdm import tqdm
+from hllm.config import CFG
+
+
+def to_8bit_binary(val):
+    """Format ``val`` as an 8-character binary string.
+
+    Negative values have ``1 << 8`` added before formatting (two's-complement
+    form); non-negative values are zero-padded to eight digits.
+    """
+    encoded = val if val >= 0 else (1 << 8) + val
+    return f"{encoded:08b}"
+
+
+def get_bit(num, i):
+    """Return bit ``i`` of ``num`` (``(num >> i) & 1``), or ``0`` when ``i < 0``."""
+    return 0 if i < 0 else (num >> i) & 1
+
+
+def run(config: CFG):
+    """Compute bit-serial reference results for the stored activation vector.
+
+    Loads the weights pickle and ``activation.pkl`` from
+    ``<config.output_dir>/<weights file stem>/``.  For every layer in the
+    weights and every bit position 0..7, the activation's bit plane is
+    multiplied with the transposed layer and the first row of the product is
+    written as one space-separated line to ``result.txt`` in that directory.
+    """
+    weights_file = os.path.join(config.weights_dir, config.run_weights)
+    weights_file_name = os.path.splitext(os.path.basename(weights_file))[0]
+    file_dir = os.path.join(config.output_dir, weights_file_name)
+    print("开始进行激活测试")
+    print(f"读取权重文件{weights_file_name}")
+    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
+    with open(weights_file, "rb") as f:
+        weights = pickle.load(f)
+
+    activation_file = os.path.join(file_dir, "activation.pkl")
+    with open(activation_file, "rb") as f:
+        activation = pickle.load(f)
+
+    results_txt = os.path.join(file_dir, "result.txt")
+
+    # The bit planes depend only on the activation, so extract them once
+    # instead of once per layer.
+    bit_planes = [get_bit(activation, i) for i in range(8)]
+
+    with open(results_txt, "w") as f:
+        for layer in weights:
+            layer_t = layer.T
+            for activation_bit in bit_planes:
+                tem = np.matmul(activation_bit, layer_t)
+                # Every value is followed by a single space, then a newline.
+                f.write("".join(f"{val} " for val in tem[0]))
+                f.write("\n")
+    print(f"结果写入{results_txt}")
--- a/main.py
+++ b/main.py
+from hllm.config import CFG
+
+
+def run_origin(config: CFG):
+    """Run every generator of the ``hllm.origin`` flow in a fixed order.
+
+    ``config.output_dir`` is overwritten with ``"outputs-qwen/origin"`` before
+    the first generator runs.  The generator modules are imported lazily,
+    inside this function.
+    """
+    from hllm.origin import (
+        generate_layer_mux,
+        generate_hn,
+        generate_mid_wrapper,
+        generate_fsm,
+        generate_sub_wrapper,
+        generate_wallace,
+        generate_wrappers,
+        generate_wt_group,
+        generate_ww,
+    )
+
+    config.output_dir = "outputs-qwen/origin"
+    # (output name, generator module), in execution order.
+    stages = (
+        ("WW", generate_ww),
+        ("Layer_mux", generate_layer_mux),
+        ("HN", generate_hn),
+        ("Mid_wrapper", generate_mid_wrapper),
+        ("FSM", generate_fsm),
+        ("Sub_wrapper", generate_sub_wrapper),
+        ("SerialWallaceTree", generate_wallace),
+        ("Wrappers", generate_wrappers),
+        ("WT_group", generate_wt_group),
+    )
+    for stage_name, generator in stages:
+        generator.run(name=stage_name, config=config)
+
+
+def run_optimized(config: CFG):
+    """Run every generator of the ``hllm.optimized`` flow in a fixed order.
+
+    ``config.output_dir`` is overwritten with ``"outputs-qwen/optimized"``
+    before the first generator runs.  The Wallace-tree generator is taken
+    from ``hllm.origin``; all others come from ``hllm.optimized``.  Modules
+    are imported lazily, inside this function.
+    """
+    from hllm.origin import generate_wallace
+    from hllm.optimized import (
+        generate_info,
+        generate_mux_wrapper,
+        generate_mux,
+        generate_sub_wrapper,
+        generate_wt_group,
+        generate_mid_wrapper,
+        generate_wrappers,
+        generate_layer_mux,
+        generate_fsm,
+    )
+
+    config.output_dir = "outputs-qwen/optimized"
+    # (output name, generator module), in execution order.
+    stages = (
+        ("info", generate_info),
+        ("Mux_wrapper", generate_mux_wrapper),
+        ("Mux", generate_mux),
+        ("Sub_wrapper", generate_sub_wrapper),
+        ("WT_group", generate_wt_group),
+        ("Mid_wrapper", generate_mid_wrapper),
+        ("SerialWallaceTree", generate_wallace),
+        ("Wrappers", generate_wrappers),
+        ("Layer_mux", generate_layer_mux),
+        ("FSM", generate_fsm),
+    )
+    for stage_name, generator in stages:
+        generator.run(name=stage_name, config=config)
+
+
+def run_weights_preprocess(config: CFG):
+    """Run the two ``hllm.eda`` weight stages: quantisation, then mapping."""
+    from hllm.eda import generate_quant_weights
+    from hllm.eda import mapping_weights as generate_mapping_weights
+
+    for stage in (generate_quant_weights, generate_mapping_weights):
+        stage.run(config=config)
+    
+
+
+def run_verify(config: CFG):
+    """Generate an activation vector, then compute its reference results.
+
+    ``config.output_dir`` is overwritten with ``"outputs-qwen/verify"`` first.
+    """
+    from hllm.verify import generate_activation
+    from hllm.verify import verify_activation
+
+    config.output_dir = "outputs-qwen/verify"
+    for stage in (generate_activation, verify_activation):
+        stage.run(config=config)
+
+
+def batch_run(config: CFG):
+    """Run the origin, optimized and verify flows for each batch entry.
+
+    ``config.run_weights`` is set to each item of ``config.run_weights_batch``
+    in turn before the three flows are run with the shared ``config``.
+    """
+    for weights_name in config.run_weights_batch:
+        config.run_weights = weights_name
+        for flow in (run_origin, run_optimized, run_verify):
+            flow(config)
+
+
+if __name__ == "__main__":
+    # Script entry point: build a CFG with its defaults and run the whole batch.
+    config = CFG()
+    # Individual flows, kept here for manual runs:
+    # run_weights_preprocess(config)
+    # run_origin(config)
+    # run_optimized(config)
+    # run_verify(config)
+    batch_run(config)
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
+from setuptools import setup, find_packages, Extension
+import pybind11
+
+# C++ extension built from hllm/optimized/turbo_optimize_hn.cpp, compiled
+# against the pybind11 headers.
+# NOTE(review): pybind11 is imported at the top of this script, so it has to be
+# installed before setup.py runs; listing it in install_requires alone does
+# not guarantee that -- confirm how the build environment provides it.
+ext_modules = [
+    Extension(
+        "hllm.optimized.turbo_optimize_hn",  # NOTE: this module path must match the package layout
+        ["hllm/optimized/turbo_optimize_hn.cpp"],
+        include_dirs=[pybind11.get_include()],
+        language="c++",
+        extra_compile_args=["-std=c++11", "-fPIC", "-O3"],
+        extra_link_args=["-static-libstdc++"],
+    ),
+]
+
+# Package metadata; find_packages() collects the packages to install.
+setup(
+    name="hllm",
+    version="0.1.0",
+    packages=find_packages(),
+    install_requires=[
+        "pybind11>=2.6.0",
+    ],
+    ext_modules=ext_modules,
+)
--- a/templates/FSM.sv
+++ b/templates/FSM.sv
--- a/templates/HN.sv
+++ b/templates/HN.sv
--- a/templates/MW_tp_k_gp_0.sv
+++ b/templates/MW_tp_k_gp_0.sv
+// Hand-written template of a mid-level wrapper: it instantiates two
+// Sub_wrapper_tp_k_vc_<n> modules and routes the 14 S/C output pairs of
+// instance n to element [n] of the matching WT_v<j>_out_S / WT_v<j>_out_C port.
+// NOTE(review): the file is MW_tp_k_gp_0.sv and the Wrappers_tp_k template
+// instantiates "Mid_wrapper_tp_k_gp_0", but this module is named
+// "Mid_wrappers_tp_k" -- confirm the intended name.
+module Mid_wrappers_tp_k #(
+    parameter H  = 16,  // these default values do not matter
+    parameter L  = 2,
+    parameter VN = 16
+) (
+    input clk,
+    input tree_rstn,
+    input valid,
+    input CST_LOW,
+    input [L - 1 : 0] LM_sel,
+    input [H - 1 : 0] SW_in,
+    output WT_v0_out_S[VN - 1 : 0],
+    output WT_v0_out_C[VN - 1 : 0],
+    output WT_v1_out_S[VN - 1 : 0],
+    output WT_v1_out_C[VN - 1 : 0],
+    output WT_v2_out_S[VN - 1 : 0],
+    output WT_v2_out_C[VN - 1 : 0],
+    output WT_v3_out_S[VN - 1 : 0],
+    output WT_v3_out_C[VN - 1 : 0],
+    output WT_v4_out_S[VN - 1 : 0],
+    output WT_v4_out_C[VN - 1 : 0],
+    output WT_v5_out_S[VN - 1 : 0],
+    output WT_v5_out_C[VN - 1 : 0],
+    output WT_v6_out_S[VN - 1 : 0],
+    output WT_v6_out_C[VN - 1 : 0],
+    output WT_v7_out_S[VN - 1 : 0],
+    output WT_v7_out_C[VN - 1 : 0],
+    output WT_v8_out_S[VN - 1 : 0],
+    output WT_v8_out_C[VN - 1 : 0],
+    output WT_v9_out_S[VN - 1 : 0],
+    output WT_v9_out_C[VN - 1 : 0],
+    output WT_v10_out_S[VN - 1 : 0],
+    output WT_v10_out_C[VN - 1 : 0],
+    output WT_v11_out_S[VN - 1 : 0],
+    output WT_v11_out_C[VN - 1 : 0],
+    output WT_v12_out_S[VN - 1 : 0],
+    output WT_v12_out_C[VN - 1 : 0],
+    output WT_v13_out_S[VN - 1 : 0],
+    output WT_v13_out_C[VN - 1 : 0]
+);
+  Sub_wrapper_tp_k_vc_0 sub_wrapper_0  // iterate this name by hand, along with the indices below
+  (
+      .clk(clk),
+      .tree_rstn(tree_rstn),
+      .valid(valid),
+      .CST_LOW(CST_LOW),
+      .LM_sel(LM_sel),
+      .SW_in(SW_in),
+      .WT_0_out_S(WT_v0_out_S[0]),
+      .WT_0_out_C(WT_v0_out_C[0]),
+      .WT_1_out_S(WT_v1_out_S[0]),
+      .WT_1_out_C(WT_v1_out_C[0]),
+      .WT_2_out_S(WT_v2_out_S[0]),
+      .WT_2_out_C(WT_v2_out_C[0]),
+      .WT_3_out_S(WT_v3_out_S[0]),
+      .WT_3_out_C(WT_v3_out_C[0]),
+      .WT_4_out_S(WT_v4_out_S[0]),
+      .WT_4_out_C(WT_v4_out_C[0]),
+      .WT_5_out_S(WT_v5_out_S[0]),
+      .WT_5_out_C(WT_v5_out_C[0]),
+      .WT_6_out_S(WT_v6_out_S[0]),
+      .WT_6_out_C(WT_v6_out_C[0]),
+      .WT_7_out_S(WT_v7_out_S[0]),
+      .WT_7_out_C(WT_v7_out_C[0]),
+      .WT_8_out_S(WT_v8_out_S[0]),
+      .WT_8_out_C(WT_v8_out_C[0]),
+      .WT_9_out_S(WT_v9_out_S[0]),
+      .WT_9_out_C(WT_v9_out_C[0]),
+      .WT_10_out_S(WT_v10_out_S[0]),
+      .WT_10_out_C(WT_v10_out_C[0]),
+      .WT_11_out_S(WT_v11_out_S[0]),
+      .WT_11_out_C(WT_v11_out_C[0]),
+      .WT_12_out_S(WT_v12_out_S[0]),
+      .WT_12_out_C(WT_v12_out_C[0]),
+      .WT_13_out_S(WT_v13_out_S[0]),
+      .WT_13_out_C(WT_v13_out_C[0])
+  );
+
+  // Second sub-wrapper: same control inputs, outputs routed to element [1].
+  Sub_wrapper_tp_k_vc_1 sub_wrapper_1 (
+      .clk(clk),
+      .tree_rstn(tree_rstn),
+      .valid(valid),
+      .CST_LOW(CST_LOW),
+      .LM_sel(LM_sel),
+      .SW_in(SW_in),
+      .WT_0_out_S(WT_v0_out_S[1]),
+      .WT_0_out_C(WT_v0_out_C[1]),
+      .WT_1_out_S(WT_v1_out_S[1]),
+      .WT_1_out_C(WT_v1_out_C[1]),
+      .WT_2_out_S(WT_v2_out_S[1]),
+      .WT_2_out_C(WT_v2_out_C[1]),
+      .WT_3_out_S(WT_v3_out_S[1]),
+      .WT_3_out_C(WT_v3_out_C[1]),
+      .WT_4_out_S(WT_v4_out_S[1]),
+      .WT_4_out_C(WT_v4_out_C[1]),
+      .WT_5_out_S(WT_v5_out_S[1]),
+      .WT_5_out_C(WT_v5_out_C[1]),
+      .WT_6_out_S(WT_v6_out_S[1]),
+      .WT_6_out_C(WT_v6_out_C[1]),
+      .WT_7_out_S(WT_v7_out_S[1]),
+      .WT_7_out_C(WT_v7_out_C[1]),
+      .WT_8_out_S(WT_v8_out_S[1]),
+      .WT_8_out_C(WT_v8_out_C[1]),
+      .WT_9_out_S(WT_v9_out_S[1]),
+      .WT_9_out_C(WT_v9_out_C[1]),
+      .WT_10_out_S(WT_v10_out_S[1]),
+      .WT_10_out_C(WT_v10_out_C[1]),
+      .WT_11_out_S(WT_v11_out_S[1]),
+      .WT_11_out_C(WT_v11_out_C[1]),
+      .WT_12_out_S(WT_v12_out_S[1]),
+      .WT_12_out_C(WT_v12_out_C[1]),
+      .WT_13_out_S(WT_v13_out_S[1]),
+      .WT_13_out_C(WT_v13_out_C[1])
+  );
+endmodule
--- a/templates/QW_FSM_tp_k_gp_0_0127.sv
+++ b/templates/QW_FSM_tp_k_gp_0_0127.sv
--- a/templates/SW.sv
+++ b/templates/SW.sv
--- a/templates/WallaceTree8Input_ser.v
+++ b/templates/WallaceTree8Input_ser.v
+// One-bit full adder: S is the XOR of the three inputs and Cout is their
+// majority (set when at least two inputs are 1).
+module FullAdder(
+    input A,    // First input bit
+    input B,    // Second input bit
+    input Cin,  // Carry input bit
+    output S,   // Sum output bit
+    output Cout // Carry output bit
+);
+
+assign S = A ^ B ^ Cin;
+assign Cout = (A & B) | (B & Cin) | (A & Cin);
+
+endmodule
+
+
+
+// Combinational 8-input Wallace tree built from four levels of FullAdder
+// arrays (2, 2, 1 and 1 adders).  Each level's carry-outs leave the module as
+// L<i>_Cout and the next level's carry-ins enter as L<i+1>_Cin, so the
+// surrounding module decides how they are connected.
+module WallaceTree8Input(
+    input [8 - 1 : 0] addends,
+    output [2 - 1 : 0] L0_Cout,
+    input [2 - 1 : 0] L1_Cin,
+    output [2 - 1 : 0] L1_Cout,
+    input [2 - 1 : 0] L2_Cin,
+    output [1 - 1 : 0] L2_Cout,
+    input [1 - 1 : 0] L3_Cin,
+    output final_Cout,
+    output final_S
+);
+    // Level 0: 8 inputs -> 2 full adders on bits [5:0]; bits [7:6] pass through.
+    wire [8 - 1 : 0] L0_all_inputs;
+    assign L0_all_inputs = addends;
+    wire [2 - 1 : 0] L0_remainder;
+    assign L0_remainder = L0_all_inputs[8 - 1 : 8 - 2];
+    wire [2 - 1 : 0] L0_S;
+    FullAdder L0_adders [2 - 1 : 0](
+        .A(L0_all_inputs[2 * 3 - 1 : 2 * 2]),
+        .B(L0_all_inputs[2 * 2 - 1 : 2]),
+        .Cin(L0_all_inputs[2 - 1 : 0]),
+        .Cout(L0_Cout),
+        .S(L0_S)
+    );
+
+
+    // Level 1: 6 inputs (L0 sums, L1 carry-ins, L0 remainder) -> 2 full adders.
+    wire [6 - 1 : 0] L1_all_inputs;
+    assign L1_all_inputs = {L0_S, L1_Cin, L0_remainder};
+    wire [2 - 1 : 0] L1_S;
+    FullAdder L1_adders [2 - 1 : 0](
+        .A(L1_all_inputs[2 * 3 - 1 : 2 * 2]),
+        .B(L1_all_inputs[2 * 2 - 1 : 2]),
+        .Cin(L1_all_inputs[2 - 1 : 0]),
+        .Cout(L1_Cout),
+        .S(L1_S)
+    );
+
+
+    // Level 2: 4 inputs (L1 sums, L2 carry-ins) -> 1 full adder; bit [3] passes through.
+    wire [4 - 1 : 0] L2_all_inputs;
+    assign L2_all_inputs = {L1_S, L2_Cin};
+    wire [1 - 1 : 0] L2_remainder;
+    assign L2_remainder = L2_all_inputs[4 - 1 : 4 - 1];
+    wire [1 - 1 : 0] L2_S;
+    FullAdder L2_adders [1 - 1 : 0](
+        .A(L2_all_inputs[1 * 3 - 1 : 1 * 2]),
+        .B(L2_all_inputs[1 * 2 - 1 : 1]),
+        .Cin(L2_all_inputs[1 - 1 : 0]),
+        .Cout(L2_Cout),
+        .S(L2_S)
+    );
+
+
+    // Level 3: 3 inputs (L2 sum, L3 carry-in, L2 remainder) -> final sum and carry.
+    wire [3 - 1 : 0] L3_all_inputs;
+    assign L3_all_inputs = {L2_S, L3_Cin, L2_remainder};
+    FullAdder L3_adders [1 - 1 : 0](
+        .A(L3_all_inputs[1 * 3 - 1 : 1 * 2]),
+        .B(L3_all_inputs[1 * 2 - 1 : 1]),
+        .Cin(L3_all_inputs[1 - 1 : 0]),
+        .Cout(final_Cout),
+        .S(final_S)
+    );
+
+
+endmodule
+
+// Registered wrapper around WallaceTree8Input: each level's carry-outs are
+// captured in a register and presented to the next level as its carry-ins.
+// The registers are cleared synchronously while rstn is low and updated only
+// when valid is high; both outputs are gated with valid.
+module SerialWallaceTree8Input(
+    input clk,
+    input rstn,
+    input valid,
+    
+    input [8 - 1 : 0] addends,
+
+    output out_S,
+    output out_Cout
+);
+
+    // Level 0 -> level 1 carry path.
+    wire [2 - 1 : 0] L0_Cout;
+    wire [2 - 1 : 0] L1_Cin;
+    reg [2 - 1 : 0] L0_Cout_L1_Cin_reg;
+    assign  L1_Cin = L0_Cout_L1_Cin_reg;
+
+    // Level 1 -> level 2 carry path.
+    wire [2 - 1 : 0] L1_Cout;
+    wire [2 - 1 : 0] L2_Cin;
+    reg [2 - 1 : 0] L1_Cout_L2_Cin_reg;
+    assign  L2_Cin = L1_Cout_L2_Cin_reg;
+
+    // Level 2 -> level 3 carry path.
+    wire [1 - 1 : 0] L2_Cout;
+    wire [1 - 1 : 0] L3_Cin;
+    reg [1 - 1 : 0] L2_Cout_L3_Cin_reg;
+    assign  L3_Cin = L2_Cout_L3_Cin_reg;
+
+    // Outputs are forced to 0 while valid is low.
+    wire final_S, final_Cout;
+    assign out_S = final_S & valid;
+    assign out_Cout = final_Cout & valid;
+    WallaceTree8Input u_WallaceTree8Input(
+        .addends(addends),
+
+        .L0_Cout(L0_Cout),
+        .L1_Cin(L1_Cin),
+        .L1_Cout(L1_Cout),
+        .L2_Cin(L2_Cin),
+        .L2_Cout(L2_Cout),
+        .L3_Cin(L3_Cin),
+
+        .final_S(final_S),
+        .final_Cout(final_Cout)
+    );
+    // Carry registers: synchronous active-low reset, load enabled by valid.
+    always @ (posedge clk) begin
+        if (!rstn) begin
+            L0_Cout_L1_Cin_reg <= 2'b0;
+            L1_Cout_L2_Cin_reg <= 2'b0;
+            L2_Cout_L3_Cin_reg <= 1'b0;
+
+        end
+        else if (valid) begin
+            L0_Cout_L1_Cin_reg <= L0_Cout;
+            L1_Cout_L2_Cin_reg <= L1_Cout;
+            L2_Cout_L3_Cin_reg <= L2_Cout;
+
+        end
+    end
+endmodule
--- a/templates/Wrappers_tp_k.sv
+++ b/templates/Wrappers_tp_k.sv
+// Hand-written template of the top-level wrapper: it instantiates two
+// Mid_wrapper_tp_k_gp_<n> modules that share all control inputs and routes
+// their 14 S/C output pairs to the WT_v<j>_out_S / WT_v<j>_out_C ports.
+module Wrappers_tp_k #(
+    parameter H  = 16,  // these default values do not matter
+    parameter L  = 2,
+    parameter VN = 2
+) (
+    input clk,
+    input tree_rstn,
+    input valid,
+    input CST_LOW,
+    input [L - 1 : 0] LM_sel,
+    input [H - 1 : 0] SW_in,
+    output WT_v0_out_S[VN - 1 : 0],
+    output WT_v0_out_C[VN - 1 : 0],
+    output WT_v1_out_S[VN - 1 : 0],
+    output WT_v1_out_C[VN - 1 : 0],
+    output WT_v2_out_S[VN - 1 : 0],
+    output WT_v2_out_C[VN - 1 : 0],
+    output WT_v3_out_S[VN - 1 : 0],
+    output WT_v3_out_C[VN - 1 : 0],
+    output WT_v4_out_S[VN - 1 : 0],
+    output WT_v4_out_C[VN - 1 : 0],
+    output WT_v5_out_S[VN - 1 : 0],
+    output WT_v5_out_C[VN - 1 : 0],
+    output WT_v6_out_S[VN - 1 : 0],
+    output WT_v6_out_C[VN - 1 : 0],
+    output WT_v7_out_S[VN - 1 : 0],
+    output WT_v7_out_C[VN - 1 : 0],
+    output WT_v8_out_S[VN - 1 : 0],
+    output WT_v8_out_C[VN - 1 : 0],
+    output WT_v9_out_S[VN - 1 : 0],
+    output WT_v9_out_C[VN - 1 : 0],
+    output WT_v10_out_S[VN - 1 : 0],
+    output WT_v10_out_C[VN - 1 : 0],
+    output WT_v11_out_S[VN - 1 : 0],
+    output WT_v11_out_C[VN - 1 : 0],
+    output WT_v12_out_S[VN - 1 : 0],
+    output WT_v12_out_C[VN - 1 : 0],
+    output WT_v13_out_S[VN - 1 : 0],
+    output WT_v13_out_C[VN - 1 : 0]
+);
+
+  Mid_wrapper_tp_k_gp_0 mid_wrapper_0  // iterate this name by hand, along with the indices below
+  (
+      .clk(clk),
+      .tree_rstn(tree_rstn),
+      .valid(valid),
+      .CST_LOW(CST_LOW),
+      .LM_sel(LM_sel),
+      .SW_in(SW_in),
+      // NOTE(review): this is the only connection using a [15:0] slice; every
+      // other port here uses a single element, and VN defaults to 2 -- confirm
+      // which form the template is meant to show.
+      .WT_0_out_S(WT_v0_out_S[15:0]),
+      .WT_0_out_C(WT_v0_out_C[0]),
+      .WT_1_out_S(WT_v1_out_S[0]),
+      .WT_1_out_C(WT_v1_out_C[0]),
+      .WT_2_out_S(WT_v2_out_S[0]),
+      .WT_2_out_C(WT_v2_out_C[0]),
+      .WT_3_out_S(WT_v3_out_S[0]),
+      .WT_3_out_C(WT_v3_out_C[0]),
+      .WT_4_out_S(WT_v4_out_S[0]),
+      .WT_4_out_C(WT_v4_out_C[0]),
+      .WT_5_out_S(WT_v5_out_S[0]),
+      .WT_5_out_C(WT_v5_out_C[0]),
+      .WT_6_out_S(WT_v6_out_S[0]),
+      .WT_6_out_C(WT_v6_out_C[0]),
+      .WT_7_out_S(WT_v7_out_S[0]),
+      .WT_7_out_C(WT_v7_out_C[0]),
+      .WT_8_out_S(WT_v8_out_S[0]),
+      .WT_8_out_C(WT_v8_out_C[0]),
+      .WT_9_out_S(WT_v9_out_S[0]),
+      .WT_9_out_C(WT_v9_out_C[0]),
+      .WT_10_out_S(WT_v10_out_S[0]),
+      .WT_10_out_C(WT_v10_out_C[0]),
+      .WT_11_out_S(WT_v11_out_S[0]),
+      .WT_11_out_C(WT_v11_out_C[0]),
+      .WT_12_out_S(WT_v12_out_S[0]),
+      .WT_12_out_C(WT_v12_out_C[0]),
+      .WT_13_out_S(WT_v13_out_S[0]),
+      .WT_13_out_C(WT_v13_out_C[0])
+  );
+
+  // Second mid wrapper: same control inputs, outputs routed to element [1].
+  Mid_wrapper_tp_k_gp_1 mid_wrapper_1 (
+      .clk(clk),
+      .tree_rstn(tree_rstn),
+      .valid(valid),
+      .CST_LOW(CST_LOW),
+      .LM_sel(LM_sel),
+      .SW_in(SW_in),
+      .WT_0_out_S(WT_v0_out_S[1]),
+      .WT_0_out_C(WT_v0_out_C[1]),
+      .WT_1_out_S(WT_v1_out_S[1]),
+      .WT_1_out_C(WT_v1_out_C[1]),
+      .WT_2_out_S(WT_v2_out_S[1]),
+      .WT_2_out_C(WT_v2_out_C[1]),
+      .WT_3_out_S(WT_v3_out_S[1]),
+      .WT_3_out_C(WT_v3_out_C[1]),
+      .WT_4_out_S(WT_v4_out_S[1]),
+      .WT_4_out_C(WT_v4_out_C[1]),
+      .WT_5_out_S(WT_v5_out_S[1]),
+      .WT_5_out_C(WT_v5_out_C[1]),
+      .WT_6_out_S(WT_v6_out_S[1]),
+      .WT_6_out_C(WT_v6_out_C[1]),
+      .WT_7_out_S(WT_v7_out_S[1]),
+      .WT_7_out_C(WT_v7_out_C[1]),
+      .WT_8_out_S(WT_v8_out_S[1]),
+      .WT_8_out_C(WT_v8_out_C[1]),
+      .WT_9_out_S(WT_v9_out_S[1]),
+      .WT_9_out_C(WT_v9_out_C[1]),
+      .WT_10_out_S(WT_v10_out_S[1]),
+      .WT_10_out_C(WT_v10_out_C[1]),
+      .WT_11_out_S(WT_v11_out_S[1]),
+      .WT_11_out_C(WT_v11_out_C[1]),
+      .WT_12_out_S(WT_v12_out_S[1]),
+      .WT_12_out_C(WT_v12_out_C[1]),
+      .WT_13_out_S(WT_v13_out_S[1]),
+      .WT_13_out_C(WT_v13_out_C[1])
+  );
+endmodule
--- a/templates/inner_product.py
+++ b/templates/inner_product.py
+def read_mem_file(filepath):
+    with open(filepath, 'r') as file:
+        lines = file.readlines()
+        # 去除每行数据中的下划线并转换为十进制数
+        mem_vector = [int(line.strip().replace('_', ''), 2) for line in lines]
+    return mem_vector
+
+def read_weight_file(filepath):
+    with open(filepath, 'r') as file:
+        lines = file.readlines()
+        # 读取每个权重并转换为整数
+        weight_vector = [int(line.strip()) for line in lines]
+    return weight_vector
+
+def read_result_file(filepath):
+    with open(filepath, 'r') as file:
+        # 读取第一行并将其作为二进制数进行解释
+        result_binary_str = file.readline().strip()
+        # 将二进制数作为有符号数转换为十进制数
+        result_decimal = int(result_binary_str, 2)
+        # 如果二进制数是负数，则需要进行二补码转换
+        if result_decimal >= 2 ** (len(result_binary_str) - 1):
+            result_decimal -= 2 ** len(result_binary_str)
+    return result_decimal
+
+def vector_multiplication(mem_vector, weight_vector):
+    # 对位乘法
+    product_vector = [m * w for m, w in zip(mem_vector, weight_vector)]
+    return product_vector
+
+def main():
+    mem_vector = read_mem_file('F:/another-D/vivao/vivado_project/project_10_wallace_FSM_MUX/mem.txt')
+    weight_vector = read_weight_file('F:/another-D/vivao/vivado_project/project_10_wallace_FSM_MUX/weight.txt')
+
+    # 对位乘法
+    product_vector = vector_multiplication(mem_vector, weight_vector)
+
+    print("对位乘法结果：")
+    for i, product in enumerate(product_vector):
+        print(f"mem[{i}] * weight[{i}] = {mem_vector[i]} * {weight_vector[i]} = {product}")
+
+    # 总和
+    total_sum = sum(product_vector)
+    print(f"\n总和: {total_sum}; 二进制表示: {bin(total_sum)}")
+
+
+    # 读取result.txt中的二进制数并转换为十进制数
+    result_decimal = read_result_file('F:/another-D/vivao/vivado_project/project_10_wallace_FSM_MUX/result.txt')
+    print(f"\n从result.txt读取的二进制数对应的十进制值为: {result_decimal}\n")
+
+    # 对比result.txt中的值与部分和总和
+    if result_decimal == total_sum:
+        print("相同✓\n")
+    else:
+        print("不同×\n")
+
+if __name__ == "__main__":
+    main()
--- a/templates/mux.sv
+++ b/templates/mux.sv
+// Template for an optimized multiplexer. The selection logic has not been
+// written yet, so `out` has no driver in this file.
+module optimized_mux_tp_o_vc_1_value_3_cnt_0 (
+    input [1536 - 1:0] in,  // 1536-bit input bus
+    input [5:0] sel,  // 6-bit binary selector signal
+    output out  // Selected output
+);
+
+  // TODO: follow the original logic, then tie anything left unconnected to 0.
+  // NOTE(review): a 6-bit `sel` can address only 64 values while `in` is
+  // 1536 bits wide; confirm the intended grouping.
+
+endmodule
--- a/templates/mux_52to1_binary.sv
+++ b/templates/mux_52to1_binary.sv
--- a/templates/mux_wrapper.sv
+++ b/templates/mux_wrapper.sv
--- a/templates/tb_top_FSM.sv
+++ b/templates/tb_top_FSM.sv
--- a/templates/wtgroup.sv
+++ b/templates/wtgroup.sv