Commit 22f6d271 by nanziyuan

update

parent cd7f6e7f
#!/usr/bin/env python3
"""
agent_bench.py - Benchmark management tool for RTL agent evaluation.
This script manages the full workflow for evaluating RTL generation agents:
1. Generate benchmark tasks (mk_bench)
2. Evaluate model directly (run) - Calls LLM API, extracts Verilog code from response
3. Evaluate agent workflow (agent_run) - Uses mini-swe-agent framework
4. Collect results (collect)
5. Verify solutions (evaluate.py)
6. Pretty print results (pretty_print)
Subcommands:
mk_bench Generate benchmark tasks from project directories.
Creates a repository structure with verification files for each
module found in the projects.
Normal mode: Copies documentation to doc/ subdirectory.
Pure code mode (--pure-code): Copies the module's Verilog file
instead of documentation (for code-to-code tasks).
Usage: python agent_bench.py mk_bench --target <dir> [--pure-code]
run Test base reasoning model using litellm.
Directly calls the LLM API with task prompts to evaluate the
base model's ability without agent workflow.
The model is instructed to wrap Verilog code in ```verilog blocks.
The script extracts the pure code for the "code" field and saves
the raw response in the "raw_response" field.
Usage: python agent_bench.py run <repo> [output] --model <model> [options]
Options:
--base_url <url> API base URL (default: http://localhost:8000/v1)
--api_key <key> API key for authentication
--workers <n> Number of parallel workers (default: 1)
--n_samples <n> Samples per module (default: 1)
--disable_thinking Disable thinking mode (enabled by default)
--timeout <secs> Request timeout in seconds (default: 1800)
Example:
python agent_bench.py run ./agent/repo ./samples/Qwen3.5 \\
--model "openai/Qwen3.5-35B-A3B" \\
--base_url "http://localhost:30000/v1" \\
--workers 2 --n_samples 8
agent_run Run agent benchmark using mini-swe-agent.
Uses the mini-swe-agent framework to solve tasks with tool use
and iterative refinement. Each sample runs in an isolated
working directory to prevent file conflicts.
For N samples, creates N isolated copies (work_dir/sample_i/)
of the repository. Each copy runs independently.
Trajectory files (run.traj.json) are saved in each module
directory for later analysis.
Config is copied to work_dir/config.yaml as a backup.
Usage: python agent_bench.py agent_run <repo> <work_dir> [options]
Options:
-c, --config <path> Path to mini_code.yaml config file
-j, --workers <n> Number of parallel workers (default: 4)
--n_samples <n> Number of isolated samples (default: 1)
--timeout <secs> Timeout per task in seconds (default: 1800)
--resume Skip samples with existing trajectory files
Example:
python agent_bench.py agent_run ./agent/repo ./my_experiment \\
-c ./agent/mini_code.yaml -j 4 --n_samples 3 --timeout 1800
collect Aggregate Verilog results into grouped JSONL files.
Scans a source directory for module subdirectories and collects
their .v files into JSONL format, grouped by project prefix.
Automatically detects the testing directory structure (sample_* subdirs)
and collects from every sample, recording each sample's codeid.
Usage: python agent_bench.py collect --source <dir> --target <dir>
pretty_print Display verification results in a readable table format.
Reads the JSON file generated by evaluate.py and prints
formatted statistics including Pass@1 and Pass@5 metrics.
Usage: python agent_bench.py pretty_print <results_json>
Quick Start:
# 1. Generate benchmark tasks (normal mode with docs)
python agent_bench.py mk_bench --target ./agent/repo
# 1b. Generate benchmark tasks (pure code mode)
python agent_bench.py mk_bench --target ./agent/repo --pure-code
# 2. Evaluate model directly (base reasoning)
python agent_bench.py run ./agent/repo ./samples/Qwen3.5 \\
--model "openai/Qwen3.5-35B-A3B" \\
--base_url "http://localhost:30000/v1" \\
--workers 2 --n_samples 8
# 3. Evaluate agent workflow (uses mini-swe-agent)
python agent_bench.py agent_run ./agent/repo ./my_experiment \\
-c ./agent/mini_code.yaml -j 4 --n_samples 3
# 4. Collect results (auto-detects sample_* subdirs)
python agent_bench.py collect --source ./my_experiment --target ./samples/NAME
# 5. Verify solutions
python evaluate.py --solution_name NAME --task_level module --num_samples 1
# 6. View results
python agent_bench.py pretty_print results/NAME_module_results.json
Environment Variables:
OPENAI_API_KEY - API authentication key (if not using --api_key)
"""
import argparse
import json
import re
import shutil
import itertools
import subprocess
import numpy as np
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from tqdm import tqdm
from litellm import completion
MODULE_NAMES = [
"sd_bd", "sd_clock_divider", "sd_crc_16", "sd_crc_7", "sd_controller_wb", "sd_data_master",
"sd_cmd_master", "sd_rx_fifo", "sd_tx_fifo", "sd_fifo_rx_filler", "sd_fifo_tx_filler",
"sd_data_serial_host", "sd_cmd_serial_host", "sdc_controller", "aes_sbox", "aes_rcon",
"aes_inv_sbox", "aes_key_expand_128", "aes_cipher_top", "aes_inv_cipher_top", "e203_biu",
"e203_clk_ctrl", "e203_clkgate", "e203_core", "e203_cpu", "e203_cpu_top", "e203_dtcm_ctrl",
"e203_dtcm_ram", "e203_extend_csr", "e203_exu", "e203_exu_alu", "e203_exu_alu_bjp",
"e203_exu_alu_csrctrl", "e203_exu_alu_dpath", "e203_exu_alu_lsuagu", "e203_exu_alu_muldiv",
"e203_exu_alu_rglr", "e203_exu_branchslv", "e203_exu_commit", "e203_exu_csr", "e203_exu_decode",
"e203_exu_disp", "e203_exu_excp", "e203_exu_longpwbck", "e203_exu_nice", "e203_exu_oitf",
"e203_exu_regfile", "e203_exu_wbck", "e203_ifu", "e203_ifu_ifetch", "e203_ifu_ift2icb",
"e203_ifu_litebpu", "e203_ifu_minidec", "e203_irq_sync", "e203_itcm_ctrl", "e203_itcm_ram",
"e203_lsu", "e203_lsu_ctrl", "e203_reset_ctrl", "e203_srams"
]
SYSTEM_NAMES = ["sdc_controller", "aes_cipher_top", "aes_inv_cipher_top", "e203_cpu_top"]
def estimate_pass_at_k(num_samples, num_correct, k):
"""
Estimates pass@k of each problem and returns them in an array.
"""
def estimator(n: int, c: int, k: int) -> float:
"""
Calculates 1 - comb(n - c, k) / comb(n, k).
"""
if n - c < k:
return 1.0
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
if isinstance(num_samples, int):
num_samples_it = itertools.repeat(num_samples, len(num_correct))
else:
assert len(num_samples) == len(num_correct)
num_samples_it = iter(num_samples)
return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
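The estimator computes the standard unbiased pass@k estimate, 1 - C(n-c, k)/C(n, k), in a numerically stable product form rather than with binomial coefficients. A self-contained copy with worked values (mirrors the function above, nothing else assumed):

```python
import itertools

import numpy as np


def estimate_pass_at_k(num_samples, num_correct, k):
    """Unbiased pass@k: 1 - C(n - c, k) / C(n, k), computed as a stable product."""
    def estimator(n, c, k):
        if n - c < k:
            # Fewer than k failures: every k-subset contains at least one pass.
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        num_samples_it = iter(num_samples)
    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])


# With 10 samples and 3 correct, pass@1 reduces to the raw rate 3/10.
print(estimate_pass_at_k(10, [3], 1)[0])  # ≈ 0.3
# With 4 samples, 2 correct, k=3: only 2 failures, so pass@3 is certain.
print(estimate_pass_at_k(4, [2], 3)[0])   # 1.0
```

The product form avoids overflow for large n, which matters when scoring hundreds of samples per module.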
def cmd_mk_bench(args):
"""Generate benchmark tasks from project directories."""
# Configuration
projects = ['aes', 'e203_hbirdv2', 'sdc']
repo_base = Path(args.target)
pure_code = getattr(args, 'pure_code', False)
# 1. Create the base repository directory
repo_base.mkdir(parents=True, exist_ok=True)
for project_name in projects:
project_path = Path(project_name)
if not project_path.is_dir():
print(f"Skipping {project_name}: Directory not found.")
continue
print(f"Processing Project: {project_name}...")
# 2. Iterate through each module (subdirectory)
for module_path in project_path.iterdir():
if not module_path.is_dir():
continue
module_name = module_path.name
if module_name not in MODULE_NAMES and module_name not in SYSTEM_NAMES:
continue
module_dest = repo_base / module_name
module_dest.mkdir(parents=True, exist_ok=True)
# Collect verification files
verification_src = module_path / "verification"
if verification_src.exists():
for file in verification_src.iterdir():
if file.is_file() and not file.name.startswith(module_name):
shutil.copy2(file, module_dest / file.name)
if pure_code:
# Pure code mode: copy the module file, don't copy docs
module_file = module_path / f"{module_name}.v"
if module_file.exists():
shutil.copy2(module_file, module_dest / f"{module_name}.v")
else:
# Normal mode: copy documentation, don't copy module file
doc_src = module_path / "doc"
if doc_src.exists():
doc_dest = module_dest / "doc"
doc_dest.mkdir(exist_ok=True)
for md_file in doc_src.iterdir():
if md_file.is_file():
shutil.copy2(md_file, doc_dest / md_file.name)
print(f"\n[Success] Benchmark generated in {repo_base.resolve()}")
def cmd_collect(args):
"""Collect Verilog files into grouped JSONL files."""
groups = {
"aes": "aes.jsonl",
"sd": "sdc.jsonl",
"e203": "e203_hbirdv2.jsonl"
}
source_dir = Path(args.source)
target_dir = Path(args.target)
if not source_dir.exists():
print(f"Source not found: {source_dir}")
return
# Detect if source is a testing directory (contains sample_* subdirs)
sample_dirs = [d for d in source_dir.iterdir() if d.is_dir() and d.name.startswith("sample_")]
is_testing_dir = len(sample_dirs) > 0
results = {filename: [] for filename in groups.values()}
if is_testing_dir:
# Collect from testing directory structure: source/sample_*/module/
print(f"Detected testing directory with {len(sample_dirs)} samples")
for sample_dir in sample_dirs:
# Extract sample_id from sample_dir name (e.g., "sample_1" -> 1)
try:
sample_id = int(sample_dir.name.split("_")[1])
except (IndexError, ValueError):
print(f"Warning: Could not parse sample ID from {sample_dir.name}, skipping")
continue
for module_dir in sample_dir.iterdir():
if not module_dir.is_dir():
continue
# Match prefix
target_file = next((f for p, f in groups.items() if module_dir.name.startswith(p)), None)
if not target_file:
continue
v_path = module_dir / f"{module_dir.name}.v"
content = v_path.read_text(encoding='utf-8') if v_path.exists() else ""
results[target_file].append({
"task": module_dir.name,
"codeid": sample_id,
"code": content
})
else:
# Collect from regular structure: source/module/
for dir_path in source_dir.iterdir():
if not dir_path.is_dir():
continue
# Match prefix
target_file = next((f for p, f in groups.items() if dir_path.name.startswith(p)), None)
if not target_file:
continue
v_path = dir_path / f"{dir_path.name}.v"
content = v_path.read_text(encoding='utf-8') if v_path.exists() else ""
results[target_file].append({
"task": dir_path.name,
"codeid": 1,
"code": content
})
target_dir.mkdir(parents=True, exist_ok=True)
for filename, entries in results.items():
if entries:
out_path = target_dir / filename
with open(out_path, 'w', encoding='utf-8') as f:
for entry in entries:
f.write(json.dumps(entry) + '\n')
print(f"Saved {len(entries)} records to {out_path}")
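Downstream tools consume the grouped files one JSON object per line; a minimal loader sketch for the task/codeid/code records (the example path in the comment is illustrative):

```python
import json


def load_grouped_jsonl(path):
    """Yield the {task, codeid, code} records from one grouped JSONL file."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank lines between records
                yield json.loads(line)


# Example: count collected samples per task in one group file.
# from collections import Counter
# counts = Counter(rec["task"] for rec in load_grouped_jsonl("samples/NAME/aes.jsonl"))
```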
def cmd_pretty_print(args):
"""Pretty print verification results from a JSON file."""
results_path = Path(args.results)
if not results_path.exists():
print(f"Error: Results file not found: {results_path}")
return
try:
with open(results_path, 'r') as f:
data = json.load(f)
except json.JSONDecodeError as e:
print(f"Error: Failed to parse JSON: {e}")
return
solution_name = data.get("solution_name", "unknown")
task_level = data.get("task_level", "unknown")
num_samples = data.get("num_samples", 1)
is_formal = data.get("is_formal", False)
overall_stats = data.get("overall_stats", {})
infos = data.get("per_module", {})
# Print header
print("\n" + "=" * 70)
print(f"RTL Verification Results: {solution_name}")
print("=" * 70)
print(f"Task Level: {task_level}")
print(f"Samples: {num_samples}")
print(f"Formal Verification: {'Yes' if is_formal else 'No'}")
# Print overall stats
print("\n" + "-" * 70)
print("Overall Statistics")
print("-" * 70)
print(f"{'Metric':<15} | {'Pass@1':<12} | {'Pass@5':<12}")
print("-" * 45)
print(f"{'Syntax':<15} | {overall_stats.get('syntax_1', 0):<12.2%} | {overall_stats.get('syntax_5', 0):<12.2%}")
print(f"{'Function':<15} | {overall_stats.get('function_1', 0):<12.2%} | {overall_stats.get('function_5', 0):<12.2%}")
if is_formal:
print(f"{'Formal':<15} | {overall_stats.get('formal_1', 0):<12.2%} | {overall_stats.get('formal_5', 0):<12.2%}")
# Print per-module stats
print("\n" + "-" * 70)
print("Per-Module Results")
print("-" * 70)
header = f"{'Module/System Name':<35} | {'Syntax@1':<10} | {'Function@1':<10}"
if is_formal:
header += f" | {'Formal@1':<10}"
print(header)
print("-" * (75 if is_formal else 60))
for name, counts in sorted(infos.items()):
s1 = estimate_pass_at_k(num_samples, [counts[0]], 1)[0]
f1 = estimate_pass_at_k(num_samples, [counts[1]], 1)[0]
if is_formal:
fm1 = estimate_pass_at_k(num_samples, [counts[2]], 1)[0]
print(f"{name:<35} | {s1:<10.2%} | {f1:<10.2%} | {fm1:<10.2%}")
else:
print(f"{name:<35} | {s1:<10.2%} | {f1:<10.2%}")
print("-" * (75 if is_formal else 60))
print()
def run_mini_agent_task(task_tuple):
"""Run mini-swe-agent for a single task tuple (module_dir, module_name, sample_id, timeout, config_path).
Returns:
(module_name, sample_id, success, message)
"""
module_dir, module_name, sample_id, timeout, config_path = task_tuple
# Check if doc path exists
doc_path = module_dir / "doc" / f"{module_name}.md"
if not doc_path.exists():
return module_name, sample_id, False, f"Doc not found: {doc_path}"
# Build the task prompt
task = f'''Task: Implement the Verilog module {module_name} and a SystemVerilog testbench.
Requirements:
- Specifications: Follow `doc/{module_name}.md`.
- RTL: Module name must be `{module_name}`. Save as `{module_name}.v`.
- Testbench: Module name must be `tb`. Save as `{module_name}_tb.sv`.
- Compiler: Verilator. Code must be synthesizable.
Workflow:
1. Read repository files and Makefile.
2. Write the RTL and testbench code following `doc/{module_name}.md`.
3. Run "make all" to compile and simulate.
4. If errors occur, fix the code and repeat step 3 until successful.'''
# Build the mini command
# Use run.traj.json as the trajectory file name
# Use config from work_dir root (absolute path)
cmd = [
"mini",
"-o", "run.traj.json",
"-c", str(config_path),
"--exit-immediately",
"-t", task,
]
try:
result = subprocess.run(
cmd,
cwd=module_dir,
capture_output=True,
text=True,
timeout=timeout,
)
if result.returncode == 0:
return module_name, sample_id, True, f"Success {module_name} sample {sample_id}"
else:
return module_name, sample_id, False, f"Failed {module_name} sample {sample_id}: {result.stderr[-200:]}"
except subprocess.TimeoutExpired:
return module_name, sample_id, False, f"Timeout {module_name} sample {sample_id} after {timeout}s"
except Exception as e:
return module_name, sample_id, False, f"Error {module_name} sample {sample_id}: {e}"
def eval_agent(args):
"""Run agent benchmark using mini-swe-agent with isolated working directories."""
repo_dir = Path(args.repo)
config_path = Path(args.config)
work_dir = Path(args.work_dir)
if not repo_dir.exists():
print(f"Error: Repository directory not found: {repo_dir}")
return
if not config_path.exists():
print(f"Error: Config file not found: {config_path}")
return
# Find all module directories (subdirectories with doc/ or .v files)
module_dirs = [d for d in repo_dir.iterdir() if d.is_dir() and ((d / "doc").exists() or (d / f"{d.name}.v").exists())]
print(f"Found {len(module_dirs)} module directories")
# Step 1: Create working directories for each sample
# Structure: work_dir/sample_{i}/ where i is 1..n_samples
print(f"\nSetting up working directories in {work_dir}...")
work_dir.mkdir(parents=True, exist_ok=True)
# Copy config to work_dir root as a backup (for saving experiment config)
# Use absolute path so subprocess can find it regardless of cwd
config_backup = (work_dir / "config.yaml").resolve()
if not config_backup.exists():
shutil.copy(config_path, config_backup)
print(f" Copied config to {config_backup}")
for sample_id in range(1, args.n_samples + 1):
sample_dir = work_dir / f"sample_{sample_id}"
# Check if already exists (for resume)
if args.resume and sample_dir.exists():
print(f" Sample {sample_id}: directory exists, skipping copy")
continue
# Copy entire repo to sample directory
if sample_dir.exists():
shutil.rmtree(sample_dir)
shutil.copytree(repo_dir, sample_dir)
print(f" Copied repo to {sample_dir}")
print(f"Created {args.n_samples} working directories")
# Step 2: Create all tasks (sample_dir, module_name, sample_id, config_path) tuples
# Each sample_id gets its own isolated testing directory
all_tasks = []
for sample_id in range(1, args.n_samples + 1):
sample_dir = work_dir / f"sample_{sample_id}"
for module_dir in module_dirs:
module_name = module_dir.name
module_path_in_sample = sample_dir / module_name
all_tasks.append((module_path_in_sample, module_name, sample_id, args.timeout, config_backup))
print(f"Total tasks to process: {len(all_tasks)}")
# Step 3: If resume, filter completed tasks (check if run.traj.json exists in module dir)
pending_tasks = []
if args.resume:
for module_path_in_sample, module_name, sample_id, timeout, config_path in all_tasks:
traj_file = module_path_in_sample / "run.traj.json"
if traj_file.exists():
continue
pending_tasks.append((module_path_in_sample, module_name, sample_id, timeout, config_path))
print(f"Resuming: {len(pending_tasks)} pending tasks (skipped {len(all_tasks) - len(pending_tasks)} completed)")
else:
pending_tasks = all_tasks
if not pending_tasks:
print("All tasks already completed!")
return
# Step 4: Process tasks in parallel with progress bar
results_summary = {"success": 0, "error": 0}
with ProcessPoolExecutor(max_workers=args.workers) as executor:
futures = {executor.submit(run_mini_agent_task, task): task for task in pending_tasks}
for future in tqdm(as_completed(futures), total=len(pending_tasks), desc="Processing agent tasks"):
try:
module_name, sample_id, success, message = future.result()
if success:
results_summary["success"] += 1
print(f"[PASS] {message}")
else:
results_summary["error"] += 1
print(f"[FAIL] {message}")
except Exception as e:
task = futures[future]
results_summary["error"] += 1
print(f"[ERROR] {task[1]} sample {task[2]}: {e}")
print("\n========================================")
print("Agent benchmark execution completed")
print(f"Results: {results_summary['success']} success, {results_summary['error']} errors")
print(f"Trajectory files saved in: {work_dir}/sample_*/**/run.traj.json")
def extract_verilog_code(response: str) -> str:
"""Extract Verilog code from markdown code blocks."""
# Match ```verilog ... ``` or ``` ... ``` blocks
patterns = [
r'```verilog\s*(.*?)\s*```',
r'```\s*(.*?)\s*```',
]
for pattern in patterns:
match = re.search(pattern, response, re.DOTALL)
if match:
return match.group(1).strip()
return ""
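The pattern order matters: the ```verilog fence is tried before the bare ``` fallback, so explicitly tagged blocks win, and non-greedy matching with re.DOTALL takes the first fenced block in a multi-block response. A quick check of that behaviour (the sample reply text is made up):

```python
import re


def extract_verilog_code(response: str) -> str:
    """Return the first ```verilog block, falling back to any fenced block."""
    patterns = [
        r'```verilog\s*(.*?)\s*```',
        r'```\s*(.*?)\s*```',
    ]
    for pattern in patterns:
        match = re.search(pattern, response, re.DOTALL)
        if match:
            return match.group(1).strip()
    return ""


reply = (
    "Reasoning first.\n"
    "```verilog\n"
    "module adder(input a, b, output s);\n"
    "  assign s = a ^ b;\n"
    "endmodule\n"
    "```\n"
)
print(extract_verilog_code(reply).startswith("module adder"))  # True
print(repr(extract_verilog_code("no code here")))              # ''
```

An untagged fence such as ```\nwire w;\n``` still extracts via the fallback pattern, which is why responses that ignore the system prompt's format are not lost outright.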
def eval_model(args):
"""Evaluate the base model directly by calling the LLM API via litellm."""
repo_dir = Path(args.repo)
output_dir = Path(args.output)
# Check if output directory exists and has files (when not resuming)
if not args.resume and output_dir.exists():
existing_files = list(output_dir.glob("*.jsonl"))
if existing_files:
print(f"Error: Output directory '{output_dir}' already exists with JSONL files.")
print("Use --resume to append to existing files, or specify a different output directory.")
return
output_dir.mkdir(parents=True, exist_ok=True)
# Find all module directories (subdirectories with doc/ or .v files)
module_dirs = [d for d in repo_dir.iterdir() if d.is_dir() and ((d / "doc").exists() or (d / f"{d.name}.v").exists())]
print(f"Found {len(module_dirs)} module directories")
# Determine group for each module
def get_group_file(module_name: str) -> Path:
if module_name.startswith("aes"):
return output_dir / "aes.jsonl"
elif module_name.startswith("sd"):
return output_dir / "sdc.jsonl"
elif module_name.startswith("e203"):
return output_dir / "e203_hbirdv2.jsonl"
else:
return output_dir / "other.jsonl"
# Step 1: Create all tasks (module_name, sample_id pairs)
all_tasks = []
for module_dir in module_dirs:
module_name = module_dir.name
for sample_id in range(1, args.n_samples + 1):
all_tasks.append((module_name, sample_id, module_dir))
print(f"Total tasks to process: {len(all_tasks)}")
# Step 2: If resume, read existing JSONL files and filter completed tasks
completed_tasks = set() # Set of (module_name, sample_id) tuples
if args.resume:
group_files = ["aes.jsonl", "sdc.jsonl", "e203_hbirdv2.jsonl", "other.jsonl"]
for filename in group_files:
filepath = output_dir / filename
if filepath.exists():
try:
with open(filepath, 'r', encoding='utf-8') as f:
for line in f:
if line.strip():
try:
entry = json.loads(line)
task_key = (entry.get("task"), entry.get("codeid"))
completed_tasks.add(task_key)
except json.JSONDecodeError:
continue
except Exception as e:
print(f"Warning: Failed to read {filepath}: {e}")
print(f"Found {len(completed_tasks)} completed tasks from previous runs")
# Filter out completed tasks
pending_tasks = [
(module_name, sample_id, module_dir)
for module_name, sample_id, module_dir in all_tasks
if (module_name, sample_id) not in completed_tasks
]
print(f"Pending tasks to process: {len(pending_tasks)}")
if not pending_tasks:
print("All tasks already completed!")
return
SYSTEM_PROMPT = (
"You are an expert Verilog engineer. Your task is to implement hardware modules based on specifications.\n\n"
"IMPORTANT: Your response must follow this exact format:\n\n"
"```verilog\n"
"// Your complete Verilog module implementation here\n"
"```"
)
# Cache for doc content to avoid re-reading files
doc_cache = {}
def process_single_task(task):
"""Process a single task tuple (module_name, sample_id, module_dir)."""
module_name, sample_id, module_dir = task
group_file = get_group_file(module_name)
# Read doc content (with caching)
if module_name not in doc_cache:
doc_file = module_dir / "doc" / f"{module_name}.md"
if doc_file.exists():
try:
doc_cache[module_name] = doc_file.read_text(encoding='utf-8')
except Exception as e:
print(f"[ERROR] {module_name} sample {sample_id}: Failed to read doc: {e}")
return None
else:
print(f"[ERROR] {module_name} sample {sample_id}: Cannot find doc")
return None
doc_content = doc_cache[module_name]
# Build messages
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": doc_content}
]
try:
# Determine thinking mode (enabled by default)
thinking_enabled = not args.disable_thinking
# Prepare extra_body for thinking control
# Only pass enable_thinking when we need to disable it
extra_body = None
if args.disable_thinking:
extra_body = {"chat_template_kwargs": {"enable_thinking": False}}
# Call litellm completion
response = completion(
model=args.model,
messages=messages,
max_tokens=8192,
api_base=args.base_url,
api_key=args.api_key,
extra_body=extra_body,
timeout=args.timeout
)
# Extract response content
response_content = response.choices[0].message.content if response.choices else ""
# If thinking is enabled, split by </think> to get the actual response
if thinking_enabled and "</think>" in response_content:
response_content = response_content.split("</think>")[-1].strip()
# Extract pure Verilog code
extracted_code = extract_verilog_code(response_content)
result = {
"task": module_name,
"codeid": sample_id,
"code": extracted_code,
"raw_response": response_content,
"model": args.model,
"thinking_enabled": thinking_enabled
}
# Save immediately (append mode) - only save successful results
with open(group_file, 'a', encoding='utf-8') as f:
f.write(json.dumps(result) + '\n')
return result
except Exception as e:
# Print error but don't write to result file (to not break resume logic)
print(f"[ERROR] {module_name} sample {sample_id}: {e}")
return None
# Step 3 & 4: Process tasks in parallel with progress bar, save immediately
results_summary = {"success": 0, "error": 0}
with ThreadPoolExecutor(max_workers=args.workers) as executor:
futures = {executor.submit(process_single_task, task): task for task in pending_tasks}
for future in tqdm(as_completed(futures), total=len(pending_tasks), desc="Processing tasks"):
try:
result = future.result()
if result is None:
results_summary["error"] += 1
else:
results_summary["success"] += 1
except Exception as e:
task = futures[future]
results_summary["error"] += 1
print(f"[ERROR] {task[0]} sample {task[1]}: {e}")
print("\n========================================")
print("Benchmark execution completed")
print(f"Results: {results_summary['success']} success, {results_summary['error']} errors")
print(f"Results saved in: {output_dir}")
def main():
parser = argparse.ArgumentParser(
description="Benchmark management tool for RTL agent evaluation."
)
subparsers = parser.add_subparsers(dest="command", required=True)
# mk_bench subcommand
mk_bench_parser = subparsers.add_parser(
"mk_bench",
help="Generate benchmark tasks from project directories"
)
mk_bench_parser.add_argument(
"--target",
required=True,
help="Target directory to generate benchmark tasks"
)
mk_bench_parser.add_argument(
"--pure-code",
action="store_true",
help="Pure code mode: copy module Verilog files instead of documentation"
)
mk_bench_parser.set_defaults(func=cmd_mk_bench)
# collect subcommand
collect_parser = subparsers.add_parser(
"collect",
help="Collect Verilog files into grouped JSONL files"
)
collect_parser.add_argument(
"--source",
required=True,
help="Source directory containing subdirectories"
)
collect_parser.add_argument(
"--target",
required=True,
help="Target directory to save .jsonl files"
)
collect_parser.set_defaults(func=cmd_collect)
# run subcommand
run_parser = subparsers.add_parser(
"run",
help="Evaluate the base model directly using litellm to call LLM APIs"
)
run_parser.add_argument(
"repo",
help="Path to the benchmark repository (e.g., ./agent/repo)"
)
run_parser.add_argument(
"output",
help="Output directory for results (e.g., ./samples/Qwen3.5-35B-A3B)"
)
run_parser.add_argument(
"--model",
required=True,
help="Model name to use (e.g., openai/Qwen3.5-35B-A3B)"
)
run_parser.add_argument(
"--base_url",
default="http://localhost:8000/v1",
help="API base URL (default: http://localhost:8000/v1)"
)
run_parser.add_argument(
"--api_key",
default=None,
help="API key for authentication"
)
run_parser.add_argument(
"--workers",
type=int,
default=1,
help="Number of parallel workers (default: 1)"
)
run_parser.add_argument(
"--n_samples",
type=int,
default=1,
help="Number of samples to generate per module (default: 1)"
)
run_parser.add_argument(
"--disable_thinking",
action="store_true",
help="Disable thinking mode (enabled by default)"
)
run_parser.add_argument(
"--timeout",
type=int,
default=1800,
help="Request timeout in seconds (default: 1800)"
)
run_parser.add_argument(
"--resume",
action="store_true",
help="Resume from existing output, append to existing files"
)
run_parser.set_defaults(func=eval_model)
# agent_run subcommand
agent_run_parser = subparsers.add_parser(
"agent_run",
help="Run agent benchmark using mini-swe-agent"
)
agent_run_parser.add_argument(
"repo",
help="Path to the benchmark repository (e.g., ./agent/repo)"
)
agent_run_parser.add_argument(
"work_dir",
help="Working directory for this run (e.g., ./my_experiment). Will contain sample_*/ subdirs"
)
agent_run_parser.add_argument(
"-c", "--config",
type=Path,
default=Path("/workspace/S/nanziyuan/src/test/tmp/mini_code.yaml"),
help="Path to mini_code.yaml config file (default: /workspace/S/nanziyuan/src/test/tmp/mini_code.yaml)"
)
agent_run_parser.add_argument(
"-j", "--workers",
type=int,
default=4,
help="Number of parallel workers (default: 4)"
)
agent_run_parser.add_argument(
"--n_samples",
type=int,
default=1,
help="Number of isolated copies to create (default: 1)"
)
agent_run_parser.add_argument(
"--timeout",
type=int,
default=1800,
help="Timeout per task in seconds (default: 1800 = 30min)"
)
agent_run_parser.add_argument(
"--resume",
action="store_true",
help="Resume from existing trajectories, skip completed tasks"
)
agent_run_parser.set_defaults(func=eval_agent)
# pretty_print subcommand
pretty_print_parser = subparsers.add_parser(
"pretty_print",
help="Pretty print verification results from a JSON file"
)
pretty_print_parser.add_argument(
"results",
help="Path to the results JSON file (e.g., results/solution_module_results.json)"
)
pretty_print_parser.set_defaults(func=cmd_pretty_print)
args = parser.parse_args()
args.func(args)
if __name__ == "__main__":
main()
@@ -2,50 +2,13 @@ agent:
system_template: |
You are a hardware design documentation expert and a helpful assistant that can interact with a computer to analyze Verilog/SystemVerilog source code and generate production-quality design specifications.
Your response must contain exactly ONE bash code block with ONE command (or commands connected with && or ||).
Include a THOUGHT section before your command where you explain your reasoning process.
Format your response as shown in <format_example>.
<format_example>
THOUGHT: Your reasoning and analysis here. Explain why you want to perform the action.
```mswea_bash_command
your_command_here
```
</format_example>
Failure to follow these rules will cause your response to be rejected.
<critical_rule>
**NEVER use triple backticks (```) inside your bash command block.**
The parser extracts your command by matching the opening and closing ``` fences. Any ``` inside the block will cause a parsing/extraction error and your action will fail.
When writing Markdown files that need code blocks, you MUST use **4-space indentation** for code blocks instead of fenced code blocks.
Example — WRONG (will break):
```mswea_bash_command
cat <<'EOF' > spec.md
```
diagram here
```
EOF
```
Example — CORRECT (using 4-space indentation):
```mswea_bash_command
cat <<'EOF' > spec.md
## Block Diagram
[Input] -> [Block A] -> [Output]
|
v
[Control]
EOF
```
</critical_rule>
---
## Writing Style
- Use present tense ("The module performs...", not "The module will perform...")
- Be precise and specific: say "4-bit counter" not "a counter"; avoid "various", "several", or "etc."
- Every claim must be directly traceable to the RTL source — do not speculate about design intent
- Use consistent terminology throughout the document
## Design Specification Format Reference
@@ -144,313 +107,172 @@ agent:
- **Resources**: Concrete resource needs (e.g., "Requires 16 S-box modules", "4×4 state matrix register array")
- Do NOT speculate on gate counts or technology-specific numbers unless evident from the RTL
---
<example_spec>
# aes_cipher_top
## 1. Introduction
The aes_cipher_top module is the core control module of the entire AES encryption system, responsible for coordinating and controlling the entire encryption process. This module implements all round transformations of the AES encryption algorithm, including SubBytes, ShiftRows, MixColumns, and AddRoundKey.
## 2. Block Diagram
```
+---------------------+
key[127:0] ---->| aes_key_expand_128 |----> w0,w1,w2,w3 (round keys)
+---------------------+ |
v
text_in[127:0] ---> [XOR w/ key] ---> [State Matrix sa[4][4]]
|
+-----v------+
| Round Loop | (10 rounds)
| +--------+ |
| |SubBytes | | <-- 16x aes_sbox
| +--------+ |
| |ShiftRows| |
| +--------+ |
| |MixCols | | (skip in final round)
| +--------+ |
| |AddRndKey| |
| +--------+ |
+------+------+
|
v
text_out[127:0], done
```
## 3. Interface
| Signal Name | Direction | Width | Description |
|-------------|-----------|-------|-------------|
| clk | input | 1 | Clock signal |
| rst | input | 1 | Reset signal |
| ld | input | 1 | Load enable |
| done | output | 1 | Encryption complete |
| key | input | 128 | Input key |
| text_in | input | 128 | Input plaintext |
| text_out | output | 128 | Output ciphertext |
## 4. Registers
| Register Name | Width | Type | Reset Value | Description |
|---------------|-------|------|-------------|-------------|
| text_in_r | 128 | Data Buffer | 0 | Temporary storage register for input text during encryption process |
| sa[0:3][0:3] | 8 | State Matrix | 0 | 4×4 state matrix registers for storing and processing intermediate encryption results, arranged in column-major order |
| dcnt | 4 | Control | 0 | Round counter register for tracking encryption progress, initial value 11 (0xB) for 10 rounds plus initial round |
| ld_r | 1 | Control | 0 | Load operation flag register, synchronizes data loading with clock cycle |
| text_out | 128 | Output Buffer | 0 | Output register storing the final ciphertext result after complete encryption process |
## 5. Operation Principle
### 5.1 Basic Principle
AES encryption algorithm, based on the state matrix concept, performs 10 rounds of transformation on 128-bit input data. Each round includes:
1. SubBytes - Non-linear byte substitution
2. ShiftRows - Row shifting operation
3. MixColumns - Column mixing operation (except the final round)
4. AddRoundKey - Round key addition
### 5.2 Mathematical Principle
1. State Matrix Structure
$$\begin{matrix}
sa00 & sa01 & sa02 & sa03 \\
sa10 & sa11 & sa12 & sa13 \\
sa20 & sa21 & sa22 & sa23 \\
sa30 & sa31 & sa32 & sa33
\end{matrix}$$
- 4×4 byte matrix
- Column-major order mapping
- Each element is 8-bit data
2. Round Transformations
a) **SubBytes**: Non-linear transformation through the S-box
- Independent byte transformation
- 16 bytes processed in parallel
b) **ShiftRows**: Cyclic left shift
Row 0: [a b c d] → [a b c d] // No shift
Row 1: [a b c d] → [b c d a] // Shift 1 byte
Row 2: [a b c d] → [c d a b] // Shift 2 bytes
Row 3: [a b c d] → [d a b c] // Shift 3 bytes
c) **MixColumns**: Column mixing
$$\begin{bmatrix}
02 & 03 & 01 & 01 \\
01 & 02 & 03 & 01 \\
01 & 01 & 02 & 03 \\
03 & 01 & 01 & 02
\end{bmatrix} \times \begin{bmatrix} s0 \\ s1 \\ s2 \\ s3 \end{bmatrix} = \begin{bmatrix} out0 \\ out1 \\ out2 \\ out3 \end{bmatrix}$$
- Operations in GF($2^{8}$)
- ×2 requires special finite field multiplication
- ×3 equals (×2)⊕(×1)
d) **AddRoundKey**: Round key XOR
- Byte-wise XOR with the round key
- Round key generated by the key expansion module
## 6. Implementation Details
### 6.1 Data Flow
```
text_in -> XOR(key) -> sa[][] -> SubBytes -> ShiftRows -> MixColumns -> AddRoundKey -> sa[][]
                                    ^                                                    |
                                    |______________ (repeat 9 rounds) ___________________|
                                            (final round: skip MixColumns)
                                                        |
                                                        v
                                                    text_out
```
### 6.2 State Machine
- **IDLE**: Initial reset state, waiting for load signal ld=1.
- **INIT_ROUND**: Loads input data into text_in_r. Performs the initial round key addition. Sets round counter dcnt=0xB (11).
- **ROUND_OP**: Performs standard round operations (Rounds 1-9).
- **FINAL_ROUND**: Executes the final round (Round 10). Does not include MixColumns.
- **DONE**: Encryption complete. Sets the done signal. Holds the final result at text_out.
### 6.3 Data Loading Operation
1. Input Data Mapping:
- 128-bit input mapped to the state matrix in column-major order
- Fill matrix columns from MSB to LSB
2. Initial Round Key Addition:
- Byte-wise XOR of input data with the initial round key
- Synchronous loading into the state matrix registers
### 6.4 Core Operation Implementation
1. **SubBytes**: 16 parallel S-box units, all bytes processed simultaneously, pure combinational
2. **ShiftRows**: Hardwired row shifting, independent cyclic left shifts per row
3. **MixColumns**: Four columns in parallel, matrix multiplication in GF(2^8)
4. **AddRoundKey**: Byte-wise XOR, results stored directly in the state matrix registers
### 6.5 Control Logic
- 4-bit counter, initial value 11, decrements until completion
- Load signal controls initial data input
- One round per clock cycle, synchronous updates
### 6.6 Output Generation
1. Skip MixColumns in the final round
2. Complete ShiftRows
3. XOR with the final round key
4. Organize output in column-major order
5. Generate the completion signal
## 7. Submodules
### 7.1 aes_key_expand_128
#### 7.1.1 Description
Expands the 128-bit initial key into the round keys for the encryption rounds.
#### 7.1.2 Interface
| Signal Name | Direction | Width | Description |
|-------------|-----------|-------|-------------|
| clk | input | 1 | Clock signal |
| kld | input | 1 | Key load enable |
| key | input | 128 | Input initial key |
| wo_0 | output | 32 | Output round key word 0 |
| wo_1 | output | 32 | Output round key word 1 |
| wo_2 | output | 32 | Output round key word 2 |
| wo_3 | output | 32 | Output round key word 3 |
### 7.2 aes_sbox
#### 7.2.1 Description
Performs the non-linear byte substitution operation of the AES encryption algorithm.
#### 7.2.2 Interface
| Signal Name | Direction | Width | Description |
|-------------|-----------|-------|-------------|
| a | input | 8 | Input byte |
| b | output | 8 | Substituted byte |
## 8. Corner Cases
1. Timing Related:
- Initial state after reset
- State transition during new data loading
2. Data Loading:
- Data synchronization during load
- Data stability assurance
3. Completion Check:
- Output control after the final round
- Proper done signal generation
## 9. Constraints
1. Timing Constraints:
- Register updates on the clock rising edge
- Critical path includes S-box lookup and column mixing
2. Resource Constraints:
- Requires 16 S-box modules
- 4×4 state matrix register array
- Round counter and control logic
</example_spec>
instance_template: |
## Task
{{task}}
You can execute bash commands to explore the codebase, read Verilog files, and generate the final design specification.
---
## Command Execution Rules
You are operating in an environment where
1. You issue at least one command
2. The system executes the command(s) in a subshell
3. You see the result(s)
4. You write your next command(s)
Each response should include:
1. **Reasoning text** where you explain your analysis and plan
2. At least one tool call with your command
**CRITICAL REQUIREMENTS:**
- Your response SHOULD include reasoning text explaining what you're doing
- Your response MUST include AT LEAST ONE bash tool call
- Directory or environment variable changes are not persistent. Every action is executed in a new subshell.
- However, you can prefix any action with `MY_ENV_VAR=MY_VALUE cd /path/to/working/dir && ...` or write/load environment variables from files
- Submit your changes and finish your work by issuing the following command: `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`.
Do not combine it with any other command. <important>After this command, you cannot continue working on this task.</important>
Example of a CORRECT response:
<example_response>
I need to understand the structure of the repository first. Let me check what files are in the current directory to get a better understanding of the codebase.
[Makes bash tool call with {"command": "ls -la"} as arguments]
</example_response>
<system_information>
{{system}} {{release}} {{version}} {{machine}}
</system_information>
## Useful command examples
### Create a new file:
```bash
cat <<'EOF' > newfile.py
import numpy as np
hello = "world"
print(hello)
EOF
```
### Edit files with sed:
{%- if system == "Darwin" -%}
<important>
You are on MacOS. For all the below examples, you need to use `sed -i ''` instead of `sed -i`.
</important>
{%- endif -%}
```bash
# Replace all occurrences
sed -i 's/old_string/new_string/g' filename.py
# Replace only the first occurrence
sed -i 's/old_string/new_string/' filename.py
# Replace the first occurrence on line 1
sed -i '1s/old_string/new_string/' filename.py
# Replace all occurrences in lines 1-10
sed -i '1,10s/old_string/new_string/g' filename.py
```
### View file content:
```bash
# View specific lines with numbers
nl -ba filename.py | sed -n '10,20p'
```
### Any other command you want to run
```bash
anything
```
## Recommended Workflow
This workflow should be done step-by-step so that you can iterate on your changes and fix any possible problems.
### Step 1: Explore Repository and Locate Target Module
```bash
find . -type f \( -name "*.v" -o -name "*.sv" -o -name "*.vh" -o -name "*.svh" \) | sort
```
> If the target file exceeds ~1000 lines, read it in sections using `sed -n 'START,ENDp'`.
### Step 2: Read the Target Module and Its Dependencies
```bash
cat -n path/to/module.v
```
Then check for includes/defines:
```bash
grep -n -C 2 '`include\|`define' path/to/module.v
```
### Step 3: Understand How the Module Is Used
Search the project for where the target module is instantiated — this reveals its system role, input sources, and output consumers.
```bash
grep -rn -C 5 'module_name' . --include="*.v" --include="*.sv" | grep -v '^\s*//'
```
### Step 4: Identify Submodules
Build a pattern file of all module names in the project:
```bash
grep -rh '^\s*module\s\+' . --include="*.v" --include="*.sv" \
| sed 's/^\s*module\s\+//;s/[#( ].*//' | sort -u > /tmp/all_modules.txt \
&& cat /tmp/all_modules.txt
```
Then match against the target file to find instantiations:
```bash
grep -n -w -C 3 -f /tmp/all_modules.txt path/to/module.v
```
Then read each submodule source (one per step):
```bash
cat -n path/to/submodule.v
```
> If a submodule's source cannot be found, infer the interface from the instantiation ports and note it in Section 7.
### Step 5: Analyze Architecture
Use `-C`/`-A`/`-B` flags for context around matches.
```bash
grep -n -A 5 'always_ff\|always @\s*(posedge\|always @\s*(negedge' path/to/module.v
grep -n -A 5 'always_comb\|always @\s*(\*)' path/to/module.v
grep -n -C 1 'parameter\|localparam' path/to/module.v
grep -n -C 3 '\bcase\b\|\bstate\b\|IDLE\|DONE' path/to/module.v
```
Trace specific signals as needed:
```bash
grep -n -C 3 'signal_name' path/to/module.v
```
### Step 6: Generate the Specification
Write the complete spec following Sections 1–9 from the system instructions.
Key reminders when writing:
- **Section 4 (Registers)**: Use functional type categories (Data Buffer, Control, Counter, State Matrix, Output Buffer, Flag), not "Sequential"/"Combinatorial"
- **Section 5 (Operation Principle)**: Include mathematical formulas using LaTeX ($...$, $$...$$). Show matrices, field operations, and shift patterns concretely.
- **Section 6 (Implementation Details)**: This is the largest section. Include sub-sections for data flow, the state machine (list every state and what it does), data loading, core operations, control logic, and output generation.
- **Section 7 (Submodules)**: Give each submodule its own numbered sub-section with a Description and an Interface table.
- **Section 8 (Corner Cases)**: Group by category, use concise bullets.
- **Section 9 (Constraints)**: Be practical — mention critical paths and concrete resource needs. Do not guess technology-specific numbers.
```bash
cat <<'EOF' > module_name_spec.md
... (the full Sections 1-9 spec content goes here) ...
EOF
```
### Step 7: Review and Validate
```bash
cat -n module_name_spec.md
```
**Validation checklist:**
| # | Check |
|---|-------|
| 1 | Every RTL port appears in Section 3.2 |
| 2 | Every `parameter`/`localparam` is in Section 3.1 |
| 3 | Every registered signal is in Section 4 |
| 4 | Every submodule instantiation has a Section 7.N entry |
| 5 | FSM states in Section 6.2 match all states in the RTL |
| 6 | No section is empty or contains placeholder text |
> If any check fails, fix the spec and re-validate before submitting.
### Step 8: Submit
```bash
echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT
```
---
## Quick-Reference: Grep Patterns for Verilog Analysis
> Use `-C n` for context, `-A n` for lines after, `-B n` for lines before.
| Purpose | Command |
|---------|---------|
| Module declaration | `grep -n '^\s*module\b' file.v` |
| All ports | `grep -n '\binput\b\|\boutput\b\|\binout\b' file.v` |
| Parameters / localparams | `grep -n -C 1 '\bparameter\b\|\blocalparam\b' file.v` |
| Sequential blocks | `grep -n -A 5 'always_ff\|always @\s*(posedge' file.v` |
| Combinational blocks | `grep -n -A 5 'always_comb\|always @\s*(\*)' file.v` |
| FSM / case blocks | `grep -n -C 3 '\bcase\b\|\bcasez\b\|\bcasex\b' file.v` |
| Assign statements | `grep -n '^\s*assign\b' file.v` |
| Generate blocks | `grep -n -C 2 '\bgenerate\b\|\bgenvar\b' file.v` |
| Find where module is used | `grep -rn -C 5 'module_name' . --include="*.v" --include="*.sv"` |
| Trace a signal | `grep -n -C 3 '\bsignal_name\b' file.v` |
step_limit: 30
cost_limit: 3.0
mode: yolo
environment:
TQDM_DISABLE: '1'
model:
observation_template: |
{%- if output.output | length < 10000 -%}
{
"returncode": {{ output.returncode }},
"output": {{ output.output | tojson }}
{%- if output.exception_info %}, "exception_info": {{ output.exception_info | tojson }}{% endif %}
}
{%- else -%}
{
"returncode": {{ output.returncode }},
"output_head": {{ output.output[:5000] | tojson }},
"output_tail": {{ output.output[-5000:] | tojson }},
"elided_chars": {{ output.output | length - 10000 }},
"warning": "Output too long."
{%- if output.exception_info %}, "exception_info": {{ output.exception_info | tojson }}{% endif %}
}
{%- endif -%}
format_error_template: |
Tool call error:
<error>
{{error}}
</error>
Here is general guidance on how to submit correct tool calls:
Every response needs to use the 'bash' tool at least once to execute commands.
Call the bash tool with your command as the argument:
- Tool: bash
- Arguments: {"command": "your_command_here"}
If you want to end the task, please issue the following command: `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`
without any other command.
cost_tracking: "ignore_errors"
# model_name: "openai/Qwen3.5-35B-A3B"
model_name: "openai/ep-20260305183120-jg8dt"
model_class: litellm
model_kwargs:
custom_llm_provider: "openai"
api_base: "https://ark.cn-beijing.volces.com/api/v3"
# api_base: "http://0.0.0.0:30000/v1"
extra_body:
chat_template_kwargs:
enable_thinking: false
# drop_params: true
max_tokens: 32768
thinking:
type: enabled
mini -t "Generate specification for {{code_path}} and save it at {{spec_path}}."
```
The agent trajectory and some runtime information are saved to run.traj.json. It is recommended to keep this file; it can be used for later training.
## RealBench
First, follow the Readme at https://github.com/IPRC-DIP/RealBench to set up the environment and unpack the data.
Detailed usage instructions are in the docstring at the top of `agent_bench.py`:
```python
#!/usr/bin/env python3
"""
agent_bench.py - Benchmark management tool for RTL agent evaluation.
This script manages the full workflow for evaluating RTL generation agents:
1. Generate benchmark tasks (mk_bench)
2. Evaluate model directly (run) - Calls LLM API, extracts Verilog code from response
3. Evaluate agent workflow (agent_run) - Uses mini-swe-agent framework
4. Collect results (collect)
5. Verify solutions (evaluate.py)
6. Pretty print results (pretty_print)
Subcommands:
mk_bench Generate benchmark tasks from project directories.
Creates a repository structure with verification files for each
module found in the projects.
Normal mode: Copies documentation to doc/ subdirectory.
Pure code mode (--pure-code): Copies the module's Verilog file
instead of documentation (for code-to-code tasks).
Usage: python agent_bench.py mk_bench --target <dir> [--pure-code]
run Test base reasoning model using litellm.
Directly calls the LLM API with task prompts to evaluate the
base model's ability without agent workflow.
The model is instructed to wrap Verilog code in ```verilog blocks.
The script extracts the pure code for the "code" field and saves
the raw response in the "raw_response" field.
Usage: python agent_bench.py run <repo> [output] --model <model> [options]
Options:
--base_url <url> API base URL (default: http://localhost:8000/v1)
--api_key <key> API key for authentication
--workers <n> Number of parallel workers (default: 1)
--n_samples <n> Samples per module (default: 1)
--disable_thinking Disable thinking mode (enabled by default)
--timeout <secs> Request timeout in seconds (default: 1800)
Example:
python agent_bench.py run ./agent/repo ./samples/Qwen3.5 \\
--model "openai/Qwen3.5-35B-A3B" \\
--base_url "http://localhost:30000/v1" \\
--workers 2 --n_samples 8
agent_run Run agent benchmark using mini-swe-agent.
Uses the mini-swe-agent framework to solve tasks with tool use
and iterative refinement. Each sample runs in an isolated
working directory to prevent file conflicts.
For N samples, creates N isolated copies (work_dir/sample_i/)
of the repository. Each copy runs independently.
Trajectory files (run.traj.json) are saved in each module
directory for later analysis.
Config is copied to work_dir/config.yaml as a backup.
Usage: python agent_bench.py agent_run <repo> <work_dir> [options]
Options:
-c, --config <path> Path to mini_code.yaml config file
-j, --workers <n> Number of parallel workers (default: 4)
--n_samples <n> Number of isolated samples (default: 1)
--timeout <secs> Timeout per task in seconds (default: 1800)
--resume Skip samples with existing trajectory files
Example:
python agent_bench.py agent_run ./agent/repo ./my_experiment \\
-c ./agent/mini_code.yaml -j 4 --n_samples 3 --timeout 1800
collect Aggregate Verilog results into grouped JSONL files.
Scans a source directory for module subdirectories and collects
their .v files into JSONL format, grouped by project prefix.
Automatically detects testing directory structure (sample_* subdirs)
and collects from all samples with correct codeid.
Usage: python agent_bench.py collect --source <dir> --target <dir>
pretty_print Display verification results in a readable table format.
Reads the JSON file generated by evaluate.py and prints
formatted statistics including Pass@1 and Pass@5 metrics.
Usage: python agent_bench.py pretty_print <results_json>
Quick Start:
# 1. Generate benchmark tasks (normal mode with docs)
python agent_bench.py mk_bench --target ./agent/repo
# 1b. Generate benchmark tasks (pure code mode)
python agent_bench.py mk_bench --target ./agent/repo --pure-code
# 2. Evaluate model directly (base reasoning)
python agent_bench.py run ./agent/repo ./samples/Qwen3.5 \
--model "openai/Qwen3.5-35B-A3B" \
--base_url "http://localhost:30000/v1" \
--workers 2 --n_samples 8
# 3. Evaluate agent workflow (uses mini-swe-agent)
python agent_bench.py agent_run ./agent/repo ./my_experiment \
-c ./agent/mini_code.yaml -j 4 --n_samples 3
# 4. Collect results (auto-detects sample_* subdirs)
python agent_bench.py collect --source ./my_experiment --target ./samples/NAME
# 5. Verify solutions
python evaluate.py --solution_name NAME --task_level module --num_samples 1
# 6. View results
python agent_bench.py pretty_print results/NAME_module_results.json
Environment Variables:
OPENAI_API_KEY - API authentication key (if not using --api_key)
```
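The Pass@1/Pass@5 numbers printed by `pretty_print` are conventionally computed with the unbiased pass@k estimator over n samples with c correct; whether `evaluate.py` uses exactly this formula is an assumption, but the sketch below shows the standard definition:

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate: 1 - C(n-c, k) / C(n, k) for n samples, c correct."""
    if n - c < k:
        return 1.0  # fewer failures than k: every size-k draw contains a success
    return 1.0 - comb(n - c, k) / comb(n, k)

print(pass_at_k(8, 4, 1))  # → 0.5
```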