Commit 251bf975 by nzy

step1: sample code

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
readme.pdf
\ No newline at end of file
model = "/path/to/model"
apps = "/path/to/apps_dataset"

[sample]
sample_prompt_path = "path"
sample_result_path = "path"

[sample.sampling_params]
n = 0
temperature = 0.0
max_tokens = 512

[evaluate]
evaluate_result_path = ""
\ No newline at end of file
---
title: "Code Critic"
format: typst
---
## Abstract
LLM-based code verifiers are essential for improving the accuracy of large language models (LLMs) in code generation at test time by filtering out incorrect solutions. While humans can pinpoint bugs and judge correctness by reasoning through the code, current code verifiers only make end-to-end binary judgments, which limits how thoroughly they can verify code.
In this paper, we present COVER, a novel approach that strengthens code verifiers by integrating error localization and reasoning. During training, COVER leverages LLMs to generate diverse code samples and assesses their correctness with test cases. It then automatically produces detailed error annotations by comparing each incorrect code sample with its most similar correct counterpart. This comparison enables the LLMs to generate explanatory annotations that highlight both the location and the nature of the errors. During inference, the verifier, trained on this enriched dataset, evaluates multiple candidate solutions, not only judging their correctness but also reasoning about potential bugs, and ultimately selects the most accurate solution.
Our experimental results demonstrate that COVER significantly outperforms existing methods, improving test accuracy by X% on [specific benchmark name] and Y% on [another benchmark name], a substantial advance in code verification for LLMs.
## Methods
### Step1 Sample & Evaluate
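Step 1 builds sampling prompts from the APPS dataset, samples candidate solutions with vLLM, and then evaluates every sample against the APPS test cases (the `eval_result` field marks whether all tests passed). Each script reads the shared TOML config through a `--config` argument. A typical run looks like the sketch below; the script file names and `config.toml` are illustrative assumptions, not fixed by this commit:

```bash
# hypothetical file names -- adjust to the actual script names in the repo
python step1_sample_prompt.py --config config.toml   # build prompts from APPS
python step1_sample.py --config config.toml          # sample code with vLLM
python step1_evaluate.py --config config.toml        # run the APPS test cases
```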
### Step2 Prepare preference code pairs
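Step 2 pairs each incorrect sample with its most similar correct sample for the same problem, which is the raw material for the error annotations described in the abstract. The pairing code is not part of this commit; the following is a minimal sketch of the idea, built on the `code_similarity` helper (CodeBLEU restricted to syntax and dataflow matches) and the `eval_result` flag from Step 1. The function name `make_pairs` and the output schema are hypothetical.

```python
# Sketch only: pair incorrect samples with their most similar correct sample.
from collections import defaultdict

from utils import code_similarity, extract_code, load_jsonl, save_jsonl


def make_pairs(evaluated_path, output_path):  # hypothetical helper
    samples = load_jsonl(evaluated_path)

    # Group extracted code by problem and by pass/fail status.
    by_problem = defaultdict(lambda: {"correct": [], "incorrect": []})
    for s in samples:
        code = extract_code(s["messages"][-1]["content"])
        key = "correct" if s["eval_result"] else "incorrect"
        by_problem[s["problem_id"]][key].append(code)

    pairs = []
    for problem_id, group in by_problem.items():
        if not group["correct"]:
            continue
        for bad in group["incorrect"]:
            # calc_codebleu returns a dict; "codebleu" is the aggregate score.
            best = max(group["correct"],
                       key=lambda good: code_similarity(good, bad)["codebleu"])
            pairs.append({"problem_id": problem_id, "incorrect": bad, "correct": best})

    save_jsonl(pairs, output_path)
```

The resulting pairs can then be diffed (e.g. with Difftastic, see the Environment section) to localize the edit that separates the wrong program from a working one.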
## Environment
Same as LLaMA-Factory (recommended versions):
| Mandatory | Minimum | Recommend |
| ------------ | ------- | --------- |
| python | 3.8 | 3.11 |
| torch | 1.13.1 | 2.4.0 |
| transformers | 4.41.2 | 4.43.4 |
| datasets | 2.16.0 | 2.20.0 |
| accelerate | 0.30.1 | 0.32.0 |
| peft | 0.11.1 | 0.12.0 |
| trl | 0.8.6 | 0.9.6 |

| Optional | Minimum | Recommend |
| ------------ | ------- | --------- |
| CUDA | 11.6 | 12.2 |
| deepspeed | 0.10.0 | 0.14.0 |
| vllm | 0.4.3 | 0.5.0 |
This project also requires ***[Difftastic](https://github.com/Wilfred/difftastic)***:
```bash
$ cargo install --locked difftastic
```
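
After installation, the `difft` binary compares two files structurally, which is how a wrong solution can be lined up against its most similar correct counterpart (the file names below are only an example):

```bash
difft incorrect_solution.py correct_solution.py
```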
\ No newline at end of file
# copy from codeparrot/apps_metric/testing_util.py
# https://huggingface.co/spaces/codeparrot/apps_metric/blob/main/testing_util.py
import json
import sys
import faulthandler
import platform
# used for debugging to time steps
from datetime import datetime
# to run the solution files we're using a timing based approach
import signal
import numpy as np
# for capturing the stdout
from io import StringIO
# used for testing the code that reads from input
from unittest.mock import patch, mock_open
from pyext import RuntimeModule
from enum import Enum
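
# Result codes that run_test appends for each test case (consumed downstream
# by the evaluation script):
#   -2    -> compilation error
#   -1    -> runtime error or timeout
#   True  -> output matched the expected output
#   False -> output did not match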
class CODE_TYPE(Enum):
    call_based = 0
    standard_input = 1


# stuff for setting up signal timer
class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    print("alarm went off")
    # return
    raise TimeoutException


signal.signal(signal.SIGALRM, timeout_handler)
timeout = 4  # seconds


# used to capture stdout as a list
# from https://stackoverflow.com/a/16571630/6416660
# alternative use redirect_stdout() from contextlib
class Capturing(list):
    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        # Make closing the StringIO a no-op
        self._stringio.close = lambda x: 1
        return self

    def __exit__(self, *args):
        self.extend(self._stringio.getvalue().splitlines())
        del self._stringio  # free up some memory
        sys.stdout = self._stdout


def run_test(sample, test=None, debug=False):
    """
    if test(generated_code) is not None it'll try to run the code.
    otherwise it'll just return an input and output pair.
    """
    # Disable functionalities that can make destructive changes to the test.
    reliability_guard()

    if debug:
        print(f"start = {datetime.now().time()}")

    try:
        in_outs = json.loads(sample["input_output"])
    except ValueError:
        in_outs = None
    if in_outs:
        if in_outs.get("fn_name") is None:
            which_type = CODE_TYPE.standard_input  # Standard input
            method_name = None
        else:
            which_type = CODE_TYPE.call_based  # Call-based
            method_name = in_outs["fn_name"]

    if debug:
        print(f"loaded input_output = {datetime.now().time()}")

    if test is None:
        return in_outs
    elif test is not None:
        results = []
        sol = "import sys\nimport time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
        if debug:
            print(f"loading test code = {datetime.now().time()}")

        if which_type == CODE_TYPE.call_based:
            sol += test
            if debug:
                print(f"sol = {sol}")
            signal.alarm(timeout)
            try:
                tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
                if "class Solution" not in test:
                    tmp = tmp_sol
                else:
                    tmp = tmp_sol.Solution()
                signal.alarm(0)
            except Exception as e:
                signal.alarm(0)
                if debug:
                    print(f"type 0 compilation error = {e}")
                results.append(-2)
                return results
            signal.alarm(0)

        elif which_type == CODE_TYPE.standard_input:
            # sol
            tmp_test = test.split("\n")

            new_test = []
            for x in tmp_test:
                if (not x.startswith("from ")) and (not x.startswith("import ")):
                    new_test.append("\t" + x + "\n")
                else:
                    new_test.append(x + "\n")
            tmp_test = new_test

            new_test = ""
            started = False
            for i in tmp_test:
                if i.startswith("\t") and not started:
                    new_test += "stdin = sys.stdin\nstdout = sys.stdout\n"
                    new_test += "def code():\n"
                    new_test += i
                    started = True
                elif started and ((i.startswith("from ")) or (i.startswith("import "))):
                    new_test += "\t" + i
                else:
                    new_test += i
            tmp_test = new_test

            sol += tmp_test
            if debug:
                print(f"sol = {sol}")
            method_name = "code"
            signal.alarm(timeout)
            try:
                tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol)
                tmp = tmp_sol
                signal.alarm(0)
            except Exception as e:
                signal.alarm(0)
                if debug:
                    print(f"type 1 compilation error = {e}")
                results.append(-2)
                return results
            signal.alarm(0)

        if debug:
            print(f"get method = {datetime.now().time()}")

        try:
            method = getattr(tmp, method_name)  # get_attr second arg must be str
        except:
            signal.alarm(0)
            e = sys.exc_info()
            print(f"unable to get function error = {e}")
            results.append(-2)
            return results

        for index, inputs in enumerate(in_outs["inputs"]):
            # JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
            try:
                if isinstance(inputs[0], dict):
                    inputs = [{int(k): v for k,v in inputs[0].items()}]
            except:
                True
            try:
                if isinstance(in_outs["outputs"][index], dict):
                    in_outs["outputs"][index] = [{int(k): v for k,v in in_outs["outputs"][index].items()}]
            except:
                True
            try:
                if isinstance(in_outs["outputs"][index][0], dict):
                    in_outs["outputs"][index] = [{int(k): v for k,v in in_outs["outputs"][index][0].items()}]
            except:
                True

            if debug:
                print(f"time: {datetime.now().time()} testing index = {index} inputs = {inputs}, {type(inputs)}. type = {which_type}")

            if which_type == CODE_TYPE.call_based:  # Call-based
                signal.alarm(timeout)
                faulthandler.enable()
                try:
                    output = method(*inputs)

                    # ground truth sequences are not tuples
                    if isinstance(output, tuple):
                        output = list(output)

                    tmp_result = output == in_outs["outputs"][index]
                    if isinstance(in_outs["outputs"][index], list) and in_outs["outputs"][index]:
                        tmp_result = tmp_result or (output == in_outs["outputs"][index][0])

                    # ground truth sequences are not tuples
                    try:
                        if isinstance(output[0], tuple):
                            tmp_result = tmp_result or ([list(x) for x in output] == in_outs["outputs"][index][0])
                    except:
                        True
                    results.append(tmp_result)

                    # reset the alarm
                    signal.alarm(0)
                except Exception as e:
                    signal.alarm(0)
                    faulthandler.disable()
                    if debug:
                        print(f"Standard input runtime error or time limit exceeded error = {e}")
                    results.append(-1)
                    continue
                faulthandler.disable()
                signal.alarm(0)
                if debug:
                    print(f"outputs = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")

            elif which_type == CODE_TYPE.standard_input:  # Standard input
                faulthandler.enable()
                signal.alarm(timeout)
                passed = False

                if isinstance(inputs, list):
                    inputs = "\n".join(inputs)
                if isinstance(in_outs['outputs'][index], list):
                    in_outs['outputs'][index] = "\n".join(in_outs['outputs'][index])

                with Capturing() as output:
                    try:
                        call_method(method, inputs)
                        # reset the alarm
                        signal.alarm(0)
                        passed = True
                    except Exception as e:
                        # runtime error or took too long
                        signal.alarm(0)
                        print(f"Call-based runtime error or time limit exceeded error = {repr(e)}{e}")
                        results.append(-1)
                    signal.alarm(0)

                if not passed:
                    if debug:
                        nl = "\n"
                        if not isinstance(inputs, list):
                            print(f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
                        else:
                            print(f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
                    continue

                if passed and debug:
                    print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}")

                if custom_compare_(output, in_outs['outputs'][index]):
                    tmp_result = True
                    results.append(tmp_result)
                    continue

                # ground truth sequences are expressed as lists not tuples
                if isinstance(output, tuple):
                    output = list(output)

                tmp_result = False
                try:
                    tmp_result = (output == [in_outs["outputs"][index]])
                    if isinstance(in_outs["outputs"][index], list):
                        tmp_result = tmp_result or (output == in_outs["outputs"][index])
                        if isinstance(output[0], str):
                            tmp_result = tmp_result or ([e.strip() for e in output] == in_outs["outputs"][index])
                except Exception as e:
                    if debug:
                        print(f"Failed check1 exception = {e}")
                    pass

                if tmp_result == True:
                    results.append(tmp_result)
                    continue

                # try one more time without \n
                if isinstance(in_outs["outputs"][index], list):
                    for tmp_index, i in enumerate(in_outs["outputs"][index]):
                        in_outs["outputs"][index][tmp_index] = i.split("\n")
                        in_outs["outputs"][index][tmp_index] = [x.strip() for x in in_outs["outputs"][index][tmp_index] if x]
                else:
                    in_outs["outputs"][index] = in_outs["outputs"][index].split("\n")
                    in_outs["outputs"][index] = list(filter(len, in_outs["outputs"][index]))
                    in_outs["outputs"][index] = list(map(lambda x:x.strip(), in_outs["outputs"][index]))

                try:
                    tmp_result = (output == [in_outs["outputs"][index]])
                    if isinstance(in_outs["outputs"][index], list):
                        tmp_result = tmp_result or (output == in_outs["outputs"][index])
                except Exception as e:
                    if debug:
                        print(f"Failed check2 exception = {e}")
                    pass

                if tmp_result == True:
                    results.append(tmp_result)
                    continue

                # try by converting the output into a split up list too
                if isinstance(output, list):
                    output = list(filter(len, output))

                if debug:
                    nl = "\n"
                    if not isinstance(inputs, list):
                        print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
                    else:
                        print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")

                if tmp_result == True:
                    results.append(tmp_result)
                    continue

                try:
                    tmp_result = (output == [in_outs["outputs"][index]])
                    if isinstance(in_outs["outputs"][index], list):
                        tmp_result = tmp_result or (output == in_outs["outputs"][index])
                except Exception as e:
                    if debug:
                        print(f"Failed check3 exception = {e}")
                    pass

                try:
                    output_float = [float(e) for e in output]
                    gt_float = [float(e) for e in in_outs['outputs'][index]]
                    tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
                except Exception as e:
                    pass
                try:
                    if isinstance(output[0], list):
                        output_float = [float(e) for e in output[0]]
                        gt_float = [float(e) for e in in_outs['outputs'][index][0]]
                        tmp_result = tmp_result or ((len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float))
                except Exception as e:
                    pass

                if tmp_result == True:
                    results.append(tmp_result)
                    continue

                # try by converting the stuff into split up list
                if isinstance(in_outs["outputs"][index], list):
                    for tmp_index, i in enumerate(in_outs["outputs"][index]):
                        in_outs["outputs"][index][tmp_index] = set(i.split())
                else:
                    in_outs["outputs"][index] = set(in_outs["outputs"][index].split())

                try:
                    tmp_result = (output == in_outs["outputs"][index])
                except Exception as e:
                    if debug:
                        print(f"Failed check4 exception = {e}")
                    continue

                if tmp_result == True:
                    results.append(tmp_result)
                    continue

                # try by converting the output into a split up list too
                if isinstance(output, list):
                    for tmp_index, i in enumerate(output):
                        output[tmp_index] = i.split()
                    output = list(filter(len, output))
                    for tmp_index, i in enumerate(output):
                        output[tmp_index] = set(i)
                else:
                    output = output.split()
                    output = list(filter(len, output))
                    output = set(output)

                try:
                    tmp_result = (set(frozenset(s) for s in output) == set(frozenset(s) for s in in_outs["outputs"][index]))
                except Exception as e:
                    if debug:
                        print(f"Failed check5 exception = {e}")

                # if they are all numbers, round so that similar numbers are treated as identical
                try:
                    tmp_result = tmp_result or (set(frozenset(round(float(t),3) for t in s) for s in output) ==\
                        set(frozenset(round(float(t),3) for t in s) for s in in_outs["outputs"][index]))
                except Exception as e:
                    if debug:
                        print(f"Failed check6 exception = {e}")

                if tmp_result == True and debug:
                    print("PASSED")

                results.append(tmp_result)

                if debug:
                    nl = "\n"
                    if not isinstance(inputs, list):
                        print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
                    else:
                        print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")

    return results


def custom_compare_(output, ground_truth):
    if isinstance(output, list):
        output_1 = "\n".join(output)
        if stripped_string_compare(output_1, ground_truth):
            return True

    if isinstance(output, list):
        output_2 = [o.lstrip().rstrip() for o in output]
        output_2 = "\n".join(output_2)
        if stripped_string_compare(output_2, ground_truth):
            return True

    return False


def stripped_string_compare(s1, s2):
    s1 = s1.lstrip().rstrip()
    s2 = s2.lstrip().rstrip()
    return s1 == s2


def call_method(method, inputs):
    if isinstance(inputs, list):
        inputs = "\n".join(inputs)

    inputs_line_iterator = iter(inputs.split("\n"))

    # sys.setrecursionlimit(10000)

    # @patch('builtins.input', side_effect=inputs.split("\n"))
    @patch('builtins.open', mock_open(read_data=inputs))
    @patch('sys.stdin', StringIO(inputs))
    @patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
    @patch('sys.stdin.readlines', lambda *args: inputs.split("\n"))
    @patch('sys.stdin.read', lambda *args: inputs)
    # @patch('sys.stdout.write', print)
    def _inner_call_method(_method):
        try:
            return _method()
        except SystemExit as e:
            pass
        finally:
            pass

    return _inner_call_method(method)


def reliability_guard(maximum_memory_bytes=None):
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)
    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.
    """
    if maximum_memory_bytes is not None:
        import resource
        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
        if not platform.uname().system == "Darwin":
            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))

    faulthandler.disable()

    import builtins
    builtins.exit = None
    builtins.quit = None

    import os
    os.environ["OMP_NUM_THREADS"] = "1"
    os.kill = None
    os.system = None
    os.putenv = None
    os.remove = None
    os.removedirs = None
    os.rmdir = None
    os.fchdir = None
    os.setuid = None
    os.fork = None
    os.forkpty = None
    os.killpg = None
    os.rename = None
    os.renames = None
    os.truncate = None
    os.replace = None
    os.unlink = None
    os.fchmod = None
    os.fchown = None
    os.chmod = None
    os.chown = None
    os.chroot = None
    os.fchdir = None
    os.lchflags = None
    os.lchmod = None
    os.lchown = None
    os.getcwd = None
    os.chdir = None

    import shutil
    shutil.rmtree = None
    shutil.move = None
    shutil.chown = None

    import subprocess
    subprocess.Popen = None  # type: ignore

    __builtins__["help"] = None

    import sys
    sys.modules["ipdb"] = None
    sys.modules["joblib"] = None
    sys.modules["resource"] = None
    sys.modules["psutil"] = None
    sys.modules["tkinter"] = None
\ No newline at end of file
# copy from codeparrot/apps_metric/utils.py
# https://huggingface.co/spaces/codeparrot/apps_metric/blob/main/utils.py
import json
import multiprocessing
import numpy as np
from typing import Dict
from datasets import load_dataset
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
from step1_apps_test import run_test
from utils import extract_code, read_config, load_jsonl, save_jsonl
TIMEOUT = 10
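
# run_test (step1_apps_test) reports -2 for compilation errors, -1 for runtime
# errors or timeouts, and True/False per test case; a sample counts as passing
# only if every entry is > 0 (see test_generation below).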
def check_correctness(sample, generation, timeout, debug=False):
    """Check correctness of code generation with a global timeout.
    The global timeout is to catch some extreme/rare cases not handled by the timeouts
    inside `run_test`"""

    def _temp_run(sample, generation, debug, result):
        result.append(run_test(sample, test=generation, debug=debug))

    manager = multiprocessing.Manager()
    result = manager.list()
    p = multiprocessing.Process(target=_temp_run, args=(sample, generation, debug, result))
    p.start()
    p.join(timeout=timeout + 1)
    if p.is_alive():
        p.kill()
    if not result:
        in_outs = json.loads(sample["input_output"])
        # consider that all tests failed
        result = [[-1 for i in range(len(in_outs["inputs"]))]]
        if debug:
            print(f"global timeout")
    return result[0]


def test_generation(args, debug=False):
    apps_item, code_sample = args
    message = code_sample["messages"][-1]
    assert message["role"] == "assistant"
    code = extract_code(message["content"])

    curr_res = [-2]
    try:
        curr_res = check_correctness(apps_item, code, timeout=TIMEOUT, debug=debug)
        if debug:
            print(f"\nSuccessful compilation of task {code}!")
        fixed = []
        for e in curr_res:
            if isinstance(e, np.ndarray):
                e = e.item(0)
            if isinstance(e, np.bool_):
                e = bool(e)
            fixed.append(e)
        curr_res = fixed
        if not np.all(curr_res):
            if debug:
                print(curr_res)
                print(f"Results were not True for all test cases")
    except Exception as e:
        if debug:
            print(f"Compilation failed, test framework exception = {repr(e)}{e}\n")
    finally:
        assert isinstance(curr_res, list)

    problem_results = np.asarray(curr_res)
    # Cast to a plain bool so the result stays JSON-serializable in save_jsonl.
    code_sample["eval_result"] = bool(np.all(problem_results > 0))
    return code_sample


def evaluate_code_samples(code_samples: list, dataset_path: str):
    apps_eval = load_dataset(dataset_path)

    def get_apps_item(item):
        problem_id = item["problem_id"]
        split, idx = problem_id.split('_')
        # get corresponding samples from APPS dataset
        return apps_eval[split][int(idx)]

    args = [(get_apps_item(sample), sample) for sample in code_samples]
    cpu_num = multiprocessing.cpu_count()
    results = process_map(test_generation, args, max_workers=cpu_num)
    return results


def evaluate(code_sample_path, dataset_path, output_path):
    code_samples = load_jsonl(code_sample_path)
    results = evaluate_code_samples(code_samples, dataset_path)
    save_jsonl(results, output_path)


if __name__ == "__main__":
    cfg = read_config()
    evaluate(
        cfg["sample"]["sample_result_path"],
        cfg["apps"],
        cfg["evaluate"]["evaluate_result_path"],
    )
\ No newline at end of file
from datasets import load_dataset
import json
from utils import read_config, save_jsonl
from transformers import AutoTokenizer
def mk_prompt(doc) -> list:
    prompt = "Write Python code to solve competitive programming problems in a markdown code block."
    starter_code = None if len(doc["starter_code"]) == 0 else doc["starter_code"]
    try:
        input_output = json.loads(doc["input_output"])
        fn_name = None if not input_output.get("fn_name") else input_output["fn_name"]
    except ValueError:
        fn_name = None

    prompt += "\nQUESTION:\n"
    prompt += doc["question"]
    if starter_code:
        prompt += starter_code
    if not fn_name:
        prompt += "\nUse Standard Input format"
    else:
        prompt += "\nUse Call-Based format"
    prompt += "\nPlease generate the code in a ```python markdown block, ensuring to include the closing ``` at the end."

    conversation = [{"role": "user", "content": prompt}]
    return conversation


def mk_sample_prompt(model_path, apps_path, output_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token

    prompts = []
    for split in ["train", "test"]:
        ds = load_dataset(apps_path, split=split)
        for sample in ds:
            problem_id = split + "_" + str(sample["problem_id"])
            # Filter problems without input_output
            try:
                json.loads(sample["input_output"])
            except ValueError:
                print(f"Skipping {problem_id}: Invalid JSON in input_output")
                continue
            prompt = mk_prompt(sample)
            # Filter long prompts
            chat_text = tokenizer.apply_chat_template(prompt, tokenize=False)
            tokenized_prompt = tokenizer.encode(chat_text)
            if len(tokenized_prompt) > (4096 - 512):
                print(
                    f"Skipping {problem_id}: Token length {len(tokenized_prompt)} exceeds limit"
                )
                continue
            prompts.append(dict(problem_id=problem_id, messages=prompt))

    print(f"size of dataset: {len(prompts)}")
    save_jsonl(prompts, output_path)


if __name__ == "__main__":
    cfg = read_config()
    mk_sample_prompt(cfg["model"], cfg["apps"], cfg["sample"]["sample_prompt_path"])
\ No newline at end of file
from utils_vllm import vllm_inference
from utils import read_config
cfg = read_config()
vllm_inference(
    cfg["model"],
    cfg["sample"]["sample_prompt_path"],
    cfg["sample"]["sample_result_path"],
    cfg["sample"]["sampling_params"],
)
\ No newline at end of file
from pathlib import Path
import json
import os
import re
from codebleu import calc_codebleu
import tomllib
def load_jsonl(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]


def load_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def save_jsonl(data, file_path):
    with open(file_path, "w", encoding="utf-8") as f:
        for item in data:
            f.write(json.dumps(item) + "\n")


codeblock_pattern = re.compile(r"```python(.+?)```", flags=re.DOTALL)


def extract_code(text: str):
    codes = [match.strip() for match in re.findall(codeblock_pattern, text)]
    if len(codes) > 0:
        code = "\n".join(codes)
        return code
    else:
        return ""


def code_similarity(ref, pred):
    return calc_codebleu([ref], [pred], lang="python", weights=(0, 0.5, 0.5, 0))


def read_config():
    import argparse

    argparser = argparse.ArgumentParser()
    argparser.add_argument("--config", type=str)
    args = argparser.parse_args()
    with open(args.config, "rb") as f:
        return tomllib.load(f)
\ No newline at end of file
from vllm import LLM, SamplingParams
import os
import multiprocessing
from functools import partial
from utils import load_jsonl, save_jsonl
def worker(cuda_device, prompts, model, sampling_params):
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_device
    llm = LLM(model=model, seed=42, max_model_len=8 * 1024)
    tokenizer = llm.get_tokenizer()
    stop_tokens = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
    print(f"SUCCESS: load llm {model} on cuda {cuda_device}")

    vllm_sampling_params = SamplingParams(
        n=sampling_params['n'],
        temperature=sampling_params['temperature'],
        top_p=0.95,
        max_tokens=sampling_params['max_tokens'],
        stop_token_ids=stop_tokens,
    )
    text_prompts = [
        tokenizer.apply_chat_template(item["messages"], tokenize=False, add_generation_prompt=True)
        for item in prompts
    ]
    outputs = llm.generate(text_prompts, sampling_params=vllm_sampling_params, use_tqdm=False)
    for item, output in zip(prompts, outputs):
        for response in output.outputs:
            generated_text = response.text
            item["messages"].append({"role": "assistant", "content": generated_text})
    # Return the annotated prompts so the parent process can collect them;
    # in-place mutation is not visible across process boundaries.
    return prompts


def vllm_inference(model_path, prompt_path, output_path, sampling_params):
    prompts = load_jsonl(prompt_path)

    # Respect slurm's GPU allocation
    cuda_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
    gpu_num = len(cuda_devices)

    # Split the data across GPUs round-robin
    sub_prompts = [[] for _ in range(gpu_num)]
    for i, prompt in enumerate(prompts):
        sub_prompts[i % gpu_num].append(prompt)

    args = list(zip(cuda_devices, sub_prompts))
    worker_llm = partial(worker, model=model_path, sampling_params=sampling_params)
    with multiprocessing.Pool(gpu_num) as pool:
        results = pool.starmap(worker_llm, args)

    # Collect the completed conversations returned by the workers.
    completed = [item for sub_result in results for item in sub_result]
    print(f"Execution Status: {len(completed) == len(prompts)}")
    save_jsonl(completed, output_path)
\ No newline at end of file