Implement a code execution verifier for programming benchmarks. Given a candidate code solution and a list of test cases (input/expected output pairs), execute the code safely and verify correctness.
import contextlib
import io
import sys
import threading
from typing import Dict, List, Tuple
def _run_with_timeout(func, args, timeout):
    """Call ``func(*args)`` in a daemon thread, giving up after ``timeout`` seconds.

    Returns the call's result, re-raises whatever the call raised (including
    ``SystemExit``), or raises ``TimeoutError`` if the call is still running
    when the deadline passes. ``timeout=None`` waits indefinitely.

    A Python thread cannot be killed, so a timed-out call keeps running in the
    background; it is a daemon thread and will not block interpreter exit.
    """
    outcome = {}

    def _worker():
        try:
            outcome["value"] = func(*args)
        except BaseException as exc:
            # Carried back to the calling thread and re-raised there.
            outcome["error"] = exc

    thread = threading.Thread(target=_worker, daemon=True)
    thread.start()
    thread.join(timeout)
    if thread.is_alive():
        raise TimeoutError(f"timed out after {timeout} seconds")
    if "error" in outcome:
        raise outcome["error"]
    return outcome["value"]


def code_execution_verifier(
    code: str,
    test_cases: List[Tuple[str, str]],
    function_name: str = "solution",
    timeout: float = 5.0
) -> Dict:
    """Run candidate ``code`` against ``test_cases`` and report the results.

    The candidate source is executed once to define ``function_name``; the
    function is then called once per test case.

    Args:
        code: Python source expected to define ``function_name``.
        test_cases: ``(input, expected)`` string pairs. ``input`` is a Python
            expression; if it evaluates to a tuple the elements are passed as
            positional arguments, otherwise the value is the single argument.
            A test passes when ``expected`` equals ``str(result)`` or
            ``repr(result)``.
        function_name: Name of the callable to look up after executing ``code``.
        timeout: Wall-clock limit in seconds, applied separately to executing
            ``code`` and to each call. ``None`` or a non-positive value
            disables the limit.

    Returns:
        On success a dict with ``passed``, ``total``, ``pass_rate`` and
        ``results`` (one dict per test with ``input``, ``expected``,
        ``actual`` and ``passed``). If ``code`` cannot be executed or does not
        define ``function_name``, the dict additionally has ``compile_error``
        and ``results`` is empty.

    NOTE(security): ``exec``/``eval`` run in-process. Output capture and the
    timeout guard against accidents, not against hostile code; run untrusted
    submissions in a separate sandboxed process.
    """
    total = len(test_cases)
    if timeout is not None and timeout <= 0:
        # The limit used to be ignored entirely; keep "no limit" for callers
        # that pass 0 or a negative value.
        timeout = None

    def _failure(message):
        # Same keys as the success result so callers can read it uniformly.
        return {
            "compile_error": message,
            "passed": 0,
            "total": total,
            "pass_rate": 0,
            "results": []
        }

    namespace = {}
    try:
        # Capture top-level prints so candidate code cannot pollute our stdout.
        with contextlib.redirect_stdout(io.StringIO()):
            _run_with_timeout(exec, (code, namespace), timeout)
    except (Exception, SystemExit) as e:
        # SystemExit is listed explicitly: sys.exit() in candidate code must
        # not terminate the verifier.
        return _failure(str(e) or type(e).__name__)

    func = namespace.get(function_name)
    if func is None:
        return _failure(f"Function '{function_name}' not found")

    results = []
    passed = 0
    for inp, expected in test_cases:
        try:
            with contextlib.redirect_stdout(io.StringIO()):
                # NOTE(security): eval on test input; only use trusted test
                # cases. The empty locals dict keeps the expression from
                # reading this function's local variables.
                args = eval(inp, globals(), {})
                if not isinstance(args, tuple):
                    args = (args,)
                result = _run_with_timeout(func, args, timeout)
                # Both conversions stay inside the guard because a candidate's
                # __repr__/__str__ may raise or print.
                actual = repr(result)
                as_text = str(result)
        except (Exception, SystemExit) as e:
            results.append({
                "input": inp,
                "expected": expected,
                "actual": f"ERROR: {str(e) or type(e).__name__}",
                "passed": False
            })
            continue

        test_passed = as_text == expected or actual == expected
        if test_passed:
            passed += 1
        results.append({
            "input": inp,
            "expected": expected,
            "actual": actual,
            "passed": test_passed
        })

    return {
        "passed": passed,
        "total": total,
        "pass_rate": round(passed / total, 4) if test_cases else 0,
        "results": results
    }