class ACEProcessAccuracy(MetricBase):
    """The ACEBench process accuracy metric.

    Renders every ``tool_use`` block in the solution trajectory into the
    ACEBench call format, e.g. ``[func(arg1='dfd', arg2=44)]``, and checks
    that each configured milestone call appears among them.
    """

    def __init__(
        self,
        mile_stone: list[str],
    ) -> None:
        """Initialize the AceBench process accuracy metric.

        Args:
            mile_stone (`list[str]`):
                The milestone tool calls, each already rendered in the
                ACEBench format, e.g. ``[func(arg1='dfd', arg2=44)]``.
        """
        super().__init__(
            name="process_accuracy",
            metric_type=MetricType.NUMERICAL,
            description="The AceBench Agent eval process accuracy metric.",
        )
        self.mile_stone = mile_stone

    def __call__(
        self,
        solution: SolutionOutput,
    ) -> MetricResult:
        """Calculate the metric result.

        Args:
            solution (`SolutionOutput`):
                The solution whose ``trajectory`` (a sequence of content
                blocks) is scanned for the milestone tool calls.

        Returns:
            `MetricResult`:
                Result ``1`` ("Success") when every milestone is present,
                otherwise ``0`` with the first missing milestone named in
                the message.
        """
        # Turn the tool use block sequence into ACEBench format
        # e.g. [func(arg1='dfd', arg2=44)]
        gathered_trajectory = []
        for tool_call in solution.trajectory:
            if tool_call.get("type") != "tool_use":
                continue
            function_name = tool_call.get("name")
            # Guard against a tool_use block without an "input" field:
            # the original `tool_call.get("input")` returns None and the
            # subsequent `.items()` call would raise AttributeError.
            kwargs = tool_call.get("input") or {}
            # String values are single-quoted, everything else uses repr-free
            # f-string formatting, matching the ACEBench milestone format.
            kwargs_str = ", ".join(
                f"{key}='{value}'" if isinstance(value, str) else f"{key}={value}"
                for key, value in kwargs.items()
            )
            gathered_trajectory.append(f"[{function_name}({kwargs_str})]")

        for stone in self.mile_stone:
            if stone not in gathered_trajectory:
                return MetricResult(
                    name=self.name,
                    result=0,
                    message=f"Error: Missing milestone '{stone}' in "
                    "the given trajectory.",
                )
        return MetricResult(
            name=self.name,
            result=1,
            message="Success",
        )
def __init__(
    self,
    state: list[dict],
) -> None:
    """Initialize the accuracy metric.

    Args:
        state (`list[dict]`):
            The ground-truth state, a list of dicts whose key/value pairs
            the solution output is compared against.
    """
    # Pass the base-class arguments by keyword for consistency with the
    # other AceBench metrics (e.g. the process accuracy metric), which
    # call ``super().__init__(name=..., metric_type=..., description=...)``.
    super().__init__(
        name="accuracy",
        metric_type=MetricType.NUMERICAL,
        description="The AceBench Agent eval accuracy metric.",
    )
    self.state = state
def __call__(
    self,
    solution: SolutionOutput,
) -> MetricResult:
    """Calculate the metric result by comparing the solution output
    against the ground-truth state.

    Both ``self.state`` and ``solution.output`` are flattened into single
    dicts; every ground-truth key must be present in the output with an
    equal value.

    Args:
        solution (`SolutionOutput`):
            The solution whose ``output`` (a list of dicts) is compared
            against the ground-truth state.

    Returns:
        `MetricResult`:
            Result ``1`` when all ground-truth keys match, otherwise ``0``
            with the first mismatching key in the message.

    Raises:
        ValueError:
            If ``solution.output`` is not a list, or if the output is
            missing keys that the ground-truth state contains.
    """
    # Check if the solution matches the ground truth
    if not isinstance(solution.output, list):
        # NOTE: the original message said "Ground truth state must be a
        # list.", but this condition checks the *solution output*.
        raise ValueError("Solution output must be a list.")

    # Handle the typos in ACEBench dataset
    gathered_state = {}
    for item in self.state:
        for key, value in item.items():
            # NOTE(review): str.replace rewrites *every* occurrence, not
            # just the suffix — assumed harmless for the ACEBench key
            # names; confirm against the dataset.
            if key.endswith("API"):
                key = key.replace("API", "Api")
            elif key.endswith("rpi"):
                key = key.replace("pi", "Api")
            gathered_state[key] = value

    gathered_output = {}
    for item in solution.output:
        for key, value in item.items():
            gathered_output[key] = value

    # Every ground-truth key must exist in the output before comparing.
    if not set(gathered_state.keys()).issubset(gathered_output.keys()):
        raise ValueError(
            "Missing keys in solution output compared to state, "
            f"ground truth keys: {gathered_state.keys()}, "
            f"solution keys: {gathered_output.keys()}",
        )

    for key, value in gathered_state.items():
        if value != gathered_output.get(key):
            return MetricResult(
                name=self.name,
                result=0,
                message=(
                    f"Error: Mismatch in key '{key}':"
                    f"\n{value}\n{gathered_output.get(key)}"
                ),
            )
    return MetricResult(
        name=self.name,
        result=1,
        message="Success: All keys match",
    )