agentscope.evaluate._ace_benchmark._ace_benchmark 源代码

# -*- coding: utf-8 -*-
"""The ACE benchmark class in agentscope. The code is implemented with
reference to the `ACEBench <https://github.com/ACEBench/ACEBench>`_
under the MIT license."""
import json
import os
from typing import Generator

import json5
import requests
from tqdm import tqdm

from ._ace_metric import ACEAccuracy, ACEProcessAccuracy
from ._ace_tools_zh import ACEPhone
from .._benchmark_base import BenchmarkBase
from .._task import Task



[文档]
class ACEBenchmark(BenchmarkBase):
    """The ACE benchmark for evaluating AI agents."""

    data_dir_url: str = (
        "https://raw.githubusercontent.com/ACEBench/ACEBench/main/data_all"
    )
    """The URL to the data dir"""

    data_subdir: list[str] = [
        # "data_en",  # TODO: enable English version
        "data_zh",
    ]

    ground_truth_dir: str = "possible_answer"

    data_files: list[str] = [
        "data_agent_multi_step.json",
        "data_agent_multi_turn.json",
        # "data_normal_atom_bool.json",
        # "data_normal_atom_enum.json",
        # "data_normal_atom_list.json",
        # "data_normal_atom_number.json",
        # "data_normal_atom_object_deep.json",
        # "data_normal_atom_object_short.json",
        #
        # "data_normal_multi_turn_user_adjust.json",
        # "data_normal_multi_turn_user_switch.json",
        #
        # "data_normal_preference.json",
        # "data_normal_similar_api.json",
        # "data_normal_single_turn_parallel_function.json",
        # "data_normal_single_turn_single_function.json",
        #
        # "data_special_error_param.json",
        # "data_special_incomplete.json",
        # "data_special_irrelevant.json",
    ]
    """The data filenames"""


[文档]
    def __init__(
        self,
        data_dir: str,
    ) -> None:
        """Initialize the ACEBenchmark

        Args:
            data_dir (`str`):
                The directory where the dataset is downloaded and saved.
        """
        super().__init__(
            name="ACEBench",
            description="The ACE benchmark for evaluating AI agents.",
        )

        self.data_dir = os.path.abspath(data_dir)

        if os.path.exists(data_dir) and not os.path.isdir(data_dir):
            raise RuntimeError(
                f"The data_dir `{data_dir}` is not a valid directory path.",
            )

        os.makedirs(data_dir, exist_ok=True)

        if not self._verify_data():
            self._download_data()

        self.dataset = self._load_data()


    def _load_data(self) -> list[dict]:
        """Load the dataset from the data directory."""
        dataset = []
        for subdir in self.data_subdir:
            for filename in self.data_files:
                file_path = os.path.join(self.data_dir, subdir, filename)

                gt_path = os.path.join(
                    self.data_dir,
                    subdir,
                    self.ground_truth_dir,
                    filename,
                )
                gt_dataset = {}
                with open(gt_path, "r", encoding="utf-8") as gt_file:
                    for line in gt_file:
                        gt_data = json5.loads(line)
                        gt_dataset[gt_data["id"]] = gt_data

                with open(file_path, "r", encoding="utf-8") as f:
                    for line in f:
                        data = json5.loads(line)
                        gt = gt_dataset[data["id"]]
                        gt.pop("id", None)
                        data["ground_truth"] = gt["ground_truth"]
                        data["mile_stone"] = gt["mile_stone"]
                        data["language"] = subdir.rsplit(
                            "_",
                            maxsplit=1,
                        )[-1]
                        data["tags"] = {
                            "language": data["language"],
                            "category": filename.split(
                                ".",
                                maxsplit=1,
                            )[0].removeprefix(
                                "data_",
                            ),
                        }
                        dataset.append(data)

        return dataset

    def _verify_data(self) -> bool:
        """Verify the data completeness and integrity."""
        for subdir in self.data_subdir:
            for filename in self.data_files:
                file_path = os.path.join(self.data_dir, subdir, filename)
                if not os.path.exists(file_path):
                    return False

                gt_path = os.path.join(
                    self.data_dir,
                    subdir,
                    self.ground_truth_dir,
                    filename,
                )
                if not os.path.exists(gt_path):
                    return False

        return True

    def _download_data(self) -> None:
        """Download the data from the URL"""
        for subdir in self.data_subdir:
            subdir_path = os.path.join(self.data_dir, subdir)
            subdir_gt_path = os.path.join(subdir_path, self.ground_truth_dir)
            os.makedirs(subdir_path, exist_ok=True)
            os.makedirs(subdir_gt_path, exist_ok=True)
            for filename in tqdm(
                self.data_files,
                desc=f"Downloading {subdir}",
            ):
                response = requests.get(
                    f"{self.data_dir_url}/{subdir}/{filename}",
                )
                response.raise_for_status()
                with open(os.path.join(subdir_path, filename), "wb") as f:
                    f.write(response.content)

                gt_response = requests.get(
                    f"{self.data_dir_url}/{subdir}/"
                    f"{self.ground_truth_dir}/{filename}",
                )
                gt_response.raise_for_status()
                with open(os.path.join(subdir_gt_path, filename), "wb") as f:
                    f.write(gt_response.content)

    @staticmethod
    def _data_to_task(item: dict) -> Task:
        """Convert a dataset item to a Task object."""
        # Start the simulated phone and load initial configuration
        ace_phone = ACEPhone()
        ace_phone.load_initial_config(item["initial_config"])

        # Obtain tool functions
        tools: list[tuple] = []
        for function_schema in item["function"]:
            name = function_schema["name"]

            # Handle the schema differences
            formatted_schema = json.loads(
                json.dumps(
                    function_schema,
                ).replace(
                    '"type": "dict"',
                    '"type": "object"',
                ),
            )

            tool_function = ace_phone.get_tool_function(name)
            tools.append(
                (
                    tool_function,
                    {
                        "type": "function",
                        "function": formatted_schema,
                    },
                ),
            )

        return Task(
            id=item["id"],
            input=item["question"],
            ground_truth={
                "state": item["ground_truth"],
                "mile_stone": item.get("mile_stone", []),
            },
            tags=item.get("tags", {}),
            metrics=[
                ACEAccuracy(item["ground_truth"]),
                ACEProcessAccuracy(item["mile_stone"]),
            ],
            metadata={
                # The phone is used to extract the final state after finishing
                # the task.
                "phone": ace_phone,
                # The provided tools for this task, used to equip the agent
                "tools": tools,
            },
        )

    def __iter__(self) -> Generator[Task, None, None]:
        """Iterate over the benchmark."""
        for item in self.dataset:
            yield self._data_to_task(item)

    def __getitem__(self, index: int) -> Task:
        """Get a task by index."""
        return self._data_to_task(self.dataset[index])

    def __len__(self) -> int:
        """Get the length of the benchmark."""
        return len(self.dataset)