agentscope.evaluate

The evaluation module in AgentScope.

class BenchmarkBase[source]

Bases: ABC

The base class for benchmark evaluation.

__init__(name, description)[source]

Initialize the benchmark.

Parameters:
  • name (str) – The name of the benchmark.

  • description (str) – A brief description of the benchmark.

Return type:

None

name: str

The name of the benchmark.

description: str

The description of the benchmark.

class EvaluatorBase[source]

Bases: object

The class that runs the evaluation process.

__init__(name, benchmark, n_repeat, storage)[source]

Initialize the evaluator.

Parameters:
  • name (str) – The name of this evaluator.

  • benchmark (BenchmarkBase) – A benchmark instance inheriting from BenchmarkBase that defines the evaluation dataset.

  • n_repeat (int) – How many times to repeat the evaluation for each task.

  • storage (EvaluatorStorageBase) – An instance of a subclass of EvaluatorStorageBase that supports storing and loading solution outputs and evaluation results.

Return type:

None

abstract async run(solution)[source]

Run the evaluation and return the results.

Parameters:

solution (Callable[[Task, Callable], Coroutine[Any, Any, SolutionOutput]]) – An async function that takes a Task instance and a pre-hook as input and returns a SolutionOutput instance.

Return type:

None

async aggregate()[source]

Aggregate the evaluation results and save an overall result.

Return type:

None

class RayEvaluator[source]

Bases: EvaluatorBase

The ray-based evaluator that supports distributed and parallel evaluation.

__init__(name, benchmark, n_repeat, storage, n_workers)[source]

Initialize the evaluator.

Parameters:
Return type:

None

run_evaluation = <ray.remote_function.RemoteFunction object>
run_solution = <ray.remote_function.RemoteFunction object>
async run(solution)[source]

Run the ray-based distributed and parallel evaluation, and get the results.

Parameters:

solution (Callable[[Task], SolutionOutput]) – A sync or async function that takes a Task instance as input and returns a SolutionOutput instance.

Return type:

None

class GeneralEvaluator[source]

Bases: EvaluatorBase

The general evaluator that supports users in debugging their evaluation.

__init__(name, benchmark, n_repeat, storage, n_workers)[source]

Initialize the evaluator.

Parameters:
Return type:

None

run_evaluation(task, repeat_id, solution_output)[source]

Run the evaluation for a task and solution result.

Parameters:
Return type:

None

async run_solution(repeat_id, task, solution)[source]

Generate a solution to a task and evaluate.

Parameters:
Return type:

None

async run(solution)[source]

Run the evaluation and get the results.

Parameters:

solution (Callable[[Task, Callable], Coroutine[Any, Any, SolutionOutput]]) – An async function that takes a Task instance and a pre-print hook function as input, and returns a SolutionOutput instance.

Return type:

None

class MetricBase[source]

Bases: ABC

The base class for metrics in evaluation.

__init__(name, metric_type, description=None, categories=None)[source]

Initialize the metric object.

Parameters:
  • name (str) – The name of the metric.

  • metric_type (MetricType) – The type of the metric, can be either “category” or “numerical”, which will determine how to display the result.

  • description (str) – The description of the metric.

  • categories (list[str] | None, optional) – The candidate categories. If metric_type is “category”, the categories must be provided, otherwise it should be None.

Return type:

None

abstract __call__(*args, **kwargs)[source]

The call function to calculate the metric result.

Parameters:
  • args (Any)

  • kwargs (Any)

Return type:

MetricResult

class MetricResult[source]

Bases: DictMixin

The result of a metric.

name: str

The metric name.

result: str | float | int

The metric result.

created_at: str

The timestamp when the metric result was created.

message: str | None

An optional message for the metric result, can be used to provide additional information or context about the result.

metadata: dict[str, str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject]] | None = None

Optional metadata for the metric result, can be used to store additional information related to the metric result.

__init__(name, result, created_at=<factory>, message=<factory>, metadata=None)
Parameters:
  • name (str)

  • result (str | float | int)

  • created_at (str)

  • message (str | None)

  • metadata (dict[str, str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject]] | None)

Return type:

None

class MetricType[source]

Bases: str, Enum

The metric type enum.

CATEGORY = 'category'

The metric result is a category, e.g. “pass” or “fail”.

NUMERICAL = 'numerical'

The metric result is a numerical value, e.g. 0.95 or 100.

__new__(value)
class EvaluatorStorageBase[source]

Bases: object

Used to store the solution results and evaluation results to support resuming the evaluation process.

abstract save_solution_result(task_id, repeat_id, output, **kwargs)[source]

Save the solution result.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

  • output (SolutionOutput) – The solution output to be saved.

  • kwargs (Any)

Return type:

None

abstract get_evaluation_result(task_id, repeat_id, metric_name)[source]

Get the evaluation result by the given task id and repeat id

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

  • metric_name (str) – The metric name.

Returns:

The evaluation result for the given task and repeat ID.

Return type:

MetricResult

abstract save_evaluation_result(task_id, repeat_id, evaluation, **kwargs)[source]

Save the evaluation result.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

  • evaluation (MetricResult) – The evaluation result to be saved.

  • kwargs (Any)

Return type:

None

abstract get_solution_result(task_id, repeat_id, **kwargs)[source]

Get the solution result for the given task and repeat id.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

  • kwargs (Any)

Returns:

The solution output for the given task and repeat ID.

Return type:

SolutionOutput

abstract solution_result_exists(task_id, repeat_id)[source]

Check if the solution for the given task and repeat is finished.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

Returns:

True if the solution result file exists, False otherwise.

Return type:

bool

abstract evaluation_result_exists(task_id, repeat_id, metric_name)[source]

Check if the evaluation result for the given solution and metric is finished.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

  • metric_name (str) – The name of the metric.

Returns:

True if the evaluation result file exists, False otherwise.

Return type:

bool

abstract save_aggregation_result(aggregation_result, **kwargs)[source]

Save the aggregation result.

Parameters:
  • aggregation_result (dict) – A dictionary containing the aggregation result.

  • kwargs (Any)

Return type:

None

abstract aggregation_result_exists(**kwargs)[source]

Check if the aggregation result exists

Returns:

True if the aggregation result file exists.

Return type:

bool

Parameters:

kwargs (Any)

abstract save_evaluation_meta(meta_info)[source]

Save the evaluation meta information.

Parameters:

meta_info (dict) – A dictionary containing the meta information.

Return type:

None

abstract get_agent_pre_print_hook(task_id, repeat_id)[source]

Get a pre-print hook function for the agent to save the agent printing in the evaluation storage.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

Returns:

A hook function that takes an AgentBase instance and a keyword arguments dictionary as input, saving the agent’s printing Msg into the evaluation storage.

Return type:

Callable[[AgentBase, dict], None]

class FileEvaluatorStorage[source]

Bases: EvaluatorStorageBase

File system based evaluator storage, providing methods to save and retrieve evaluation results. So that the evaluation process can be resumed from the last saved state.

The files are organized in a directory structure: - save_dir/

  • evaluation_result.json

  • evaluation_meta.json

  • {task_id}/
    • {repeat_id}/
      • solution.json

      • evaluation/
        • {metric_name}.json

SOLUTION_FILE_NAME = 'solution.json'
EVALUATION_DIR_NAME = 'evaluation'
EVALUATION_RESULT_FILE = 'evaluation_result.json'
EVALUATION_META_FILE = 'evaluation_meta.json'
AGENT_PRINTING_LOG = 'logging.txt'
__init__(save_dir)[source]

Initialize the file evaluator storage.

Parameters:

save_dir (str)

Return type:

None

save_solution_result(task_id, repeat_id, output, **kwargs)[source]

Save the solution result.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

  • output (SolutionOutput) – The solution output to be saved.

  • kwargs (Any)

Return type:

None

save_evaluation_result(task_id, repeat_id, evaluation, **kwargs)[source]

Save the evaluation result.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

  • evaluation (MetricResult) – The evaluation result to be saved.

  • kwargs (Any)

Return type:

None

get_evaluation_result(task_id, repeat_id, metric_name)[source]

Get the evaluation result by the given task id and repeat id

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

  • metric_name (str) – The metric name.

Returns:

The evaluation result for the given task and repeat ID.

Return type:

MetricResult

get_solution_result(task_id, repeat_id, **kwargs)[source]

Get the solution result for the given task and repeat id from the file system.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

  • kwargs (Any)

Raises:

FileNotFoundError – If the solution result file does not exist for the given task and repeat ID.

Returns:

The solution output for the given task and repeat ID.

Return type:

SolutionOutput

solution_result_exists(task_id, repeat_id)[source]

Check if the solution for the given task and repeat is finished.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

Returns:

True if the solution result file exists, False otherwise.

Return type:

bool

evaluation_result_exists(task_id, repeat_id, metric_name)[source]

Check if the evaluation result for the given solution and metric is finished.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

  • metric_name (str) – The name of the metric.

Returns:

True if the evaluation result file exists, False otherwise.

Return type:

bool

save_aggregation_result(aggregation_result, **kwargs)[source]

Save the aggregation result.

Parameters:
  • aggregation_result (dict) – A dictionary containing the aggregation result.

  • kwargs (Any)

Return type:

None

aggregation_result_exists(**kwargs)[source]

Check if the aggregation result exists

Returns:

True if the aggregation result file exists.

Return type:

bool

Parameters:

kwargs (Any)

save_evaluation_meta(meta_info)[source]

Save the evaluation meta information.

Parameters:

meta_info (dict) – A dictionary containing the meta information.

Return type:

None

get_agent_pre_print_hook(task_id, repeat_id)[source]

Get a pre-print hook function for the agent to save the agent printing in the evaluation storage.

Parameters:
  • task_id (str) – The task ID.

  • repeat_id (str) – The repeat ID for the task, usually the index of the repeat evaluation.

Returns:

A hook function that takes an AgentBase instance and a keyword arguments dictionary as input, saving the agent’s printing Msg into the evaluation storage.

Return type:

Callable[[AgentBase, dict], None]

class Task[source]

Bases: object

The base class for task in evaluation.

id: str

The unique identifier for the task.

input: str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject]

The task input, which should be a JSON serializable object.

ground_truth: str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject]

The task ground truth if exists, which should be a JSON serializable object.

__init__(id, input, ground_truth, metrics, tags=<factory>, metadata=<factory>)
Parameters:
  • id (str)

  • input (str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject])

  • ground_truth (str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject])

  • metrics (list[MetricBase])

  • tags (dict[str, str] | None)

  • metadata (dict[str, Any] | None)

Return type:

None

metrics: list[MetricBase]

The metrics to evaluate the task, which should be a list of MetricBase objects.

tags: dict[str, str] | None

Tags to categorize the task, e.g. `{“difficulty”: “easy”, “cate”: “math”}`.

metadata: dict[str, Any] | None

Additional metadata for the task.

evaluate(solution)[source]

Evaluate the task with the given solution.

Parameters:

solution (SolutionOutput) – The solution to evaluate the task with.

Returns:

The result of the evaluation.

Return type:

MetricResult

class SolutionOutput[source]

Bases: DictMixin

The output of a solution in an evaluation task.

success: bool

Indicates whether the solution is executed successfully. When the solution raises an exception, this should be set to False.

output: str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject]

The final output of the solution.

trajectory: list[ToolUseBlock | ToolResultBlock | TextBlock]

The tool calls and results trajectory

__init__(success, output, trajectory, meta=<factory>)
Parameters:
  • success (bool)

  • output (str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject])

  • trajectory (list[ToolUseBlock | ToolResultBlock | TextBlock])

  • meta (dict[str, Any] | None)

Return type:

None

meta: dict[str, Any] | None

Additional metadata for the solution

class ACEBenchmark[source]

Bases: BenchmarkBase

The ACE benchmark for evaluating AI agents.

data_dir_url: str = 'https://raw.githubusercontent.com/ACEBench/ACEBench/main/data_all'

The URL to the data dir

data_subdir: list[str] = ['data_zh']
ground_truth_dir: str = 'possible_answer'
data_files: list[str] = ['data_agent_multi_step.json', 'data_agent_multi_turn.json']

The data filenames

__init__(data_dir)[source]

Initialize the ACEBenchmark

Parameters:

data_dir (str) – The directory where the dataset is downloaded and saved.

Return type:

None

class ACEAccuracy[source]

Bases: MetricBase

The ACE benchmark accuracy metric.

__init__(state)[source]

Initialize the metric object.

Parameters:

state (list[dict])

Return type:

None

__call__(solution)[source]

Calculate the metric result.

Parameters:

solution (SolutionOutput)

Return type:

MetricResult

class ACEProcessAccuracy[source]

Bases: MetricBase

The ACE benchmark process accuracy metric.

__init__(mile_stone)[source]

Initialize the ACEBench process accuracy metric.

Parameters:

mile_stone (list[str])

Return type:

None

__call__(solution)[source]

Calculate the metric result.

Parameters:

solution (SolutionOutput)

Return type:

MetricResult

class ACEPhone[source]

Bases: object

Simulate a user phone with various apps and functionalities in ACEBench. The code is implemented with reference to ACEBench.

__init__()[source]

Initialize the shared state and apps for the ACEPhone.

Return type:

None

turn_on_wifi()[source]

Turn on the WiFi connection.

Return type:

dict[str, bool | str]

login_device()[source]

Log in to the device.

Return type:

dict[str, bool | str]

load_initial_config(initial_config)[source]

Load the initial config from the application configuration.

Parameters:

initial_config (dict)

Return type:

None

get_current_state()[source]

Follow ACEBench to get the current state of the ACEPhone.

Return type:

list[dict]

get_tool_function(name)[source]

Get a tool function by name.

Parameters:

name (str)

Return type:

Callable