agentscope.evaluate

The evaluation module in AgentScope.

class BenchmarkBase[source]

Bases: ABC

The base class for benchmark evaluation.

__init__(name, description)[source]

Initialize the benchmark.

Parameters:
  • name (str) -- The name of the benchmark.

  • description (str) -- A brief description of the benchmark.

Return type:

None

name: str

The name of the benchmark.

description: str

The description of the benchmark.
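
The snippet below is a minimal subclass sketch; it only shows how the name and description are passed to the base constructor, and it elides the dataset-related interface that concrete benchmarks such as ACEBenchmark implement:

    from agentscope.evaluate import BenchmarkBase

    class MyBenchmark(BenchmarkBase):
        """A toy benchmark used only for illustration."""

        def __init__(self) -> None:
            super().__init__(
                name="my-benchmark",
                description="A toy benchmark for illustration.",
            )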

class EvaluatorBase[source]

Bases: object

The class that runs the evaluation process.

__init__(name, benchmark, n_repeat, storage)[source]

Initialize the evaluator.

Parameters:
  • name (str) -- The name of this evaluator.

  • benchmark (BenchmarkBase) -- A benchmark instance inheriting from BenchmarkBase that defines the evaluation dataset.

  • n_repeat (int) -- How many times to repeat the evaluation for each task.

  • storage (EvaluatorStorageBase) -- An instance inheriting from EvaluatorStorageBase that supports storing and loading solution outputs and evaluation results.

Return type:

None

abstract async run(solution)[source]

Run the evaluation and return the results.

Parameters:

solution (Callable[[Task, Callable], Coroutine[Any, Any, SolutionOutput]]) -- An async function that takes a Task instance and a pre-hook as input and returns a SolutionOutput instance.

Return type:

None

async aggregate()[source]

Aggregate the evaluation results and save an overall result.

Return type:

None

class RayEvaluator[source]

Bases: EvaluatorBase

The Ray-based evaluator that supports distributed and parallel evaluation.

__init__(name, benchmark, n_repeat, storage, n_workers)[source]

Initialize the evaluator.

Parameters:
  • name (str) -- The name of this evaluator.

  • benchmark (BenchmarkBase) -- A benchmark instance inheriting from BenchmarkBase that defines the evaluation dataset.

  • n_repeat (int) -- How many times to repeat the evaluation for each task.

  • storage (EvaluatorStorageBase) -- An instance inheriting from EvaluatorStorageBase that supports storing and loading solution outputs and evaluation results.

  • n_workers (int) -- The number of parallel workers.

Return type:

None

run_evaluation = <ray.remote_function.RemoteFunction object>
run_solution = <ray.remote_function.RemoteFunction object>
async run(solution)[source]

Run the Ray-based distributed and parallel evaluation, and get the results.

Parameters:

solution (Callable[[Task], SolutionOutput]) -- A sync or async function that takes a Task instance as input and returns a SolutionOutput instance.

Return type:

None
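
The following is a minimal usage sketch, assuming the ACEBenchmark data directory and the solution coroutine (here called my_agent_solution) are prepared on your side; it simply wires a benchmark, a file-based storage, and the evaluator together:

    import asyncio

    from agentscope.evaluate import (
        ACEBenchmark,
        FileEvaluatorStorage,
        RayEvaluator,
        SolutionOutput,
        Task,
    )

    async def my_agent_solution(task: Task) -> SolutionOutput:
        # Run your agent on task.input here; this stub only echoes the input.
        return SolutionOutput(success=True, output=task.input, trajectory=[])

    async def main() -> None:
        evaluator = RayEvaluator(
            name="demo-eval",
            benchmark=ACEBenchmark(data_dir="./ace_data"),
            n_repeat=1,
            storage=FileEvaluatorStorage(save_dir="./eval_results"),
            n_workers=4,
        )
        await evaluator.run(my_agent_solution)

    asyncio.run(main())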

class GeneralEvaluator[source]

Bases: EvaluatorBase

The general evaluator that supports users in debugging their evaluation.

__init__(name, benchmark, n_repeat, storage, n_workers)[source]

Initialize the evaluator.

Parameters:
  • name (str) -- The name of this evaluator.

  • benchmark (BenchmarkBase) -- A benchmark instance inheriting from BenchmarkBase that defines the evaluation dataset.

  • n_repeat (int) -- How many times to repeat the evaluation for each task.

  • storage (EvaluatorStorageBase) -- An instance inheriting from EvaluatorStorageBase that supports storing and loading solution outputs and evaluation results.

  • n_workers (int) -- The number of workers.

Return type:

None

run_evaluation(task, repeat_id, solution_output)[source]

Run the evaluation for a task and solution result.

Parameters:
  • task (Task) -- The task to evaluate.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • solution_output (SolutionOutput) -- The solution output to evaluate.

Return type:

None

async run_solution(repeat_id, task, solution)[source]

Generate a solution to a task and evaluate it.

Parameters:
  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • task (Task) -- The task to solve and evaluate.

  • solution (Callable[[Task, Callable], Coroutine[Any, Any, SolutionOutput]]) -- The solution function to run on the task.

Return type:

None

async run(solution)[source]

Run the evaluation and get the results.

Parameters:

solution (Callable[[Task, Callable], Coroutine[Any, Any, SolutionOutput]]) -- An async function that takes a Task instance and a pre-print hook function as input and returns a SolutionOutput instance.

Return type:

None
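
A sketch of a solution coroutine with the expected signature is shown below; the pre_hook argument is the callable produced by the storage's get_agent_pre_print_hook, which you would attach to your agent's printing hooks so that its messages are captured (the agent setup is elided and purely illustrative):

    from typing import Any, Callable

    from agentscope.evaluate import SolutionOutput, Task

    async def debug_solution(task: Task, pre_hook: Callable) -> SolutionOutput:
        # 1. Build your agent and register pre_hook as its pre-print hook so
        #    every printed Msg is persisted by the evaluator storage.
        # 2. Run the agent on task.input and collect its trajectory.
        answer: Any = None  # replace with the agent's final answer
        return SolutionOutput(success=True, output=answer, trajectory=[])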

class MetricBase[source]

Bases: ABC

The base class for metrics in evaluation.

__init__(name, metric_type, description=None, categories=None)[source]

Initialize the metric object.

Parameters:
  • name (str) -- The name of the metric.

  • metric_type (MetricType) -- The type of the metric, either "category" or "numerical", which determines how the result is displayed.

  • description (str) -- The description of the metric.

  • categories (list[str] | None, optional) -- The candidate categories. If metric_type is "category", the categories must be provided; otherwise it should be None.

Return type:

None

abstract __call__(*args, **kwargs)[source]

The call function to calculate the metric result.

Parameters:
  • args (Any)

  • kwargs (Any)

Return type:

MetricResult
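
As an illustration, a custom metric can subclass MetricBase and implement __call__ as in the following sketch; the exact-match scoring and the names used here are assumptions, not part of the library:

    from agentscope.evaluate import (
        MetricBase,
        MetricResult,
        MetricType,
        SolutionOutput,
    )

    class ExactMatch(MetricBase):
        """Score 1.0 if the solution output equals the ground truth."""

        def __init__(self, ground_truth: str) -> None:
            super().__init__(
                name="exact_match",
                metric_type=MetricType.NUMERICAL,
                description="1.0 if the output equals the ground truth.",
            )
            self.ground_truth = ground_truth

        def __call__(self, solution: SolutionOutput) -> MetricResult:
            score = 1.0 if solution.output == self.ground_truth else 0.0
            return MetricResult(name=self.name, result=score)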

class MetricResult[source]

Bases: DictMixin

The result of a metric.

name: str

The metric name.

result: str | float | int

The metric result.

created_at: str

The timestamp when the metric result was created.

message: str | None

An optional message for the metric result, can be used to provide additional information or context about the result.

metadata: dict[str, str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject]] | None = None

Optional metadata for the metric result, can be used to store additional information related to the metric result.

__init__(name, result, created_at=<factory>, message=<factory>, metadata=None)
Parameters:
  • name (str)

  • result (str | float | int)

  • created_at (str)

  • message (str | None)

  • metadata (dict[str, str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject]] | None)

Return type:

None

class MetricType[source]

Bases: str, Enum

The metric type enum.

CATEGORY = 'category'

The metric result is a category, e.g. "pass" or "fail".

NUMERICAL = 'numerical'

The metric result is a numerical value, e.g. 0.95 or 100.

__new__(value)
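
For a category-typed metric, the candidate categories must be provided at construction time; a hedged sketch (the class and category names are illustrative):

    from agentscope.evaluate import MetricBase, MetricResult, MetricType, SolutionOutput

    class PassFail(MetricBase):
        """Map a solution to the "pass" or "fail" category (illustrative)."""

        def __init__(self) -> None:
            super().__init__(
                name="pass_fail",
                metric_type=MetricType.CATEGORY,
                # Required because metric_type is CATEGORY.
                categories=["pass", "fail"],
            )

        def __call__(self, solution: SolutionOutput) -> MetricResult:
            return MetricResult(
                name=self.name,
                result="pass" if solution.success else "fail",
            )
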
class EvaluatorStorageBase[source]

Bases: object

Used to store the solution results and evaluation results to support resuming the evaluation process.

abstract save_solution_result(task_id, repeat_id, output, **kwargs)[source]

Save the solution result.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • output (SolutionOutput) -- The solution output to be saved.

  • kwargs (Any)

Return type:

None

abstract get_evaluation_result(task_id, repeat_id, metric_name)[source]

Get the evaluation result for the given task ID, repeat ID, and metric name.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • metric_name (str) -- The metric name.

Returns:

The evaluation result for the given task and repeat ID.

Return type:

MetricResult

abstract save_evaluation_result(task_id, repeat_id, evaluation, **kwargs)[source]

Save the evaluation result.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • evaluation (MetricResult) -- The evaluation result to be saved.

  • kwargs (Any)

Return type:

None

abstract get_solution_result(task_id, repeat_id, **kwargs)[source]

Get the solution result for the given task and repeat ID.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • kwargs (Any)

Returns:

The solution output for the given task and repeat ID.

Return type:

SolutionOutput

abstract solution_result_exists(task_id, repeat_id)[source]

Check if the solution for the given task and repeat ID is finished.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

Returns:

True if the solution result file exists, False otherwise.

Return type:

bool

abstract evaluation_result_exists(task_id, repeat_id, metric_name)[source]

Check if the evaluation result for the given solution and metric is finished.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • metric_name (str) -- The name of the metric.

Returns:

True if the evaluation result file exists, False otherwise.

Return type:

bool

abstract save_aggregation_result(aggregation_result, **kwargs)[source]

Save the aggregation result.

Parameters:
  • aggregation_result (dict) -- A dictionary containing the aggregation result.

  • kwargs (Any)

Return type:

None

abstract aggregation_result_exists(**kwargs)[source]

Check if the aggregation result exists.

Parameters:

kwargs (Any)

Returns:

True if the aggregation result file exists.

Return type:

bool

abstract save_evaluation_meta(meta_info)[source]

Save the evaluation meta information.

Parameters:

meta_info (dict) -- A dictionary containing the meta information.

Return type:

None

abstract get_agent_pre_print_hook(task_id, repeat_id)[source]

Get a pre-print hook function for the agent, which saves the agent's printed messages to the evaluation storage.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

Returns:

A hook function that takes an AgentBase instance and a keyword-argument dictionary as input and saves the agent's printed Msg into the evaluation storage.

Return type:

Callable[[AgentBase, dict], None]
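
The storage interface enables the check-then-compute pattern that evaluators use to resume interrupted runs; the fragment below is an illustrative sketch that uses only the methods documented above (the solve_and_evaluate name and the solution callable are assumptions):

    from agentscope.evaluate import EvaluatorStorageBase, SolutionOutput, Task

    async def solve_and_evaluate(
        storage: EvaluatorStorageBase,
        task: Task,
        repeat_id: str,
        solution,
    ) -> None:
        # Reuse an existing solution result if present, otherwise compute and save it.
        if storage.solution_result_exists(task.id, repeat_id):
            output: SolutionOutput = storage.get_solution_result(task.id, repeat_id)
        else:
            output = await solution(task)
            storage.save_solution_result(task.id, repeat_id, output)

        # Evaluate only the metrics that have not yet been computed.
        for metric in task.metrics:
            if not storage.evaluation_result_exists(task.id, repeat_id, metric.name):
                storage.save_evaluation_result(task.id, repeat_id, metric(output))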

class FileEvaluatorStorage[source]

Bases: EvaluatorStorageBase

File-system-based evaluator storage, providing methods to save and retrieve evaluation results so that the evaluation process can be resumed from the last saved state.

The files are organized in the following directory structure:

save_dir/
  • evaluation_result.json

  • evaluation_meta.json

  • {task_id}/
    • {repeat_id}/
      • solution.json

      • evaluation/
        • {metric_name}.json

SOLUTION_FILE_NAME = 'solution.json'
EVALUATION_DIR_NAME = 'evaluation'
EVALUATION_RESULT_FILE = 'evaluation_result.json'
EVALUATION_META_FILE = 'evaluation_meta.json'
AGENT_PRINTING_LOG = 'logging.txt'
__init__(save_dir)[source]

Initialize the file evaluator storage.

Parameters:

save_dir (str) -- The directory where results are saved.

Return type:

None
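
A short illustrative snippet, assuming a local results directory; on a fresh directory nothing exists yet, and the per-task files are created under save_dir as the evaluation proceeds:

    from agentscope.evaluate import FileEvaluatorStorage

    storage = FileEvaluatorStorage(save_dir="./eval_results")

    # No solution has been saved for this task/repeat yet.
    print(storage.solution_result_exists("task_001", "0"))  # False on a fresh directory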

save_solution_result(task_id, repeat_id, output, **kwargs)[source]

Save the solution result.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • output (SolutionOutput) -- The solution output to be saved.

  • kwargs (Any)

Return type:

None

save_evaluation_result(task_id, repeat_id, evaluation, **kwargs)[source]

Save the evaluation result.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • evaluation (MetricResult) -- The evaluation result to be saved.

  • kwargs (Any)

Return type:

None

get_evaluation_result(task_id, repeat_id, metric_name)[source]

Get the evaluation result for the given task ID, repeat ID, and metric name.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • metric_name (str) -- The metric name.

Returns:

The evaluation result for the given task and repeat ID.

Return type:

MetricResult

get_solution_result(task_id, repeat_id, **kwargs)[source]

Get the solution result for the given task and repeat ID from the file system.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • kwargs (Any)

Raises:

FileNotFoundError -- If the solution result file does not exist for the given task and repeat ID.

Returns:

The solution output for the given task and repeat ID.

Return type:

SolutionOutput

solution_result_exists(task_id, repeat_id)[source]

Check if the solution for the given task and repeat ID is finished.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

Returns:

True if the solution result file exists, False otherwise.

Return type:

bool

evaluation_result_exists(task_id, repeat_id, metric_name)[source]

Check if the evaluation result for the given solution and metric is finished.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

  • metric_name (str) -- The name of the metric.

Returns:

True if the evaluation result file exists, False otherwise.

Return type:

bool

save_aggregation_result(aggregation_result, **kwargs)[source]

Save the aggregation result.

Parameters:
  • aggregation_result (dict) -- A dictionary containing the aggregation result.

  • kwargs (Any)

Return type:

None

aggregation_result_exists(**kwargs)[source]

Check if the aggregation result exists.

Parameters:

kwargs (Any)

Returns:

True if the aggregation result file exists.

Return type:

bool

save_evaluation_meta(meta_info)[source]

Save the evaluation meta information.

Parameters:

meta_info (dict) -- A dictionary containing the meta information.

Return type:

None

get_agent_pre_print_hook(task_id, repeat_id)[source]

Get a pre-print hook function for the agent, which saves the agent's printed messages to the evaluation storage.

Parameters:
  • task_id (str) -- The task ID.

  • repeat_id (str) -- The repeat ID for the task, usually the index of the repeat evaluation.

Returns:

A hook function that takes an AgentBase instance and a keyword-argument dictionary as input and saves the agent's printed Msg into the evaluation storage.

Return type:

Callable[[AgentBase, dict], None]

class Task[source]

Bases: object

The base class for tasks in evaluation.

id: str

The unique identifier for the task.

input: str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject]

The task input, which should be a JSON serializable object.

ground_truth: str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject]

The task ground truth, if any, which should be a JSON serializable object.

__init__(id, input, ground_truth, metrics, tags=<factory>, metadata=<factory>)
Parameters:
  • id (str)

  • input (str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject])

  • ground_truth (str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject])

  • metrics (list[MetricBase])

  • tags (dict[str, str] | None)

  • metadata (dict[str, Any] | None)

Return type:

None

metrics: list[MetricBase]

The metrics to evaluate the task, which should be a list of MetricBase objects.

tags: dict[str, str] | None

Tags to categorize the task, e.g. `{"difficulty": "easy", "cate": "math"}`.

metadata: dict[str, Any] | None

Additional metadata for the task.

evaluate(solution)[source]

Evaluate the task with the given solution.

Parameters:

solution (SolutionOutput) -- The solution to evaluate the task with.

Returns:

The result of the evaluation.

Return type:

MetricResult
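
A hedged construction sketch follows; the ExactMatch metric from the earlier sketch is reused, and the concrete values are illustrative only:

    from agentscope.evaluate import SolutionOutput, Task

    task = Task(
        id="qa-001",
        input="What is 2 + 2?",
        ground_truth="4",
        metrics=[ExactMatch(ground_truth="4")],  # the sketch metric defined above
        tags={"difficulty": "easy", "cate": "math"},
    )

    # Score a (stubbed) solution output against the task's metric.
    result = task.evaluate(
        SolutionOutput(success=True, output="4", trajectory=[]),
    )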

class SolutionOutput[source]

Bases: DictMixin

The output of a solution in an evaluation task.

success: bool

Indicates whether the solution executed successfully. When the solution raises an exception, this should be set to False.

output: str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject]

The final output of the solution.

trajectory: list[ToolUseBlock | ToolResultBlock | TextBlock]

The trajectory of tool calls and results.

__init__(success, output, trajectory, meta=<factory>)
Parameters:
  • success (bool)

  • output (str | int | float | bool | None | list[JSONSerializableObject] | dict[str, JSONSerializableObject])

  • trajectory (list[ToolUseBlock | ToolResultBlock | TextBlock])

  • meta (dict[str, Any] | None)

Return type:

None

meta: dict[str, Any] | None

Additional metadata for the solution.
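
For illustration, a SolutionOutput might be packaged at the end of an agent run as below; the trajectory entries are content blocks (ToolUseBlock / ToolResultBlock / TextBlock), shown here as plain dicts whose exact fields should be taken from agentscope.message:

    from agentscope.evaluate import SolutionOutput

    output = SolutionOutput(
        success=True,
        output="4",
        trajectory=[
            # Illustrative text block; see agentscope.message for the block types.
            {"type": "text", "text": "The answer is 4."},
        ],
        meta={"model": "my-model"},  # optional extra information
    )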

class ACEBenchmark[source]

Bases: BenchmarkBase

The ACE benchmark for evaluating AI agents.

data_dir_url: str = 'https://raw.githubusercontent.com/ACEBench/ACEBench/main/data_all'

The URL of the data directory.

data_subdir: list[str] = ['data_zh']
ground_truth_dir: str = 'possible_answer'
data_files: list[str] = ['data_agent_multi_step.json', 'data_agent_multi_turn.json']

The data filenames.

__init__(data_dir)[source]

Initialize the ACEBenchmark.

Parameters:

data_dir (str) -- The directory where the dataset is downloaded and saved.

Return type:

None

class ACEAccuracy[source]

Bases: MetricBase

The ACE benchmark accuracy metric.

__init__(state)[source]

Initialize the metric object.

Parameters:

state (list[dict])

Return type:

None

__call__(solution)[source]

Calculate the metric result.

Parameters:

solution (SolutionOutput)

Return type:

MetricResult

class ACEProcessAccuracy[source]

Bases: MetricBase

The ACE benchmark process accuracy metric.

__init__(mile_stone)[source]

Initialize the ACEBench process accuracy metric.

Parameters:

mile_stone (list[str])

Return type:

None

__call__(solution)[source]

Calculate the metric result.

Parameters:

solution (SolutionOutput)

Return type:

MetricResult

class ACEPhone[source]

Bases: object

Simulate a user phone with various apps and functionalities in ACEBench. The code is implemented with reference to ACEBench.

__init__()[source]

Initialize the shared state and apps for the ACEPhone.

Return type:

None

turn_on_wifi()[source]

Turn on the WiFi connection.

Return type:

dict[str, bool | str]

login_device()[source]

Log in to the device.

Return type:

dict[str, bool | str]

load_initial_config(initial_config)[source]

Load the initial config from the application configuration.

Parameters:

initial_config (dict)

Return type:

None

get_current_state()[source]

Follow ACEBench to get the current state of the ACEPhone.

Return type:

list[dict]

get_tool_function(name)[source]

Get a tool function by name.

Parameters:

name (str)

Return type:

Callable
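
To close with an illustrative sketch: ACEPhone exposes its simulated apps as callable tools, which a solution can look up by name and hand to an agent; the tool name and the commented-out config content below are assumptions:

    from agentscope.evaluate import ACEPhone

    phone = ACEPhone()
    # phone.load_initial_config(initial_config={...})  # task-specific, taken from the benchmark data

    # Look up a simulated tool by name (assumed to match the method name) and call it.
    turn_on_wifi = phone.get_tool_function("turn_on_wifi")
    print(turn_on_wifi())          # a dict describing success and a message

    # The end state can be compared against the benchmark's expected state.
    print(phone.get_current_state())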