# Source code for agentscope.parsers.json_object_parser

# -*- coding: utf-8 -*-
"""The parser for JSON object in the model response."""
import inspect
import json
from copy import deepcopy
from typing import Optional, Any, List, Sequence, Union

from loguru import logger
from pydantic import BaseModel

from ..exception import (
    TagNotFoundError,
    JsonParsingError,
    JsonTypeError,
    RequiredFieldNotFoundError,
)
from ..models import ModelResponse
from ..parsers import ParserBase
from ..parsers.parser_base import DictFilterMixin
from ..utils.common import _join_str_with_comma_and



# [docs]
class MarkdownJsonObjectParser(ParserBase):
    """A parser that extracts the JSON object wrapped in a markdown fenced
    code block from the model response and loads it with `json.loads`."""

    name: str = "json block"
    """The name of the parser."""

    tag_begin: str = "```json"
    """Opening tag for a code block."""

    content_hint: str = "{your_json_object}"
    """The hint of the content."""

    tag_end: str = "```"
    """Closing tag for a code block."""

    _format_instruction = (
        "You should respond a json object in a json fenced code block as "
        "follows:\n```json\n{content_hint}\n```"
    )
    """The instruction for the format of the json object."""

    def __init__(self, content_hint: Optional[Any] = None) -> None:
        """Initialize the parser with the content hint.

        Args:
            content_hint (`Optional[Any]`, defaults to `None`):
                The hint used to remind the LLM what should be filled in
                between the tags. A string is used as the content hint
                directly; any other non-`None` value is serialized with
                `json.dumps` and the resulting string is used. When it is
                `None`, the class-level default hint is kept.
        """
        if content_hint is None:
            # Keep the class-level default `content_hint`.
            return

        if isinstance(content_hint, str):
            self.content_hint = content_hint
        else:
            self.content_hint = json.dumps(
                content_hint,
                ensure_ascii=False,
            )

    def parse(self, response: ModelResponse) -> ModelResponse:
        """Parse the response text to a json object, and fill it in the parsed
        field in the response object.

        If one of the tags is reported missing, the parser adds it to a copy
        of the response and retries the extraction once. When the retry
        succeeds, `response.text` is overwritten with the repaired text.

        Args:
            response (`ModelResponse`):
                The response whose `text` field contains the fenced JSON.

        Returns:
            `ModelResponse`: The same response object with its `parsed`
            field set to the loaded JSON value.

        Raises:
            `TagNotFoundError`: If the content cannot be extracted even after
                adding the missing tags (the original error is re-raised).
            `JsonParsingError`: If the extracted content is not valid JSON.
        """
        # Extract the content and try to fix the missing tags by hand
        try:
            extract_text = self._extract_first_content_by_tag(
                response,
                self.tag_begin,
                self.tag_end,
            )
        except TagNotFoundError as e:
            try:
                # Work on a copy so that the caller's response is left
                # untouched if the repair attempt fails as well.
                response_copy = deepcopy(response)

                if e.missing_begin_tag:
                    response_copy.text = (
                        self.tag_begin + "\n" + response_copy.text
                    )
                if e.missing_end_tag:
                    response_copy.text = response_copy.text + self.tag_end

                # Try again to extract the content
                extract_text = self._extract_first_content_by_tag(
                    response_copy,
                    self.tag_begin,
                    self.tag_end,
                )

                # The repair worked: keep the fixed text in the response
                response.text = response_copy.text

                logger.debug("Fix the missing tags by adding them manually.")

            except TagNotFoundError:
                # Raise the original error if the missing tags cannot be fixed
                raise e from None

        # Parse the content into JSON object; only the decoding step is
        # guarded so unrelated errors are not misreported as parsing errors.
        try:
            parsed_json = json.loads(extract_text)
        except json.decoder.JSONDecodeError as e:
            raw_response = f"{self.tag_begin}{extract_text}{self.tag_end}"
            raise JsonParsingError(
                f"The content between {self.tag_begin} and {self.tag_end} "
                f"MUST be a JSON object. "
                f'When parsing "{raw_response}", an error occurred: {e}',
                raw_response=raw_response,
            ) from None

        response.parsed = parsed_json
        return response

    @property
    def format_instruction(self) -> str:
        """Get the format instruction for the json object, i.e. the
        instruction template with the current content hint filled in.
        """
        return self._format_instruction.format(
            content_hint=self.content_hint,
        )




# [docs]
class MarkdownJsonDictParser(MarkdownJsonObjectParser, DictFilterMixin):
    """A class used to parse a JSON dictionary object in a markdown fenced
    code block, with optional Pydantic validation and required-key checks."""

    name: str = "json block"
    """The name of the parser."""

    tag_begin: str = "```json"
    """Opening tag for a code block."""

    content_hint: str = "{your_json_dictionary}"
    """The hint of the content."""

    tag_end: str = "```"
    """Closing tag for a code block."""

    _format_instruction = (
        "Respond a JSON dictionary in a markdown's fenced code block as "
        "follows:\n```json\n{content_hint}\n```"
    )
    """The instruction for the format of the json object."""

    _format_instruction_with_schema = (
        "Respond a JSON dictionary in a markdown's fenced code block as "
        "follows:\n"
        "```json\n"
        "{content_hint}\n"
        "```\n"
        "The generated JSON dictionary MUST follow this schema: \n"
        "{schema}"
    )
    """The schema instruction for the format of the json object."""

    required_keys: List[str]
    """A list of required keys in the JSON dictionary object. If the response
    misses any of the required keys, it will raise a
    RequiredFieldNotFoundError."""

    def __init__(
        self,
        content_hint: Optional[Any] = None,
        required_keys: Optional[List[str]] = None,
        keys_to_memory: Union[str, bool, Sequence[str]] = True,
        keys_to_content: Union[str, bool, Sequence[str]] = True,
        keys_to_metadata: Union[str, bool, Sequence[str]] = False,
    ) -> None:
        """Initialize the parser with the content hint.

        Args:
            content_hint (`Optional[Any]`, defaults to `None`):
                The hint used to remind the LLM what should be filled in
                between the tags. If it is a string, it will be used as the
                content hint directly. If it is a Pydantic model class
                (a subclass of `BaseModel`), its schema will be displayed in
                the instruction and used to validate the parsed dictionary.
                Any other non-`None` value is converted to a json string and
                used as the content hint.
            required_keys (`Optional[List[str]]`, defaults to `None`):
                A list of required keys in the JSON dictionary object. If the
                response misses any of the required keys, it will raise a
                RequiredFieldNotFoundError. `None` is treated as an empty
                list.
            keys_to_memory (`Union[str, bool, Sequence[str]]`,
            defaults to `True`):
                The key or keys to be filtered in `to_memory` method. If
                it's
                - `False`, `None` will be returned in the `to_memory` method
                - `str`, the corresponding value will be returned
                - `List[str]`, a filtered dictionary will be returned
                - `True`, the whole dictionary will be returned
            keys_to_content (`Union[str, bool, Sequence[str]]`,
            defaults to `True`):
                The key or keys to be filtered in `to_content` method. If
                it's
                - `False`, `None` will be returned in the `to_content` method
                - `str`, the corresponding value will be returned
                - `List[str]`, a filtered dictionary will be returned
                - `True`, the whole dictionary will be returned
            keys_to_metadata (`Union[str, bool, Sequence[str]]`,
            defaults to `False`):
                The key or keys to be filtered in `to_metadata` method. If
                it's
                - `False`, `None` will be returned in the `to_metadata` method
                - `str`, the corresponding value will be returned
                - `List[str]`, a filtered dictionary will be returned
                - `True`, the whole dictionary will be returned

        """
        self.pydantic_class = None

        # Initialize the content_hint according to the type of content_hint
        is_pydantic_model = inspect.isclass(content_hint) and issubclass(
            content_hint,
            BaseModel,
        )
        if is_pydantic_model:
            # The schema itself is shown in the instruction, so only a
            # generic placeholder is used as the hint.
            self.pydantic_class = content_hint
            self.content_hint = "{a_JSON_dictionary}"
        elif isinstance(content_hint, str):
            self.content_hint = content_hint
        elif content_hint is not None:
            self.content_hint = json.dumps(
                content_hint,
                ensure_ascii=False,
            )

        # Initialize the mixin class to allow filtering the parsed response
        DictFilterMixin.__init__(
            self,
            keys_to_memory=keys_to_memory,
            keys_to_content=keys_to_content,
            keys_to_metadata=keys_to_metadata,
        )

        self.required_keys = required_keys or []

    @property
    def format_instruction(self) -> str:
        """Get the format instruction for the JSON dictionary. If a Pydantic
        model class was given as the content hint, its JSON schema is
        appended to the instruction.
        """
        if self.pydantic_class is None:
            return self._format_instruction.format(
                content_hint=self.content_hint,
            )

        # NOTE(review): the schema object is interpolated through
        # `str.format`, so it is rendered with `str()` rather than as JSON
        # text; kept as-is to preserve the emitted prompt.
        return self._format_instruction_with_schema.format(
            content_hint=self.content_hint,
            schema=self.pydantic_class.model_json_schema(),
        )

    def parse(self, response: ModelResponse) -> ModelResponse:
        """Parse the text field of the response to a JSON dictionary object,
        store it in the parsed field of the response object, and check if the
        required keys exist.

        Args:
            response (`ModelResponse`):
                The response whose `text` field contains the fenced JSON.

        Returns:
            `ModelResponse`: The response with `parsed` set to the
            dictionary (the `model_dump()` of the validated model when a
            Pydantic class was provided).

        Raises:
            `JsonTypeError`: If the parsed JSON value is not a dictionary.
            `JsonParsingError`: If the content is not valid JSON, or the
                Pydantic model cannot be built from the dictionary.
            `RequiredFieldNotFoundError`: If any required key is missing.
        """
        # Parse the JSON object
        response = super().parse(response)

        if not isinstance(response.parsed, dict):
            # If not a dictionary, raise an error
            raise JsonTypeError(
                "A JSON dictionary object is wanted, "
                f"but got {type(response.parsed)} instead.",
                response.text,
            )

        # Requirement checking by Pydantic
        if self.pydantic_class is not None:
            try:
                response.parsed = self.pydantic_class(
                    **response.parsed,
                ).model_dump()
            except Exception as e:
                # Any failure while building the model is reported as a
                # parsing error carrying the raw response text.
                raise JsonParsingError(
                    message=str(e),
                    raw_response=response.text,
                ) from None

        # Check if the required keys exist
        keys_missing = [
            key for key in self.required_keys if key not in response.parsed
        ]

        if keys_missing:
            raise RequiredFieldNotFoundError(
                f"Missing required "
                f"field{'' if len(keys_missing) == 1 else 's'} "
                f"{_join_str_with_comma_and(keys_missing)} in the JSON "
                f"dictionary object.",
                response.text,
            )

        return response