# Source code for agentscope.parsers.tagged_content_parser

# -*- coding: utf-8 -*-
"""The parser for tagged content in the model response."""
import json
from typing import Union, Sequence, Optional, List

from agentscope.exception import JsonParsingError, TagNotFoundError
from agentscope.models import ModelResponse
from agentscope.parsers import ParserBase
from agentscope.parsers.parser_base import DictFilterMixin



# [docs]
class TaggedContent:
    """A tagged content object to store the tag name, tag begin, content hint
    and tag end.

    Converting an instance to a string yields the begin tag, the content
    hint and the end tag concatenated in that order, which is the form shown
    to the model as a format instruction line.
    """

    name: str
    """The name of the tagged content, which will be used as the key in
    extracted dictionary."""

    tag_begin: str
    """The beginning tag."""

    content_hint: str
    """The hint of the content."""

    tag_end: str
    """The ending tag."""

    parse_json: bool
    """Whether to parse the content as a json object."""

    def __init__(
        self,
        name: str,
        tag_begin: str,
        content_hint: str,
        tag_end: str,
        parse_json: bool = False,
    ) -> None:
        """Initialize the tagged content object.

        Args:
            name (`str`):
                The name of the tagged content.
            tag_begin (`str`):
                The beginning tag.
            content_hint (`str`):
                The hint of the content.
            tag_end (`str`):
                The ending tag.
            parse_json (`bool`, defaults to `False`):
                Whether to parse the content as a json object.
        """
        self.name = name
        self.tag_begin = tag_begin
        self.content_hint = content_hint
        self.tag_end = tag_end
        self.parse_json = parse_json

    def __str__(self) -> str:
        """Return the tagged content as a string, i.e. the begin tag, the
        content hint and the end tag joined without separators."""
        return f"{self.tag_begin}{self.content_hint}{self.tag_end}"




# [docs]
class MultiTaggedContentParser(ParserBase, DictFilterMixin):
    """Parse response text by multiple tags, and return a dict of their
    content. Asking an LLM to generate a JSON dictionary object directly may
    not be a good idea because of escape characters and other issues. Instead
    we can ask the LLM to generate text with tags, and then parse the text to
    get the final dictionary object.
    """

    format_instruction = (
        "Respond with specific tags as outlined below{json_required_hint}\n"
        "{tag_lines_format}"
    )
    """The instruction for the format of the tagged content."""

    json_required_hint = ", and the content between {} MUST be a JSON object:"
    """If a tagged content is required to be a JSON object by `parse_json`
    equals to `True`, this instruction will be used to remind the model to
    generate JSON object."""

    def __init__(
        self,
        *tagged_contents: TaggedContent,
        keys_to_memory: Optional[Union[str, bool, Sequence[str]]] = True,
        keys_to_content: Optional[Union[str, bool, Sequence[str]]] = True,
        keys_to_metadata: Optional[Union[str, bool, Sequence[str]]] = False,
        keys_allow_missing: Optional[List[str]] = None,
    ) -> None:
        """Initialize the parser with tags.

        Args:
            *tagged_contents (`TaggedContent`):
                Multiple TaggedContent objects, each object contains the tag
                name, tag begin, content hint and tag end. The name will be
                used as the key in the extracted dictionary.
            keys_to_memory (`Optional[Union[str, bool, Sequence[str]]]`,
            defaults to `True`):
                The key or keys to be filtered in `to_memory` method. If
                it's
                - `False`, `None` will be returned in the `to_memory` method
                - `str`, the corresponding value will be returned
                - `List[str]`, a filtered dictionary will be returned
                - `True`, the whole dictionary will be returned
            keys_to_content (`Optional[Union[str, bool, Sequence[str]]]`,
            defaults to `True`):
                The key or keys to be filtered in `to_content` method. If
                it's
                - `False`, `None` will be returned in the `to_content` method
                - `str`, the corresponding value will be returned
                - `List[str]`, a filtered dictionary will be returned
                - `True`, the whole dictionary will be returned
            keys_to_metadata (`Optional[Union[str, bool, Sequence[str]]]`,
            defaults to `False`):
                The key or keys to be filtered in `to_metadata` method. If
                it's
                - `False`, `None` will be returned in the `to_metadata` method
                - `str`, the corresponding value will be returned
                - `List[str]`, a filtered dictionary will be returned
                - `True`, the whole dictionary will be returned
            keys_allow_missing (`Optional[List[str]]`, defaults to `None`):
                A list of keys that are allowed to be missing in the response.
        """
        # Initialize the mixin class
        DictFilterMixin.__init__(
            self,
            keys_to_memory=keys_to_memory,
            keys_to_content=keys_to_content,
            keys_to_metadata=keys_to_metadata,
        )

        self.keys_allow_missing = keys_allow_missing

        self.tagged_contents = list(tagged_contents)

        # Prepare the format instruction according to the tagged contents:
        # one "<begin><hint><end>" line per tagged content.
        tag_lines = "\n".join(str(item) for item in tagged_contents)

        # Prepare hint for the tagged contents that requires a JSON object.
        json_required_tags = ", ".join(
            f"{item.tag_begin} and {item.tag_end}"
            for item in tagged_contents
            if item.parse_json
        )
        if json_required_tags:
            json_required_hint = self.json_required_hint.format(
                json_required_tags,
            )
        else:
            json_required_hint = ": "

        # The formatted instruction is stored on the instance and shadows the
        # class-level template of the same name.
        self.format_instruction = self.format_instruction.format(
            json_required_hint=json_required_hint,
            tag_lines_format=tag_lines,
        )

    def parse(self, response: ModelResponse) -> ModelResponse:
        """Parse the response text by tags, and return a dict of their content
        in the parsed field of the model response object. If the tagged content
        requires to parse as a JSON object by `parse_json` equals to `True`, it
        will be parsed as a JSON object by `json.loads`.

        Args:
            response (`ModelResponse`):
                The model response to extract the tagged contents from.

        Returns:
            `ModelResponse`: The same response object, with its `parsed`
            field set to a dictionary mapping each tagged content name to
            its extracted (and optionally JSON-decoded) content.

        Raises:
            `JsonParsingError`: If a tagged content with `parse_json` set to
            `True` cannot be decoded by `json.loads`.
            `TagNotFoundError`: If this error is raised while handling a
            tagged content whose name is not listed in
            `keys_allow_missing`.
        """

        tag_to_content = {}
        for tagged_content in self.tagged_contents:
            tag_begin = tagged_content.tag_begin
            tag_end = tagged_content.tag_end

            try:
                # NOTE(review): `_extract_first_content_by_tag` is not
                # defined in this class; presumably inherited from
                # `ParserBase` -- confirm.
                extract_content = self._extract_first_content_by_tag(
                    response,
                    tag_begin,
                    tag_end,
                )

                if tagged_content.parse_json:
                    try:
                        extract_content = json.loads(extract_content)
                    except json.decoder.JSONDecodeError as e:
                        # Rebuild the tagged snippet so the error carries the
                        # exact text that failed to decode.
                        raw_response = f"{tag_begin}{extract_content}{tag_end}"
                        raise JsonParsingError(
                            f"The content between "
                            f"{tagged_content.tag_begin} and "
                            f"{tagged_content.tag_end} should be a JSON "
                            f'object. An error "{e}" occurred when parsing: '
                            f"{raw_response}",
                            raw_response=raw_response,
                        ) from None

                tag_to_content[tagged_content.name] = extract_content

            except TagNotFoundError as e:
                # if the key is allowed to be missing, skip the error
                if (
                    self.keys_allow_missing is not None
                    and tagged_content.name in self.keys_allow_missing
                ):
                    continue

                raise e from None

        response.parsed = tag_to_content
        return response