Source code for agentscope.rag.knowledge

# -*- coding: utf-8 -*-
"""
Base class module for retrieval augmented generation (RAG).
To accommodate the RAG processes of different packages,
we abstract the RAG process into four stages:
- data loading: loading data into memory for subsequent processing;
- data indexing and storage: chunking documents, generating embeddings,
and offloading the data into a vector database (VDB);
- data retrieval: taking a query and returning a batch of documents or
document chunks;
- post-processing of the retrieved data: using the retrieved data to
generate an answer.
"""

import importlib
from abc import ABC, abstractmethod
from typing import Any, Optional, Union
from dataclasses import dataclass, asdict
from loguru import logger
from agentscope.models import ModelWrapperBase



[docs]
@dataclass
class RetrievedChunk:
    """
    Retrieved content with score and meta information.

    Every field has a default, so an instance can be created without
    arguments and filled in afterwards.

    Attributes:
        score (`float`):
            Similarity score of this retrieved chunk
        content (`Any`):
            The retrieved content
        metadata (`Optional[dict]`):
            The meta data of this retrieved chunk, such as file path
        embedding (`Optional[Any]`):
            The embedding of the chunk
        hash (`Optional[str]`):
            The hash of the retrieved content
    """

    score: float = 0.0
    content: Any = None
    metadata: Optional[dict] = None
    embedding: Optional[Any] = None
    hash: Optional[str] = None


[docs]
    def to_dict(self) -> dict:
        """convert object to dict"""
        return asdict(self)





[docs]
class Knowledge(ABC):
    """
    Abstract base class for RAG knowledge components.

    It CANNOT be instantiated directly: concrete subclasses must implement
    the abstract methods `_init_rag` and `retrieve`.
    """

    # Class-level identifier of the knowledge base class.
    knowledge_type: str = "base_knowledge"
    """
    A string to identify a knowledge base class
    """

    def __init__(
        self,
        knowledge_id: str,
        emb_model: Any = None,
        knowledge_config: Optional[dict] = None,
        model: Optional[ModelWrapperBase] = None,
        **kwargs: Any,
    ) -> None:
        # pylint: disable=unused-argument
        """
        Initialize the knowledge component

        Args:
        knowledge_id (`str`):
            The id of the knowledge unit.
        emb_model (`ModelWrapperBase`):
            The embedding model used for generate embeddings
        knowledge_config (`dict`):
            The configuration to generate or load the index.
        """
        self.knowledge_id = knowledge_id
        self.emb_model = emb_model
        self.knowledge_config = knowledge_config or {}
        self.postprocessing_model = model

    @abstractmethod
    def _init_rag(
        self,
        **kwargs: Any,
    ) -> Any:
        """
        Initiate the RAG module.

        Abstract hook with no implementation here; every concrete
        subclass must override it.

        Args:
            **kwargs (`Any`):
                Implementation-specific keyword arguments.

        Returns:
            Any: Implementation-defined.
        """


[docs]
    @abstractmethod
    def retrieve(
        self,
        query: Any,
        similarity_top_k: Optional[int] = None,
        to_list_strs: bool = False,
        **kwargs: Any,
    ) -> list[Union[RetrievedChunk, str]]:
        """
        Retrieve list of content from database (vector stored index) to memory

        Abstract method with no implementation here; every concrete
        subclass must override it.

        Args:
            query (`Any`):
                Query for retrieval
            similarity_top_k (`Optional[int]`):
                The number of most similar data returned by the
                retriever. Defaults to `None`; how `None` is interpreted
                is left to the implementation.
            to_list_strs (`bool`):
                Whether return a list of str
            **kwargs (`Any`):
                Implementation-specific keyword arguments.

        Returns:
            list[Union[RetrievedChunk, str]]: The retrieved documents,
            presumably as plain strings when `to_list_strs` is True and
            as `RetrievedChunk` objects otherwise (to be confirmed
            against the concrete implementations).
        """



[docs]
    @classmethod
    def default_config(cls, **kwargs: Any) -> dict:
        """
        Return a default config for a knowledge class.

        Args:
            kwargs (`Any`):
                Parameters for config

        Returns:
            dict: a default config of the knowledge class
        """
        raise NotImplementedError(
            f"{cls.__name__} does not have default_config to be defined.",
        )



[docs]
    @classmethod
    def build_knowledge_instance(
        cls,
        knowledge_id: str,
        knowledge_config: Optional[dict] = None,
        **kwargs: Any,
    ) -> "Knowledge":
        """
        A constructor to build a knowledge base instance.

        Args:
            knowledge_id (`str`):
                The id of the knowledge instance.
            knowledge_config (`dict`):
                The configuration to the knowledge instance.

        Returns:
            Knowledge: a Knowledge instance
        """
        raise NotImplementedError(
            f"{knowledge_id} of {cls.__name__} does not support "
            "auto build knowledge base instance",
        )



[docs]
    def post_processing(
        self,
        retrieved_docs: list[str],
        prompt: str,
        **kwargs: Any,
    ) -> Any:
        """
        A default solution for post-processing function, generates answer
        based on the retrieved documents.

        Args:
            retrieved_docs (`list[str]`):
                List of retrieved documents
            prompt (`str`):
                Prompt for LLM generating answer with the retrieved documents

        Returns:
            Any: A synthesized answer from LLM with retrieved documents

        Example:
            self.postprocessing_model(prompt.format(retrieved_docs))
        """
        assert self.postprocessing_model
        prompt = prompt.format("\n".join(retrieved_docs))
        return self.postprocessing_model(prompt, **kwargs).text


    def _prepare_args_from_config(self, config: dict) -> Any:
        """
        Helper function to build objects in RAG classes.

        Args:
            config (`dict`):
                A dictionary containing configurations
        Returns:
            Any: An object that is parsed/built to be an element
            of input to the function of RAG module.
        """
        if not isinstance(config, dict):
            return config

        if "create_object" in config:
            # if a term in args is an object,
            # recursively create object with args from config
            module_name = config.get("module", "")
            class_name = config.get("class", "")
            init_args = config.get("init_args", {})
            try:
                cur_module = importlib.import_module(module_name)
                cur_class = getattr(cur_module, class_name)
                init_args = self._prepare_args_from_config(init_args)
                logger.info(
                    f"load and build object: {class_name}",
                )
                return cur_class(**init_args)
            except ImportError as exc_inner:
                logger.error(
                    f"Fail to load class {class_name} "
                    f"from module {module_name}",
                )
                raise ImportError(
                    f"Fail to load class {class_name} "
                    f"from module {module_name}",
                ) from exc_inner
        else:
            prepared_args = {}
            for key, value in config.items():
                if isinstance(value, list):
                    prepared_args[key] = []
                    for c in value:
                        prepared_args[key].append(
                            self._prepare_args_from_config(c),
                        )
                elif isinstance(value, dict):
                    prepared_args[key] = self._prepare_args_from_config(value)
                else:
                    prepared_args[key] = value
            return prepared_args