Source code for agentscope.rag._reader._pdf_reader

# -*- coding: utf-8 -*-
"""The PDF reader to read and chunk PDF files."""
import hashlib
from typing import Literal

from ._reader_base import ReaderBase
from ._text_reader import TextReader
from .._document import Document



[docs]
class PDFReader(ReaderBase):
    """The PDF reader that extracts the text of a PDF file and splits it
    into chunks by a fixed chunk size.

    Page text is extracted with the `pypdf` library, which is imported
    lazily when the reader is called, and the chunking itself is delegated
    to an internal `TextReader` instance.
    """


[docs]
    def __init__(
        self,
        chunk_size: int = 512,
        split_by: Literal["char", "sentence", "paragraph"] = "sentence",
    ) -> None:
        """Initialize the PDF reader.

        Args:
            chunk_size (`int`, default to 512):
                The size of each chunk, in number of characters.
            split_by (`Literal["char", "sentence", "paragraph"]`, default to
            "sentence"):
                The unit to split the text, can be "char", "sentence", or
                "paragraph". The "sentence" option is implemented using the
                "nltk" library, which only supports English text.

        Raises:
            `ValueError`:
                If `chunk_size` is not positive, or if `split_by` is not
                one of the supported units.
        """
        supported_units = ("char", "sentence", "paragraph")

        # Validate in a fixed order: chunk size first, then the split unit.
        if chunk_size <= 0:
            size_error = f"The chunk_size must be positive, got {chunk_size}"
            raise ValueError(size_error)

        if split_by not in supported_units:
            unit_error = (
                "The split_by must be one of 'char', 'sentence' or "
                f"'paragraph', got {split_by}"
            )
            raise ValueError(unit_error)

        self.chunk_size = chunk_size
        self.split_by = split_by

        # Chunking is delegated to TextReader so the splitting logic lives
        # in a single place.
        self._text_reader = TextReader(self.chunk_size, self.split_by)



[docs]
    async def __call__(
        self,
        pdf_path: str,
    ) -> list[Document]:
        """Read a PDF file, split it into chunks, and return a list of
        Document objects.

        The text of every page is extracted with `pypdf`, the page texts are
        joined with a blank line between them, and the combined text is
        chunked by the internal `TextReader`. Every returned document has
        its `id` overwritten with the ID derived from `pdf_path`.

        Args:
            pdf_path (`str`):
                The input PDF file path.

        Returns:
            `list[Document]`:
                The documents produced by the internal text reader, each
                carrying the ID returned by `get_doc_id(pdf_path)`.

        Raises:
            `ImportError`:
                If the `pypdf` package is not installed.
        """
        # Imported lazily so that pypdf stays an optional dependency.
        try:
            from pypdf import PdfReader
        except ImportError as e:
            raise ImportError(
                "Please install pypdf to use the PDF reader. "
                "You can install it by `pip install pypdf`.",
            ) from e

        reader = PdfReader(pdf_path)

        # `or ""` keeps the join below safe should a page yield no text.
        page_texts = [page.extract_text() or "" for page in reader.pages]

        # Use get_doc_id rather than re-hashing here, so the ID stamped on
        # the chunks always matches what callers get from get_doc_id.
        doc_id = self.get_doc_id(pdf_path)

        docs = await self._text_reader("\n\n".join(page_texts))
        for doc in docs:
            doc.id = doc_id

        return docs



[docs]
    def get_doc_id(self, pdf_path: str) -> str:
        """Compute the document ID for a PDF file path.

        The ID is the SHA-256 hex digest of the UTF-8 encoded path string
        itself, not of the file contents. This function can be used to
        check if the doc_id already exists in the knowledge base.

        Args:
            pdf_path (`str`):
                The PDF file path to derive the ID from.

        Returns:
            `str`:
                The hexadecimal SHA-256 digest of `pdf_path`.
        """
        path_bytes = pdf_path.encode("utf-8")
        digest = hashlib.sha256(path_bytes)
        return digest.hexdigest()