Source code for agentscope.rag._reader._pdf_reader
# -*- coding: utf-8 -*-"""The PDF reader to read and chunk PDF files."""importhashlibfromtypingimportLiteralfrom._reader_baseimportReaderBasefrom._text_readerimportTextReaderfrom.._documentimportDocument
class PDFReader(ReaderBase):
    """The PDF reader that splits text into chunks by a fixed chunk size."""

    def __init__(
        self,
        chunk_size: int = 512,
        split_by: Literal["char", "sentence", "paragraph"] = "sentence",
    ) -> None:
        """Initialize the PDF reader.

        Args:
            chunk_size (`int`, default to 512):
                The size of each chunk, in number of characters.
            split_by (`Literal["char", "sentence", "paragraph"]`, default to \
            "sentence"):
                The unit to split the text, can be "char", "sentence", or
                "paragraph". The "sentence" option is implemented using the
                "nltk" library, which only supports English text.

        Raises:
            `ValueError`:
                If `chunk_size` is not positive, or `split_by` is not one of
                the supported units.
        """
        if chunk_size <= 0:
            raise ValueError(
                f"The chunk_size must be positive, got {chunk_size}",
            )

        if split_by not in ("char", "sentence", "paragraph"):
            raise ValueError(
                "The split_by must be one of 'char', 'sentence' or "
                f"'paragraph', got {split_by}",
            )

        self.chunk_size = chunk_size
        self.split_by = split_by

        # To avoid code duplication, we use TextReader to do the chunking.
        self._text_reader = TextReader(
            self.chunk_size,
            self.split_by,
        )

    async def __call__(
        self,
        pdf_path: str,
    ) -> list[Document]:
        """Read a PDF file, split it into chunks, and return a list of
        Document objects.

        Args:
            pdf_path (`str`):
                The input PDF file path.

        Returns:
            `list[Document]`:
                The chunks produced by the internal text reader, each with
                its `id` set to the value of `get_doc_id(pdf_path)`.

        Raises:
            `ImportError`:
                If the `pypdf` package is not installed.
        """
        # Imported lazily so that this module can be loaded without pypdf;
        # the error below tells the user how to install it.
        try:
            from pypdf import PdfReader
        except ImportError as e:
            raise ImportError(
                "Please install pypdf to use the PDF reader. "
                "You can install it by `pip install pypdf`.",
            ) from e

        reader = PdfReader(pdf_path)
        page_texts = [page.extract_text() for page in reader.pages]

        # Use the same helper callers use for existence checks, so the two
        # can never disagree on how the ID is derived.
        doc_id = self.get_doc_id(pdf_path)

        # Pages are separated by a blank line before chunking.
        docs = await self._text_reader("\n\n".join(page_texts))

        # Every chunk of this file carries the same path-derived ID.
        for doc in docs:
            doc.id = doc_id

        return docs

    def get_doc_id(self, pdf_path: str) -> str:
        """Get the document ID. This function can be used to check if the
        doc_id already exists in the knowledge base.

        Args:
            pdf_path (`str`):
                The input PDF file path.

        Returns:
            `str`:
                The hexadecimal SHA-256 digest of the UTF-8 encoded path
                string. The ID depends on the path text only, not on the
                file's content.
        """
        return hashlib.sha256(pdf_path.encode("utf-8")).hexdigest()