Coverage for app/doc_parsers/parse_pdf.py: 100%
24 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-20 21:23 +0000
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-20 21:23 +0000
1from langchain_community.document_loaders import PyPDFLoader
2from langchain_text_splitters import RecursiveCharacterTextSplitter
3from langchain.schema.document import Document
4import os
5import fitz # PyMuPDF
# Absolute path of the sample PDF under the tests' data directory; the
# relative path is resolved against the working directory at import time.
DATA_PATH = os.path.abspath("./tests/test_data/Dna.pdf")
# Parse a PDF file into text chunks
def parse_pdf(FILE_PATH):
    """
    Parse a PDF file into chunks.

    The pages are loaded with load_documents, split with split_documents,
    and every NUL character ("\\x00") is then removed from each chunk's text.

    Args:
        FILE_PATH (_type_): file path for the PDF file

    Returns:
        LIST (_type_): A list of chunks
    """
    pieces = split_documents(load_documents(FILE_PATH))
    for piece in pieces:
        # Strip NUL characters from the chunk text in place.
        piece.page_content = piece.page_content.replace("\x00", "")
    return pieces
# Load the PDF file
def load_documents(FILE_PATH):
    """
    Load a PDF document page by page.

    Args:
        FILE_PATH (_type_): file path for the PDF file

    Returns:
        LIST (_type_): List of Document objects, one for each page whose
            extracted text is not empty or whitespace-only. Each Document's
            metadata holds the 1-based "page_number" and the "source" path.
    """
    documents = []
    # Open the PDF in a context manager so it is closed once all pages are
    # read, even if text extraction raises part-way through.
    with fitz.open(FILE_PATH) as doc:
        # Number pages from 1; skipped pages still consume their number.
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text("text")
            if not text.strip():
                # No extractable text on this page; nothing to index.
                continue
            documents.append(
                Document(
                    page_content=text,
                    metadata={
                        "page_number": page_number,
                        "source": FILE_PATH,
                    },
                )
            )
    return documents
# Split the PDF pages into chunks for easier use and clarity
def split_documents(documents: list[Document]):
    """
    Split the PDF pages into overlapping chunks.

    Chunk length is measured in whitespace-separated words, not characters.

    Args:
        documents (list[Document]): List of pages

    Returns:
        LIST (_type_): List of chunk objects for the PDF file
    """

    def count_words(text):
        # Length measure handed to the splitter: number of words in the text.
        return len(text.split())

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=450,  # maximum chunk length, in words
        chunk_overlap=150,  # words shared between neighbouring chunks
        length_function=count_words,
        is_separator_regex=True,  # separators are treated as regex patterns
    )
    return splitter.split_documents(documents)
def main():
    """
    Manual smoke run: parse the sample PDF at DATA_PATH and print its first chunk.

    Prints a short notice instead when no chunks were produced.
    """
    chunks = parse_pdf(DATA_PATH)
    if chunks:
        print(chunks[0])
    else:
        print(f"No text chunks extracted from {DATA_PATH}")


# Only runs when the module is executed as a script; importing is unaffected.
if __name__ == "__main__":
    main()