Coverage for app/doc_parsers/parse_pdf.py: 100%

24 statements  

coverage.py v7.8.0, created at 2025-04-20 21:23 +0000

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
import os
import fitz  # PyMuPDF


DATA_PATH = os.path.abspath("./tests/test_data/Dna.pdf")


# Parse a PDF file into text chunks
def parse_pdf(FILE_PATH):
    """
    Parse a PDF file into chunks.

    Args:
        FILE_PATH (str): File path for the PDF file.

    Returns:
        list[Document]: A list of chunks.
    """
    documents = load_documents(FILE_PATH)
    chunks = split_documents(documents)
    # Strip null bytes that can appear in extracted text
    for chunk in chunks:
        chunk.page_content = chunk.page_content.replace("\x00", "")
    return chunks
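
# Illustrative usage sketch (not part of the module): it assumes DATA_PATH
# points at a readable PDF and shows the shape of what parse_pdf returns.
"""
chunks = parse_pdf(DATA_PATH)
print(len(chunks))                   # number of chunks produced
print(chunks[0].page_content[:200])  # start of the first chunk's text
print(chunks[0].metadata)            # e.g. {"page_number": 1, "source": DATA_PATH}
"""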


# Load the PDF file, one Document per page
def load_documents(FILE_PATH):
    """
    Load the PDF document.

    Args:
        FILE_PATH (str): File path for the PDF file.

    Returns:
        list[Document]: A list with a Document object for each non-empty page.
    """
    doc = fitz.open(FILE_PATH)
    documents = []
    for i, page in enumerate(doc):
        text = page.get_text("text")
        if text.strip():  # Skip pages with no extractable text
            documents.append(
                Document(
                    page_content=text,
                    metadata={
                        "page_number": i + 1,
                        "source": FILE_PATH
                    },
                )
            )
    return documents
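
# Illustrative sketch only: a quick check of what load_documents produces for
# the bundled test PDF (assumes the file at DATA_PATH exists and is readable).
"""
pages = load_documents(DATA_PATH)
print(len(pages))         # one Document per non-empty page
print(pages[0].metadata)  # {"page_number": 1, "source": DATA_PATH}
"""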


# Split the PDF pages into chunks for easier use and clarity
def split_documents(documents: list[Document]):
    """
    Split the PDF pages into chunks for easier use and clarity.

    Args:
        documents (list[Document]): List of pages.

    Returns:
        list[Document]: List of chunks for the PDF file.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=450,  # Larger chunk size for more context
        chunk_overlap=150,  # Increased overlap to retain context between chunks
        length_function=lambda x: len(
            x.split()
        ),  # Use word count instead of character count
        is_separator_regex=True,  # Allow regex for flexible splitting
    )
    return text_splitter.split_documents(documents)
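
# Illustrative sketch only: because length_function counts words, chunk_size=450
# and chunk_overlap=150 are measured in words rather than characters.
"""
toy = [Document(page_content="word " * 1000, metadata={"page_number": 1, "source": "toy"})]
for c in split_documents(toy):
    print(len(c.page_content.split()))  # each chunk holds roughly 450 words at most
"""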


"""
def main():
    chunks = parse_pdf(DATA_PATH)
    print(chunks[0])


if __name__ == "__main__":
    main()
"""