Coverage for app/doc_parsers/parse_txt.py: 86%

14 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-20 21:23 +0000

1import os 

2import re 

3from langchain.text_splitter import RecursiveCharacterTextSplitter 

4 

5 

6 

7 

8 

def parse_txt(
    FILE_PATH: str,
    *,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
) -> list[str]:
    """Split the contents of a UTF-8 text file into overlapping chunks.

    The file is read in full, stripped of leading/trailing whitespace and
    handed to ``RecursiveCharacterTextSplitter``. Chunk length is measured
    in whitespace-separated words, not characters.

    Args:
        FILE_PATH: Path to the text file to parse.
        chunk_size: Maximum chunk length, in words. Defaults to 1000.
        chunk_overlap: Number of words shared between neighbouring chunks
            so context carries over. Defaults to 150.

    Returns:
        The list of text chunks. An empty list is returned when
        ``FILE_PATH`` is not an existing regular file (an error line is
        printed in that case) or when the file holds only whitespace.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
        OSError: If the file exists but cannot be opened or read.
    """
    # Missing paths (and directories) are reported and yield no chunks
    # instead of raising, so callers can treat the result uniformly.
    if not os.path.isfile(FILE_PATH):
        print(f"Error: The file '{FILE_PATH}' does not exist.")
        return []

    # Keep the handle open only for the read; splitting works on the
    # in-memory string and does not need the file.
    with open(FILE_PATH, "r", encoding="utf-8") as file:
        content = file.read().strip()

    # Nothing to split: skip building the splitter altogether.
    if not content:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Count words rather than characters, so the two sizes above are
        # expressed in words.
        length_function=lambda text: len(text.split()),
        # No custom separators are passed; the splitter's own separators
        # are interpreted as regular expressions.
        is_separator_regex=True,
    )
    return text_splitter.split_text(content)

40 

41 

42""" 

43def main(): 

44 chunks = parse_txt(FILE_PATH) 

45 # Output the list of sentences 

46 for idx, sentence in enumerate(chunks, 1): 

47 print(f"Sentence {idx}: {sentence}") 

48 

49if __name__ == "__main__": 

50 main() 

51"""