Coverage for app/doc_parsers/parse_txt.py

1import os

2import re

3from langchain.text_splitter import RecursiveCharacterTextSplitter

def parse_txt(FILE_PATH) -> list:
    """Split the contents of a plain-text file into overlapping chunks.

    Args:
        FILE_PATH: Path to the UTF-8 encoded text file to read.

    Returns:
        A list of text chunks produced by the splitter. An empty list is
        returned (after printing an error message) when FILE_PATH does not
        point to an existing regular file.
    """
    # Missing paths and directories are reported and treated as "no chunks"
    # rather than raised, so callers always get a list back.
    if not os.path.isfile(FILE_PATH):
        print(f"Error: The file '{FILE_PATH}' does not exist.")
        return []

    # Read the whole file up front and drop leading/trailing whitespace; the
    # handle is closed before the splitting work starts.
    with open(FILE_PATH, "r", encoding="utf-8") as file:
        content = file.read().strip()

    text_splitter = RecursiveCharacterTextSplitter(
        # Sizes are measured by length_function below, i.e. in
        # whitespace-delimited words, not characters.
        chunk_size=1000,
        # Words shared between neighbouring chunks to retain context.
        chunk_overlap=150,
        length_function=lambda x: len(x.split()),
        # Separators are interpreted as regular expressions. No custom
        # separators are passed, so the splitter's defaults apply.
        is_separator_regex=True,
    )

    return text_splitter.split_text(content)

# Disabled manual smoke test. It is deliberately kept inside a string literal
# so that nothing runs when the module is imported.
# NOTE(review): FILE_PATH is not defined in the visible source; supply a real
# path (e.g. from sys.argv) before re-enabling this block.
"""
def main():
    chunks = parse_txt(FILE_PATH)
    # Output the list of sentences
    for idx, sentence in enumerate(chunks, 1):
        print(f"Sentence {idx}: {sentence}")

if __name__ == "__main__":
    main()
"""

Coverage for app/doc_parsers/parse_txt.py: 86%