Coverage for app/doc_parsers/parse_txt.py: 86%
14 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-20 21:23 +0000
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-20 21:23 +0000
1import os
2import re
3from langchain.text_splitter import RecursiveCharacterTextSplitter
def parse_txt(FILE_PATH, *, chunk_size=1000, chunk_overlap=150):
    """
    Parse a plain-text file into overlapping chunks.

    Args:
        FILE_PATH (str | os.PathLike): Path to the text file, read as UTF-8.
        chunk_size (int): Target maximum chunk length, measured in
            whitespace-separated words (not characters). Defaults to 1000.
        chunk_overlap (int): Target overlap between consecutive chunks, in
            the same word-based unit. Defaults to 150.

    Returns:
        list[str]: The chunks in document order. An empty list is returned
        when FILE_PATH is not an existing regular file (an error message is
        printed in that case) or when the file holds only whitespace.

    Raises:
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    content = None
    if os.path.isfile(FILE_PATH):
        try:
            with open(FILE_PATH, "r", encoding="utf-8") as file:
                # Trim leading/trailing whitespace of the whole document.
                content = file.read().strip()
        except FileNotFoundError:
            # The file disappeared between the check and the open; report it
            # the same way as a file that never existed.
            content = None

    if content is None:
        print(f"Error: The file '{FILE_PATH}' does not exist.")
        return []

    if not content:
        # Nothing to split, so there is no need to build a splitter.
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Lengths are counted in words rather than characters.
        length_function=lambda text: len(text.split()),
        # Separators are interpreted as regular expressions.
        is_separator_regex=True,
    )
    return text_splitter.split_text(content)
# Disabled manual entry point: everything between the triple quotes below is a
# bare string literal, so none of it is executed.
# NOTE(review): FILE_PATH is not defined at module level in the visible source;
# it would have to be supplied before this block could be re-enabled.
42"""
43def main():
44 chunks = parse_txt(FILE_PATH)
45 # Output the list of sentences
46 for idx, sentence in enumerate(chunks, 1):
47 print(f"Sentence {idx}: {sentence}")
49if __name__ == "__main__":
50 main()
51"""