Coverage for app/doc_parsers/process_doc.py: 78%
27 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-20 21:23 +0000
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-20 21:23 +0000
1import os
4# Getting the parse function from the py files
5from app.doc_parsers.parse_pdf import parse_pdf
6from app.doc_parsers.parse_txt import parse_txt
8from app.doc_indexer.index_doc import index_and_add_to_db
9from app.models import Document
def process_doc(new_document: Document):
    """Parse a document and index its chunks into the vector DB.

    The document's binary contents are written to a temporary file under
    ``./app/data``. The file is handed to the parser that matches the
    document type (``pdf`` or ``txt``, compared case-insensitively) and the
    resulting chunks are passed to ``index_and_add_to_db``. Any other type
    is reported and skipped. The temporary file is always removed
    afterwards, including when writing, parsing or indexing raises.

    Args:
        new_document: Document to process. Its ``document_name``,
            ``document_type`` and ``file_contents`` (binary data)
            attributes are read. A falsy value is reported and ignored.

    Returns:
        None.

    Raises:
        Any exception raised while writing the temporary file, parsing it,
        or indexing the chunks propagates to the caller after the
        temporary file has been cleaned up.
    """
    if not new_document:
        print("No document to process")
        return

    # Ensure the 'data' folder exists. exist_ok avoids the race between a
    # separate existence check and the directory creation.
    data_folder = './app/data'
    os.makedirs(data_folder, exist_ok=True)

    # Make a temp filename and filepath.
    # NOTE(review): document_name is joined into the path unsanitized;
    # confirm upstream that it cannot contain path separators.
    file_name = f"{new_document.document_name}.{new_document.document_type}"
    file_path = os.path.join(data_folder, file_name)

    try:
        # Save the file contents (binary data) into the temp file.
        with open(file_path, 'wb') as f:
            f.write(new_document.file_contents)

        # Dispatch on the document's type, lowercased so "PDF" matches "pdf".
        ext = new_document.document_type.lower()

        if ext == "pdf":
            chunks = parse_pdf(file_path)
            index_and_add_to_db(chunks)
        elif ext == "txt":
            chunks = parse_txt(file_path)
            index_and_add_to_db(chunks)
        else:
            print(f"Cannont process this document {new_document.document_name}")
    finally:
        # Delete the temp file even when parsing or indexing failed, so
        # failed runs do not leave stale files behind in the data folder.
        if os.path.exists(file_path):
            os.remove(file_path)

    print(f"File {file_name} has processed")
    print("-" * 30)