Coverage for app/doc_parsers/process_doc.py

1import os

4# Getting the parse function from the py files

5from app.doc_parsers.parse_pdf import parse_pdf

6from app.doc_parsers.parse_txt import parse_txt

8from app.doc_indexer.index_doc import index_and_add_to_db

9from app.models import Document

def process_doc(new_document: Document):
    """Write a document to a temporary file, parse it, and index the result.

    The document's binary contents are written to a file under
    ``./app/data``, the file is handed to the parser that matches the
    document type (``pdf`` or ``txt``, case-insensitive), and the parsed
    chunks are passed to ``index_and_add_to_db``. The temporary file is
    removed afterwards, including when parsing or indexing raises.

    Args:
        new_document: Document to process. The code reads its
            ``document_name``, ``document_type`` and ``file_contents``
            attributes; ``file_contents`` is written in binary mode.
            A falsy value is reported and ignored.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        Any exception raised while writing the file, parsing, or indexing
        propagates to the caller after the temporary file is cleaned up.
    """
    if not new_document:
        print("No document to process")
        return

    # Ensure the 'data' folder exists; exist_ok avoids a check-then-create race.
    data_folder = './app/data'
    os.makedirs(data_folder, exist_ok=True)

    # Build the temp filename. Only the final path component is used on disk
    # so a name containing separators or ".." cannot escape data_folder.
    file_name = f"{new_document.document_name}.{new_document.document_type}"
    file_path = os.path.join(data_folder, os.path.basename(file_name))

    try:
        # Persist the binary contents so the parsers can read from a path.
        with open(file_path, 'wb') as f:
            f.write(new_document.file_contents)

        # The document type doubles as the extension; compare in lowercase.
        ext = new_document.document_type.lower()

        if ext == "pdf":
            index_and_add_to_db(parse_pdf(file_path))
        elif ext == "txt":
            index_and_add_to_db(parse_txt(file_path))
        else:
            print(f"Cannont process this document {new_document.document_name}")
    finally:
        # Always delete the temp file, even if parsing or indexing failed,
        # so uploaded contents do not accumulate on disk.
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # The file was never created (e.g. open() itself failed).
            pass

    print(f"File {file_name} has processed")
    print("-" * 30)

Coverage for app/doc_parsers/process_doc.py: 78%