Coverage for app/doc_parsers/process_doc.py: 78%

27 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-20 21:23 +0000

1import os 

2 

3 

4# Getting the parse function from the py files 

5from app.doc_parsers.parse_pdf import parse_pdf 

6from app.doc_parsers.parse_txt import parse_txt 

7 

8from app.doc_indexer.index_doc import index_and_add_to_db 

9from app.models import Document 

10 

11 

12 

def process_doc(new_document: Document):
    """Write a document to a temporary file, parse it, and index its chunks.

    The document's binary contents are written to
    ``./app/data/<document_name>.<document_type>``. The file is then handed to
    the parser matching its type (``pdf`` or ``txt``, case-insensitive) and
    the resulting chunks are passed to ``index_and_add_to_db``. Any other type
    is reported on stdout and skipped. The temporary file is removed
    afterwards, including when parsing or indexing raises.

    Args:
        new_document: The document to process. Read attributes are
            ``document_name``, ``document_type`` and ``file_contents``
            (bytes). A falsy value is reported on stdout and ignored.

    Returns:
        None.
    """
    # Guard clause: nothing to do without a document.
    if not new_document:
        print("No document to process")
        return

    # Ensure the 'data' folder exists. exist_ok avoids the check-then-create
    # race of a separate os.path.exists() test.
    data_folder = './app/data'
    os.makedirs(data_folder, exist_ok=True)

    # Make a temp filename and filepath.
    # NOTE(review): document_name is used verbatim in the path; if it can come
    # from untrusted input, confirm it cannot contain path separators.
    file_name = f"{new_document.document_name}.{new_document.document_type}"
    file_path = os.path.join(data_folder, file_name)

    try:
        # Save the binary contents so the parsers can read them from disk.
        with open(file_path, 'wb') as f:
            f.write(new_document.file_contents)

        # Dispatch on the lower-cased document type.
        ext = new_document.document_type.lower()

        if ext == "pdf":
            chunks = parse_pdf(file_path)
            index_and_add_to_db(chunks)
        elif ext == "txt":
            chunks = parse_txt(file_path)
            index_and_add_to_db(chunks)
        else:
            print(f"Cannont process this document {new_document.document_name}")
    finally:
        # Always delete the temp file, even if writing, parsing or indexing
        # failed, so failed runs do not leave files behind in the data folder.
        if os.path.exists(file_path):
            os.remove(file_path)

    # Only reached when no exception propagated.
    print(f"File {file_name} has processed")
    print("-" * 30)

67 

68 

69 

70 

71 

72