Coverage for tests/test_parser.py: 100%

42 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-20 21:23 +0000

1import sys 

2import os 

3import pytest 

4 

5# Add the project root to Python's module path 

6sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 

7 

8# Import the parser functions under test

9from app.doc_parsers.parse_pdf import parse_pdf 

10from app.doc_parsers.parse_txt import parse_txt 

11 

12 

def test_parse_pdf():
    """Check that ``parse_pdf`` returns a non-empty list for the sample PDF.

    The sample file path is relative to the current working directory, so
    the test expects to be run from the project root.
    """
    FILE_PATH = "./tests/test_data/Dna.pdf"

    result = parse_pdf(FILE_PATH)

    # Verify the type first: calling len() on a non-sized result (e.g. None)
    # would raise TypeError and hide the more informative message below.
    assert isinstance(result, list), "Result is not a list."

    # Only once the type is known is the emptiness check meaningful.
    assert len(result) > 0, "No PDFs were parsed."

29 

30 

def test_parse_txt():
    """Check that ``parse_txt`` returns a non-empty list for the sample text file.

    The sample file path is relative to the current working directory, so
    the test expects to be run from the project root.
    """
    FILE_PATH = "./tests/test_data/Dna.txt"

    result = parse_txt(FILE_PATH)

    # Verify the type first: calling len() on a non-sized result (e.g. None)
    # would raise TypeError and hide the more informative message below.
    assert isinstance(result, list), "Result is not a list."

    # Only once the type is known is the emptiness check meaningful.
    assert len(result) > 0, "No txts were parsed."

48 

49 

50 

51# Test for the formatted PDF loader

52import fitz # PyMuPDF 

def _collect_text_spans(file_path):
    """Return the non-empty text spans found in the PDF at *file_path*.

    Each entry is a dict with the stripped ``text``, the span's ``font``
    name and ``size``, and a ``bold`` flag that is true when the font name
    contains ``"Bold"``.
    """
    spans = []
    # Use the document as a context manager so the file handle is released
    # even if text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # In PyMuPDF's dict output, type 0 marks a text block;
                # other block types carry no "lines" to walk.
                if block["type"] != 0:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"].strip()
                        if not text:
                            continue
                        font = span["font"]
                        spans.append({
                            "text": text,
                            "font": font,
                            "size": span["size"],
                            "bold": "Bold" in font,
                        })
    return spans


def test_formatted_pdf(file_path="./tests/test_data/formatted_test_pdf.pdf"):
    """Check font weight and size of known spans in the formatted sample PDF.

    Looks up the title, the first bullet point and the ``Name`` table header
    by exact text and asserts on their boldness and font size.

    Args:
        file_path: Path of the PDF to inspect. The default is relative to
            the current working directory (the project root).
    """
    spans = _collect_text_spans(file_path)

    # --- Check Title ---
    title = next((s for s in spans if s["text"] == "Sample Formatted PDF"), None)
    assert title is not None, "Title text not found"
    assert title["bold"], "Title is not bold"
    assert title["size"] >= 16.0, "Title font size is too small"

    # --- Check Bullet Point ---
    bullet = next((s for s in spans if s["text"] == "- First bullet point"), None)
    assert bullet is not None, "Bullet point not found"
    assert not bullet["bold"], "Bullet point should not be bold"
    assert bullet["size"] == 12.0, "Bullet point font size incorrect"

    # --- Check Table Header ---
    header = next((s for s in spans if s["text"] == "Name"), None)
    assert header is not None, "Table header 'Name' not found"
    assert header["bold"], "'Name' should be bold"
    assert header["size"] == 12.0, "Header font size incorrect"

    print("All formatting checks passed!")

91 

92# Example usage: 

93# test_formatted_pdf()