Coverage for tests/test_index_doc.py: 100%

33 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-20 21:23 +0000

1import sys 

2import os 

3import pytest 

4from flask import current_app 

5from app.doc_parsers.parse_pdf import parse_pdf 

6from app.doc_indexer.index_doc import index_and_add_to_db 

7from app import db 

8 

9 

def test_index_doc(client, app, clean_vector_db):
    """
    Index the chunks parsed from the sample PDF and check the stored row count.

    The sample document is parsed, its chunks are indexed inside an
    application context, and the number of ``EmbeddingStore`` ids read back
    must equal the number of parsed chunks.
    """
    pdf_path = "./tests/test_data/Dna.pdf"

    # Parsing has to yield something, otherwise the count comparison is moot.
    parsed_chunks = parse_pdf(pdf_path)
    assert len(parsed_chunks) > 0, "Parsing failed, no chunks extracted"

    with app.app_context():
        index_and_add_to_db(parsed_chunks)
        # Read the ids back through the store model exposed on the current app.
        embedding_store = current_app.vector_db.EmbeddingStore
        stored_ids = db.session.query(embedding_store.id).all()

    expected_total = len(parsed_chunks)
    assert len(stored_ids) == expected_total, "Not all chunks were indexed successfully"

29 

30 

def test_duplicate_chunks_ignored(client, app, clean_vector_db):
    """
    Ensure that indexing the same chunks twice does not add new rows.

    The sample PDF is parsed once and indexed twice inside one application
    context; the ``EmbeddingStore`` row count must be the same after both
    runs. The test also requires that parsing produced chunks and that the
    first run stored at least one row, so it cannot pass vacuously when
    nothing is indexed at all.
    """
    FILE_PATH = "./tests/test_data/Dna.pdf"

    chunks = parse_pdf(FILE_PATH)
    assert len(chunks) > 0, "Parsing failed, no chunks extracted"

    with app.app_context():
        index_and_add_to_db(chunks)  # First indexing
        vector_db = current_app.vector_db
        initial_count = db.session.query(vector_db.EmbeddingStore.id).count()
        # Without this check, 0 == 0 would satisfy the duplicate assertion.
        assert initial_count > 0, "First indexing pass stored no chunks"

        index_and_add_to_db(chunks)  # Attempt re-indexing
        final_count = db.session.query(vector_db.EmbeddingStore.id).count()

    assert initial_count == final_count, "Duplicate chunks should not be indexed again"

46 

47 

def test_empty_file(client, app, clean_vector_db):
    """
    Check that indexing an empty chunk list stores nothing.

    No file is parsed here: an empty list stands in for a document that
    produced no chunks, and the ``EmbeddingStore`` id query must come back
    empty afterwards.
    """
    no_chunks = []

    with app.app_context():
        index_and_add_to_db(no_chunks)
        embedding_store = current_app.vector_db.EmbeddingStore
        stored_ids = db.session.query(embedding_store.id).all()

    # The query result is a list, so emptiness is the same as a zero length.
    assert not stored_ids, "No chunks should be indexed for an empty file"