Coverage for tests/test_index_doc.py: 100%
33 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-20 21:23 +0000
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-20 21:23 +0000
1import sys
2import os
3import pytest
4from flask import current_app
5from app.doc_parsers.parse_pdf import parse_pdf
6from app.doc_indexer.index_doc import index_and_add_to_db
7from app import db
def test_index_doc(client, app, clean_vector_db):
    """
    ✅ Verify that every chunk parsed from the sample PDF is stored in the vector database.

    The sample document is parsed into chunks, the chunks are indexed inside
    an application context, and the number of stored embedding ids is
    compared with the number of parsed chunks.
    """
    pdf_path = "./tests/test_data/Dna.pdf"

    # Parsing must yield something, otherwise the indexing check is meaningless.
    parsed_chunks = parse_pdf(pdf_path)
    assert len(parsed_chunks) > 0, "Parsing failed, no chunks extracted"

    with app.app_context():
        # current_app is only usable while the application context is active.
        index_and_add_to_db(parsed_chunks)
        embedding_store = current_app.vector_db.EmbeddingStore
        stored_ids = db.session.query(embedding_store.id).all()

        expected_total = len(parsed_chunks)
        assert len(stored_ids) == expected_total, "Not all chunks were indexed successfully"
def test_duplicate_chunks_ignored(client, app, clean_vector_db):
    """
    ❌ Ensures duplicate document chunks are not reindexed.

    The same parsed chunks are indexed twice inside one application context;
    the number of stored embedding ids must be identical after both passes.

    Guards are included so the test cannot pass vacuously: parsing must
    produce at least one chunk and the first indexing pass must store at
    least one row, otherwise ``0 == 0`` would satisfy the final comparison.
    """
    FILE_PATH = "./tests/test_data/Dna.pdf"

    chunks = parse_pdf(FILE_PATH)
    # Same precondition as test_index_doc: nothing to deduplicate without chunks.
    assert len(chunks) > 0, "Parsing failed, no chunks extracted"

    with app.app_context():
        vector_db = current_app.vector_db

        index_and_add_to_db(chunks)  # First indexing
        initial_count = db.session.query(vector_db.EmbeddingStore.id).count()
        # Without stored rows the duplicate check below would prove nothing.
        assert initial_count > 0, "First indexing stored no chunks"

        index_and_add_to_db(chunks)  # Attempt re-indexing
        final_count = db.session.query(vector_db.EmbeddingStore.id).count()

        assert initial_count == final_count, "Duplicate chunks should not be indexed again"
def test_empty_file(client, app, clean_vector_db):
    """
    ❌ Verify that indexing an empty chunk list stores nothing.

    No file is parsed here: an empty list is passed directly to the indexer
    to represent the chunks of an empty PDF, and the embedding store is then
    expected to contain no ids.
    """
    no_chunks = []

    with app.app_context():
        index_and_add_to_db(no_chunks)
        embedding_store = current_app.vector_db.EmbeddingStore
        stored_ids = db.session.query(embedding_store.id).all()

        assert not stored_ids, "No chunks should be indexed for an empty file"