Coverage for app/doc_indexer/index_doc.py: 100%

28 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-20 21:23 +0000

1from langchain.schema.document import Document 

2from app import db 

3from flask import current_app 

4 

5 

def index_and_add_to_db(chunks: list[Document]) -> None:
    """
    Index document chunks and store the ones that are not yet in the vector database.

    Steps:
    1. Assign an ID to every chunk via ``calculate_chunk_ids`` (stored under
       ``chunk.metadata["id"]``).
    2. Read every ID currently held in ``vector_db.EmbeddingStore``.
    3. Keep only the chunks whose ID is not among those existing IDs.
    4. Pass the remaining chunks, together with their IDs, to
       ``vector_db.add_documents``.

    Nothing is written when every chunk is already present.

    Args:
        chunks (list[Document]): Document chunks to index. Their metadata is
            updated in place with an ``"id"`` entry.
    """
    chunks_with_ids = calculate_chunk_ids(chunks)

    # The vector store is attached to the running Flask application.
    vector_db = current_app.vector_db

    # Each result row holds a single column (the ID), hence the [0].
    # A set gives constant-time membership checks in the filter below.
    existing_ids = {
        row[0]
        for row in db.session.query(vector_db.EmbeddingStore.id).all()
    }

    new_chunks = [
        chunk
        for chunk in chunks_with_ids
        if chunk.metadata["id"] not in existing_ids
    ]

    # Guard clause: skip the insert call entirely when there is nothing new.
    if not new_chunks:
        return

    # IDs are passed explicitly so stored rows use the same IDs that the
    # existence check above compares against.
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
    vector_db.add_documents(new_chunks, ids=new_chunk_ids)

48 

49 

def calculate_chunk_ids(chunks):
    """
    Calculate and assign a unique ID to each document chunk.

    Each ID has the form ``"<source>:<page>:<chunk index>"``, for example
    ``"data/monopoly.pdf:6:2"``:
    - Source: data/monopoly.pdf
    - Page: 6
    - Chunk Index: 2

    The chunk index counts how many chunks with the same source and page have
    already been seen in ``chunks``, starting at 0. The count is kept per page,
    so chunks from the same page still receive distinct IDs when they are not
    adjacent in the list.

    ``source`` and ``page`` are read with ``dict.get``; a missing key is
    formatted as the literal text ``None`` in the ID.

    Args:
        chunks (list[Document]): Document objects to process. Each must expose
            a mutable ``metadata`` mapping.

    Returns:
        list[Document]: The same list object, with every chunk's metadata
        updated in place to include the ``"id"`` key.
    """
    # Next unused chunk index for every "source:page" key seen so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        # Take the next free index for this page and advance its counter.
        # Keeping a counter per page (not only for the previous chunk's
        # page) prevents duplicate IDs when pages are interleaved.
        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks