Fixed-length Chunking
Fixed Size Chunking: The fixed_size_chunk function splits the input text into chunks of specified size (in terms of words).
Storing Chunks in FAISS: The store_chunks_in_faiss function uses TF-IDF to convert the chunks into vectors and stores them in a FAISS index.
Querying FAISS: The query_faiss function takes a user query, converts it into a vector, and retrieves the top k relevant chunks to the user query from the FAISS index.
Visualization: The visualize_chunks function uses Matplotlib to create a horizontal bar graph displaying chunks.
- The visualize_chunks function displays all the original chunks in different colors.
- The visualize_retrieved_chunks function highlights only the chunks retrieved based on the query, using the same color scheme.
In [ ]:
%pip install -q faiss-cpu
%pip install -q scikit-learn
%pip install -q matplotlib
In [ ]:
import numpy as np
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import matplotlib.cm as cm
def fixed_size_chunk(text, chunk_size=15):
    """Break *text* into consecutive fixed-size chunks of words.

    Args:
        text: Input string to split on whitespace.
        chunk_size: Maximum number of words per chunk.

    Returns:
        List of strings, each holding up to ``chunk_size`` words; the
        final chunk may be shorter. Empty input yields an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        pieces.append(' '.join(tokens[start:start + chunk_size]))
    return pieces
def store_chunks_in_faiss(chunks):
    """Embed chunks with TF-IDF and store them in a FAISS index.

    Args:
        chunks: List of chunk strings to index.

    Returns:
        Tuple ``(index, vectorizer)`` — the FAISS ``IndexFlatL2`` holding
        one vector per chunk, and the fitted ``TfidfVectorizer`` (needed
        later to embed queries into the same vector space).
    """
    vectorizer = TfidfVectorizer()
    # fit_transform returns a sparse matrix; densify once and convert to
    # float32 up front — FAISS only accepts contiguous float32 arrays.
    # (The original wrapped X in a redundant np.array() copy at add-time.)
    X = vectorizer.fit_transform(chunks).toarray().astype('float32')
    dimension = X.shape[1]
    index = faiss.IndexFlatL2(dimension)  # L2 (Euclidean) distance for similarity
    index.add(X)
    return index, vectorizer
def query_faiss(index, vectorizer, query, k=3):
    """Return the indices of the top-``k`` chunks nearest to *query*.

    Args:
        index: FAISS index holding the chunk vectors.
        vectorizer: Fitted vectorizer used to embed the query.
        query: Free-text query string.
        k: Number of neighbours to retrieve.

    Returns:
        1-D array of the ``k`` nearest chunk indices (distances discarded).
    """
    embedded = vectorizer.transform([query]).toarray().astype('float32')
    _, neighbour_ids = index.search(embedded, k)
    return neighbour_ids[0]
def visualize_chunks(chunks, retrieved_indices):
    """Render every chunk as a horizontal bar with a viridis gradient.

    Args:
        chunks: All chunk strings to display.
        retrieved_indices: Accepted for signature parity with the other
            visualizers; not used by this function.
    """
    total = len(chunks)
    palette = cm.viridis(np.linspace(0, 1, total))  # one gradient color per chunk
    plt.figure(figsize=(12, 6))
    for pos in range(total):
        plt.barh(pos, 1, color=palette[pos])
        # Overlay the chunk text centered on its bar.
        plt.text(0.5, pos, chunks[pos], va='center', ha='center', fontsize=9, color='white')
    plt.title("Original Chunked Data Visualization")
    plt.xlabel("Chunks")
    plt.yticks(range(total), [f"Chunk {n + 1}" for n in range(total)])
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.show()
def visualize_retrieved_chunks(chunks, retrieved_indices):
    """Render only the retrieved chunks, reusing the full gradient palette.

    Args:
        chunks: All chunk strings (colors are assigned over the full set so
            they match ``visualize_chunks``).
        retrieved_indices: Indices into *chunks* selected by the query.
    """
    total = len(chunks)
    # Palette spans ALL chunks so retrieved bars keep their original colors.
    palette = cm.viridis(np.linspace(0, 1, total))
    plt.figure(figsize=(12, 6))
    for idx in retrieved_indices:
        plt.barh(idx, 1, color=palette[idx])
        plt.text(0.5, idx, chunks[idx], va='center', ha='center', fontsize=9, color='white')
    plt.title("Retrieved Chunks Visualization")
    plt.xlabel("Chunks")
    plt.yticks(retrieved_indices, [f"Chunk {idx + 1}" for idx in retrieved_indices])
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.show()
def visualize_both(chunks, retrieved_indices):
    """Display original and retrieved chunk charts side by side.

    Left panel shows every chunk; right panel shows only the retrieved
    ones, using the same per-chunk colors so they are easy to match.

    Args:
        chunks: All chunk strings.
        retrieved_indices: Indices into *chunks* returned by the query.
    """
    num_chunks = len(chunks)
    colors = cm.autumn(np.linspace(0, 1, num_chunks))  # one color per chunk, shared by both panels
    fig, axs = plt.subplots(1, 2, figsize=(14, 6))
    # Left panel: every original chunk.
    for i, chunk in enumerate(chunks):
        axs[0].barh(i, 1, color=colors[i])
        axs[0].text(0.5, i, chunk, va='center', ha='center', fontsize=9, color='black')
    axs[0].set_title("Original Chunked Data Visualization")
    axs[0].set_xlabel("Chunks")
    axs[0].set_yticks(range(num_chunks))
    axs[0].set_yticklabels([f"Chunk {i + 1}" for i in range(num_chunks)])
    axs[0].grid(axis='x', linestyle='--', alpha=0.7)
    # Right panel: only the retrieved chunks, same colors for easy matching.
    for i in retrieved_indices:
        axs[1].barh(i, 1, color=colors[i])
        axs[1].text(0.5, i, chunks[i], va='center', ha='center', fontsize=9, color='black')
    axs[1].set_title("Retrieved Chunks Visualization")
    axs[1].set_xlabel("Chunks")
    axs[1].set_yticks(retrieved_indices)
    axs[1].set_yticklabels([f"Chunk {i + 1}" for i in retrieved_indices])
    axs[1].grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()
In [ ]:
# Example Usage
if __name__ == "__main__":
    # 1) Sample text to be chunked.
    text = ("This is an example of a long piece of text that will be chunked into smaller segments. "
            "These segments are designed to facilitate better retrieval and processing. "
            "Each chunk should maintain some contextual information for effective querying.")
    # 2) Split the text into 10-word chunks.
    chunks = fixed_size_chunk(text, chunk_size=10)
    # 3) Vectorize the chunks and build the FAISS index.
    index, vectorizer = store_chunks_in_faiss(chunks)
    # 4) Retrieve the chunks nearest to the query.
    user_query = "smaller segments for better retrieval"
    retrieved_indices = query_faiss(index, vectorizer, user_query)
    # 5) Plot the original and retrieved chunks side by side.
    visualize_both(chunks, retrieved_indices)