Fixed-length Chunking¶

  1. Fixed Size Chunking: The fixed_size_chunk function splits the input text into chunks of specified size (in terms of words).

  2. Storing Chunks in FAISS: The store_chunks_in_faiss function uses TF-IDF to convert the chunks into vectors and stores them in a FAISS index.

  3. Querying FAISS: The query_faiss function takes a user query, converts it into a vector, and retrieves the top k chunks most relevant to the query from the FAISS index.

  4. Visualization: The visualize_chunks function uses Matplotlib to create a horizontal bar graph displaying chunks.

    • The visualize_chunks function displays all the original chunks in different colors.
    • The visualize_retrieved_chunks function highlights only the chunks retrieved based on the query, using the same color scheme.
In [ ]:
%pip install -q faiss-cpu
%pip install -q scikit-learn
%pip install -q matplotlib
In [ ]:
import numpy as np
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import matplotlib.cm as cm

def fixed_size_chunk(text, chunk_size=15):
    """Break *text* into consecutive chunks of at most chunk_size words.

    Args:
        text: Input string; split on whitespace.
        chunk_size: Maximum number of words per chunk.

    Returns:
        List of chunk strings; the final chunk may hold fewer words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        pieces.append(' '.join(tokens[start:start + chunk_size]))
    return pieces

def store_chunks_in_faiss(chunks):
    """Embed *chunks* with TF-IDF and store the vectors in a FAISS index.

    Args:
        chunks: List of chunk strings to index.

    Returns:
        Tuple of (faiss.IndexFlatL2 containing one vector per chunk,
        the fitted TfidfVectorizer used to encode queries later).
    """
    vectorizer = TfidfVectorizer()
    # fit_transform returns a sparse matrix; densify and convert to
    # float32 once, since that is the dtype FAISS expects.
    X = vectorizer.fit_transform(chunks).toarray().astype('float32')

    # Initialize a flat L2 (Euclidean) distance index over the TF-IDF space.
    dimension = X.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(X)  # X is already a float32 ndarray; no extra np.array copy needed

    return index, vectorizer

def query_faiss(index, vectorizer, query, k=3):
    """Retrieve the indices of the top-k chunks most similar to *query*.

    Args:
        index: FAISS index built by store_chunks_in_faiss.
        vectorizer: The fitted TfidfVectorizer from the same call.
        query: User query string.
        k: Number of neighbours to return (capped at the index size).

    Returns:
        1-D array of chunk indices ordered by increasing L2 distance.
    """
    # Cap k at the number of stored vectors: FAISS pads missing
    # neighbours with index -1, which would later crash the
    # visualization functions when used to index `chunks`.
    k = min(k, index.ntotal)
    query_vector = vectorizer.transform([query]).toarray().astype('float32')
    _distances, indices = index.search(query_vector, k)
    return indices[0]

def visualize_chunks(chunks, retrieved_indices=None):
    """Draw every chunk as a horizontal bar with a gradient color.

    Args:
        chunks: List of chunk strings to display.
        retrieved_indices: Unused; accepted (now optional) only for
            call-site symmetry with visualize_retrieved_chunks.
    """
    num_chunks = len(chunks)
    # One evenly spaced viridis color per chunk.
    colors = cm.viridis(np.linspace(0, 1, num_chunks))

    plt.figure(figsize=(12, 6))

    for i, chunk in enumerate(chunks):
        plt.barh(i, 1, color=colors[i])
        # Center the chunk text inside its bar.
        plt.text(0.5, i, chunk, va='center', ha='center', fontsize=9, color='white')

    plt.title("Original Chunked Data Visualization")
    plt.xlabel("Chunks")
    plt.yticks(range(num_chunks), [f"Chunk {i + 1}" for i in range(num_chunks)])
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.show()

def visualize_retrieved_chunks(chunks, retrieved_indices):
    """Plot only the retrieved chunks, reusing the full gradient palette.

    Colors are generated for *all* chunks so each retrieved bar keeps
    the same color it has in the full-chunk visualization.

    Args:
        chunks: Complete list of chunk strings.
        retrieved_indices: Indices (into chunks) of the chunks to show.
    """
    total = len(chunks)
    palette = cm.viridis(np.linspace(0, 1, total))

    plt.figure(figsize=(12, 6))

    for idx in retrieved_indices:
        plt.barh(idx, 1, color=palette[idx])
        plt.text(0.5, idx, chunks[idx], va='center', ha='center', fontsize=9, color='white')

    plt.title("Retrieved Chunks Visualization")
    plt.xlabel("Chunks")
    plt.yticks(retrieved_indices, [f"Chunk {idx + 1}" for idx in retrieved_indices])
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.show()

def visualize_both(chunks, retrieved_indices):
    """Show the full chunk set and the retrieved subset side by side.

    The left panel draws every chunk; the right panel draws only the
    chunks selected by the query. Both panels share a single autumn
    gradient palette so matching chunks have matching colors.

    Args:
        chunks: Complete list of chunk strings.
        retrieved_indices: Indices (into chunks) of the retrieved chunks.
    """
    total = len(chunks)
    palette = cm.autumn(np.linspace(0, 1, total))

    fig, (ax_all, ax_hits) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: every chunk.
    for idx, chunk in enumerate(chunks):
        ax_all.barh(idx, 1, color=palette[idx])
        ax_all.text(0.5, idx, chunk, va='center', ha='center',  fontsize=9, color='black')
    ax_all.set_title("Original Chunked Data Visualization")
    ax_all.set_xlabel("Chunks")
    ax_all.set_yticks(range(total))
    ax_all.set_yticklabels([f"Chunk {idx + 1}" for idx in range(total)])
    ax_all.grid(axis='x', linestyle='--', alpha=0.7)

    # Right panel: only the retrieved chunks, same colors as the left.
    for idx in retrieved_indices:
        ax_hits.barh(idx, 1, color=palette[idx])
        ax_hits.text(0.5, idx, chunks[idx], va='center', ha='center',  fontsize=9,   color='black')
    ax_hits.set_title("Retrieved Chunks Visualization")
    ax_hits.set_xlabel("Chunks")
    ax_hits.set_yticks(retrieved_indices)
    ax_hits.set_yticklabels([f"Chunk {idx + 1}" for idx in retrieved_indices])
    ax_hits.grid(axis='x', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()
In [ ]:
# Example Usage
if __name__ == "__main__":
    # Sample document to index.
    text = ("This is an example of a long piece of text that will be chunked into smaller segments. "
            "These segments are designed to facilitate better retrieval and processing. "
            "Each chunk should maintain some contextual information for effective querying.")

    # Split the document into 10-word chunks.
    chunks = fixed_size_chunk(text, chunk_size=10)

    # Build the TF-IDF + FAISS index over the chunks.
    index, vectorizer = store_chunks_in_faiss(chunks)

    # Retrieve the chunks closest to the query.
    user_query = "smaller segments for better retrieval"
    retrieved_indices = query_faiss(index, vectorizer, user_query)

    # Compare the full chunk set with the retrieved subset side by side.
    visualize_both(chunks, retrieved_indices)
No description has been provided for this image