Sliding Window Chunking¶

In [ ]:
%pip install -q faiss-cpu sentence-transformers matplotlib scikit-learn
In [ ]:
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import textwrap
from sklearn.preprocessing import normalize

# Real-world Healthcare Text Data (Patient Monitoring)
patient_data = [
    "Patient visited the clinic for a routine check-up. Blood pressure is 130/85, slightly high.",
    "Patient had a surgery for knee replacement. Post-surgery recovery is on track, and pain levels are manageable.",
    "Patient diagnosed with type 2 diabetes. Current treatment includes metformin, and blood sugar levels are being monitored.",
    "Patient complained of frequent headaches. CT scan suggests early stages of migraine, prescribed medication for pain relief.",
    "Patient had a follow-up consultation. Blood sugar levels have improved slightly, and weight reduction was observed.",
    "Patient underwent a full-body checkup. Heart rate is normal, and no signs of cardiovascular issues were detected.",
    "Patient was hospitalized due to an acute allergic reaction. Allergy tests were conducted, and antihistamines were prescribed.",
    "Patient received the first dose of COVID-19 vaccine. Mild symptoms like fever and body aches were reported post-vaccination.",
    "Patient is under observation for lung cancer treatment. Chemotherapy sessions are ongoing, and symptoms are being managed.",
    "Patient had a routine eye exam. No significant changes in vision, but mild signs of cataracts were noted."
]

# Sliding Window Chunking Function
def sliding_window_chunking(texts, window_size=2, step_size=1):
    chunks = []
    for i in range(0, len(texts) - window_size + 1, step_size):
        chunk = " ".join(texts[i:i + window_size])
        chunks.append(chunk)
    return chunks

# Text Embedding Using Sentence Transformers
def embed_text(texts):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(texts)
    return embeddings

# Normalize Embeddings and Create FAISS Index
def create_faiss_index(embeddings):
    embeddings = normalize(embeddings, axis=1)  # Normalize to unit vectors
    dim = embeddings.shape[1]  # Dimensionality of embeddings
    index = faiss.IndexFlatL2(dim)  # Use L2 distance for similarity
    index.add(embeddings)
    return index

# Search Function (RAG)
def search_query(query, index, model, top_k=3):
    query_embedding = model.encode([query])
    query_embedding = normalize(query_embedding, axis=1)
    _, indices = index.search(query_embedding, top_k)
    return indices[0]

# Visualization (Chunked Data & Search Results)
def plot_chunked_data(chunks):
    fig, ax = plt.subplots(figsize=(10, 8))  # Increase figure size for better text placement
    ax.barh(range(len(chunks)), [len(chunk.split()) for chunk in chunks], color='skyblue')
    ax.set_title("Original Chunked Data Indices")
    ax.set_xlabel("Number of Words per Chunk")
    ax.set_ylabel("Chunk Index")
    plt.yticks(range(len(chunks)), [f"Chunk {i}" for i in range(len(chunks))], fontsize=8)

    # Wrap and place text within each bar
    max_chars_per_line = 100  # Limit the number of characters per line in the bar
    for i, chunk in enumerate(chunks):
        # Wrap text to fit inside the bar
        wrapped_text = textwrap.fill(chunk, width=max_chars_per_line)
        ax.text(0, i, wrapped_text, ha='left', va='center', fontsize=9, color='black', wrap=True)

    plt.tight_layout()
    plt.show()

def display_retrieved_results(chunks, retrieved_indices, index):
    # Create a list to hold the data for the table
    data = []
    
    for idx in retrieved_indices:
        retrieved_chunk = chunks[idx]
        dist = index.reconstruct(int(idx)).tolist()[0]  # Get the distance as a list
        data.append([idx, dist, retrieved_chunk])
    
    # Create a DataFrame from the data
    df = pd.DataFrame(data, columns=["Index", "Distance", "Chunk"])

    # Function to wrap text to a maximum line length of 100 characters
    def wrap_text(text, width=100):
        return "<br>".join(textwrap.wrap(text, width=width))
    
    # Apply the wrapping function to each cell in the DataFrame
    wrapped_df = df.applymap(lambda x: wrap_text(str(x)))

    # Display the table with custom styling: Bold headers, borders, and text wrapping in each cell
    styled_df = wrapped_df.style.set_table_styles(
        [{'selector': 'th', 'props': [('font-weight', 'bold'), ('border', '1px solid black')]},  # Bold headers and borders
         {'selector': 'td', 'props': [('border', '1px solid black'), ('white-space', 'normal')]},  # Table cell borders and text wrapping
         {'selector': 'table', 'props': [('border-collapse', 'collapse')]}]  # Ensure borders collapse
    )

    # Display the styled DataFrame
    print("\nRetrieved Chunks:")
    display(styled_df)

# Putting it all together
if __name__ == "__main__":
    # Sliding Window Chunking
    chunks = sliding_window_chunking(patient_data, window_size=2, step_size=1)
    
    # Embedding Chunks
    embeddings = embed_text(chunks)
    
    # FAISS Index Creation
    index = create_faiss_index(embeddings)
    
    # Plot Original Chunked Data
    plot_chunked_data(chunks)

    # User Query
    query = "What are the recent updates on the patient's condition regarding diabetes?"
    
    # Search Query in FAISS Index (RAG)
    retrieved_indices = search_query(query, index, SentenceTransformer('all-MiniLM-L6-v2'), top_k=3)
    
    # Display User Query (Simple Text)
    print(f"\nUser Query: {query}")
    
    # Display Retrieved Results
    display_retrieved_results(chunks, retrieved_indices, index)
No description has been provided for this image
User Query: What are the recent updates on the patient's condition regarding diabetes?

Retrieved Chunks:
  Index Distance Chunk
0 4 0.011669745668768883 Patient had a follow-up consultation. Blood sugar levels have improved slightly, and weight
reduction was observed. Patient underwent a full-body checkup. Heart rate is normal, and no signs of
cardiovascular issues were detected.
1 1 -0.04715641960501671 Patient had a surgery for knee replacement. Post-surgery recovery is on track, and pain levels are
manageable. Patient diagnosed with type 2 diabetes. Current treatment includes metformin, and blood
sugar levels are being monitored.
2 2 -0.02967824786901474 Patient diagnosed with type 2 diabetes. Current treatment includes metformin, and blood sugar levels
are being monitored. Patient complained of frequent headaches. CT scan suggests early stages of
migraine, prescribed medication for pain relief.