Sliding Window Chunking¶
In [ ]:
%pip install -q faiss-cpu sentence-transformers matplotlib scikit-learn
In [ ]:
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import textwrap
from sklearn.preprocessing import normalize
# Real-world Healthcare Text Data (Patient Monitoring)
patient_data = [
"Patient visited the clinic for a routine check-up. Blood pressure is 130/85, slightly high.",
"Patient had a surgery for knee replacement. Post-surgery recovery is on track, and pain levels are manageable.",
"Patient diagnosed with type 2 diabetes. Current treatment includes metformin, and blood sugar levels are being monitored.",
"Patient complained of frequent headaches. CT scan suggests early stages of migraine, prescribed medication for pain relief.",
"Patient had a follow-up consultation. Blood sugar levels have improved slightly, and weight reduction was observed.",
"Patient underwent a full-body checkup. Heart rate is normal, and no signs of cardiovascular issues were detected.",
"Patient was hospitalized due to an acute allergic reaction. Allergy tests were conducted, and antihistamines were prescribed.",
"Patient received the first dose of COVID-19 vaccine. Mild symptoms like fever and body aches were reported post-vaccination.",
"Patient is under observation for lung cancer treatment. Chemotherapy sessions are ongoing, and symptoms are being managed.",
"Patient had a routine eye exam. No significant changes in vision, but mild signs of cataracts were noted."
]
# Sliding Window Chunking Function
def sliding_window_chunking(texts, window_size=2, step_size=1):
chunks = []
for i in range(0, len(texts) - window_size + 1, step_size):
chunk = " ".join(texts[i:i + window_size])
chunks.append(chunk)
return chunks
# Text Embedding Using Sentence Transformers
def embed_text(texts):
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(texts)
return embeddings
# Normalize Embeddings and Create FAISS Index
def create_faiss_index(embeddings):
embeddings = normalize(embeddings, axis=1) # Normalize to unit vectors
dim = embeddings.shape[1] # Dimensionality of embeddings
index = faiss.IndexFlatL2(dim) # Use L2 distance for similarity
index.add(embeddings)
return index
# Search Function (RAG)
def search_query(query, index, model, top_k=3):
query_embedding = model.encode([query])
query_embedding = normalize(query_embedding, axis=1)
_, indices = index.search(query_embedding, top_k)
return indices[0]
# Visualization (Chunked Data & Search Results)
def plot_chunked_data(chunks):
fig, ax = plt.subplots(figsize=(10, 8)) # Increase figure size for better text placement
ax.barh(range(len(chunks)), [len(chunk.split()) for chunk in chunks], color='skyblue')
ax.set_title("Original Chunked Data Indices")
ax.set_xlabel("Number of Words per Chunk")
ax.set_ylabel("Chunk Index")
plt.yticks(range(len(chunks)), [f"Chunk {i}" for i in range(len(chunks))], fontsize=8)
# Wrap and place text within each bar
max_chars_per_line = 100 # Limit the number of characters per line in the bar
for i, chunk in enumerate(chunks):
# Wrap text to fit inside the bar
wrapped_text = textwrap.fill(chunk, width=max_chars_per_line)
ax.text(0, i, wrapped_text, ha='left', va='center', fontsize=9, color='black', wrap=True)
plt.tight_layout()
plt.show()
def display_retrieved_results(chunks, retrieved_indices, index):
# Create a list to hold the data for the table
data = []
for idx in retrieved_indices:
retrieved_chunk = chunks[idx]
dist = index.reconstruct(int(idx)).tolist()[0] # Get the distance as a list
data.append([idx, dist, retrieved_chunk])
# Create a DataFrame from the data
df = pd.DataFrame(data, columns=["Index", "Distance", "Chunk"])
# Function to wrap text to a maximum line length of 100 characters
def wrap_text(text, width=100):
return "<br>".join(textwrap.wrap(text, width=width))
# Apply the wrapping function to each cell in the DataFrame
wrapped_df = df.applymap(lambda x: wrap_text(str(x)))
# Display the table with custom styling: Bold headers, borders, and text wrapping in each cell
styled_df = wrapped_df.style.set_table_styles(
[{'selector': 'th', 'props': [('font-weight', 'bold'), ('border', '1px solid black')]}, # Bold headers and borders
{'selector': 'td', 'props': [('border', '1px solid black'), ('white-space', 'normal')]}, # Table cell borders and text wrapping
{'selector': 'table', 'props': [('border-collapse', 'collapse')]}] # Ensure borders collapse
)
# Display the styled DataFrame
print("\nRetrieved Chunks:")
display(styled_df)
# Putting it all together
if __name__ == "__main__":
# Sliding Window Chunking
chunks = sliding_window_chunking(patient_data, window_size=2, step_size=1)
# Embedding Chunks
embeddings = embed_text(chunks)
# FAISS Index Creation
index = create_faiss_index(embeddings)
# Plot Original Chunked Data
plot_chunked_data(chunks)
# User Query
query = "What are the recent updates on the patient's condition regarding diabetes?"
# Search Query in FAISS Index (RAG)
retrieved_indices = search_query(query, index, SentenceTransformer('all-MiniLM-L6-v2'), top_k=3)
# Display User Query (Simple Text)
print(f"\nUser Query: {query}")
# Display Retrieved Results
display_retrieved_results(chunks, retrieved_indices, index)
User Query: What are the recent updates on the patient's condition regarding diabetes? Retrieved Chunks:
| Index | Distance | Chunk | |
|---|---|---|---|
| 0 | 4 | 0.011669745668768883 | Patient had a follow-up consultation. Blood sugar levels have improved slightly, and weight reduction was observed. Patient underwent a full-body checkup. Heart rate is normal, and no signs of cardiovascular issues were detected. |
| 1 | 1 | -0.04715641960501671 | Patient had a surgery for knee replacement. Post-surgery recovery is on track, and pain levels are manageable. Patient diagnosed with type 2 diabetes. Current treatment includes metformin, and blood sugar levels are being monitored. |
| 2 | 2 | -0.02967824786901474 | Patient diagnosed with type 2 diabetes. Current treatment includes metformin, and blood sugar levels are being monitored. Patient complained of frequent headaches. CT scan suggests early stages of migraine, prescribed medication for pain relief. |