pip install faiss-cpu sentence-transformers transformers pandas scikit-learn

pip install tabulate

##version 1m
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from tabulate import tabulate
import textwrap
import re
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics.pairwise import cosine_similarity

# Sample clinical notes that form the retrieval corpus: one free-text sentence
# per entry. Each entry is treated as its own chunk by the code below.
clinical_notes = [
    "Patient reports a history of asthma and experiences shortness of breath occasionally.",
    "Patient is currently on a medication regimen of inhaled corticosteroids.",
    "The patient has a known allergy to penicillin.",
    "The patient was diagnosed with type 2 diabetes three years ago.",
    "No history of cardiovascular disease.",
    "Patient is currently being treated for hypertension.",
    "The patient has experienced no recent weight loss or significant fatigue.",
    "Patient was prescribed metformin for diabetes management.",
    "Family history includes heart disease and diabetes.",
    "Patient is allergic to peanuts."
]

# Perform "agentic" chunking: label each note with a category (simple keyword rules stand in here for an LLM-based labeler)
def perform_agentic_chunking(notes):
    """Label each note with a coarse clinical category using keyword rules.

    This is a rule-based stand-in for an LLM labeler. Every note is checked
    against an ordered list of keyword rules and receives the label of the
    first rule that matches; notes matching no rule are labeled "Other".
    Matching is case-insensitive, so a keyword at the start of a sentence
    ("Allergic to ...", "Prescribed ...") is still recognised.

    Args:
        notes: Iterable of note strings.

    Returns:
        A list of ``(label, note)`` tuples in input order. The note text is
        returned unmodified.
    """
    # Ordered (label, keywords) rules: earlier rules win, which mirrors the
    # precedence History > Allergy > Medication > Diagnosis.
    keyword_rules = (
        ("History", ("history",)),
        # The stem "allerg" covers "allergy", "allergic" and "allergies".
        ("Allergy", ("allerg",)),
        ("Medication", ("medication", "prescribed")),
        ("Diagnosis", ("diagnosed", "treated")),
    )

    chunked_data = []
    for note in notes:
        lowered = note.lower()
        label = next(
            (
                name
                for name, keywords in keyword_rules
                if any(keyword in lowered for keyword in keywords)
            ),
            "Other",
        )
        chunked_data.append((label, note))
    return chunked_data

# Tag every clinical note with its category -> list of (label, text) pairs
chunked_data = perform_agentic_chunking(clinical_notes)

# Sentence-embedding model shared by the corpus and the queries
model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding vector per chunk, kept in the same order as chunked_data
embeddings = [model.encode([text])[0] for _tag, text in chunked_data]

# FAISS expects a 2-D float32 matrix (rows = chunks, columns = dimensions)
embedding_matrix = np.asarray(embeddings, dtype='float32')

# Exact (brute-force) L2 index sized to the embedding dimensionality
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

# The compound question the demo will answer
user_query = "What allergies does the patient have? and what has been prescribed?"

# Function to split the user query into individual questions
def split_query(query):
    """Split a compound query into individual questions.

    The query is cut at every standalone "and" / "or" (any capitalisation).
    Surrounding whitespace is stripped from each piece, and pieces that end
    up empty (e.g. from a leading or trailing conjunction, or an empty
    query) are dropped so they are never sent to retrieval.

    Note that the split is purely lexical: a conjunction *inside* a single
    question ("asthma and diabetes") also causes a split.

    Args:
        query: The raw user query string.

    Returns:
        A list of non-empty question strings in their original order.
    """
    fragments = re.split(r'\band\b|\bor\b', query, flags=re.IGNORECASE)
    stripped_fragments = (fragment.strip() for fragment in fragments)
    return [fragment for fragment in stripped_fragments if fragment]

# Break the compound user query into standalone questions so that each one
# can be run against the index separately.
questions = split_query(user_query)

# Function to perform retrieval for each question and unify results
def perform_rag_for_multiple_questions(questions, k=2):
    """Retrieve the nearest chunks for each question and concatenate the hits.

    Each question is embedded with the module-level ``model`` and searched
    against the module-level FAISS ``index``. The index is an L2 index, so
    the score attached to every hit is a distance (smaller = closer), not a
    cosine similarity.

    Args:
        questions: Iterable of question strings.
        k: Number of nearest chunks to retrieve per question (default 2).

    Returns:
        A flat list of ``[chunk_index, chunk_text, label, distance]`` rows.
        Rows are grouped by question in input order and, within each
        question, ordered from nearest to farthest.
    """
    unified_results = []
    for question in questions:
        # FAISS expects a 2-D batch, hence the reshape to (1, dim)
        query_embedding = model.encode([question])[0].reshape(1, -1)

        distances, indices = index.search(query_embedding, k=k)

        results = []
        for idx, dist in zip(indices[0], distances[0]):
            # FAISS pads the result with -1 when fewer than k vectors are
            # available; indexing chunked_data with -1 would silently return
            # the last chunk, so skip those slots.
            if idx < 0:
                continue
            idx = int(idx)
            results.append([idx, chunked_data[idx][1], chunked_data[idx][0], float(dist)])

        # Nearest first (ascending L2 distance) within this question
        results.sort(key=lambda row: row[3])

        unified_results.extend(results)

    return unified_results

# Run retrieval once per question; the per-question hits are concatenated, in
# question order, into one list of [index, chunk, label, distance] rows.
unified_retrieved_results = perform_rag_for_multiple_questions(questions)

# Heading for the table of labeled chunks and their (truncated) embeddings
print("\nOriginal Chunked Data with Embeddings:")

# Function to wrap text to a specified width
def wrap_text(text, width=50):
    """Re-flow *text* into newline-separated lines no longer than *width*."""
    return textwrap.fill(text, width=width)

# Build the preview rows: index, label, wrapped chunk text, and the first five
# embedding components (the full vectors are too long to print).
chunked_data_table = []
for i, (label, chunk) in enumerate(chunked_data[:10]):  # Only show first 10 for brevity
    wrapped_chunk = wrap_text(chunk, width=50)
    truncated_embedding = embeddings[i][:5].tolist()
    chunked_data_table.append([i, label, wrapped_chunk, truncated_embedding])

# Display the chunked data in a readable table
print(tabulate(chunked_data_table, headers=["Index", "Label", "Chunk", "Embedding (First 5)"], tablefmt="pretty"))

# Display the user query
print("\nUser Query:", user_query)

# List the individual questions only when the query was actually split
if len(questions) > 1:
    print("\nIndividual Questions:")
    for i, question in enumerate(questions, start=1):
        print(f"Question {i}: {question}")

# Display the retrieved results. The scores come from a FAISS IndexFlatL2
# search, so they are squared L2 distances (smaller = closer), not cosine
# similarities; rows are nearest-first within each question.
print("\nUnified Retrieved Results from Vector Database (Nearest First per Question, by L2 Distance):")
print(tabulate(unified_retrieved_results, headers=["Index", "Chunk", "Label", "Squared L2 Distance"], tablefmt="pretty"))

# 3D Vector Visualization
def plot_3d_cosine_similarity(query_embedding, unified_results):
    """Scatter-plot the query and the retrieved chunks in 3D.

    The three plotted coordinates are simply the first three components of
    each embedding vector; no dimensionality reduction is applied, so the
    picture is only a rough illustration and distances in the plot do not
    correspond to the retrieval scores.

    Args:
        query_embedding: 1-D embedding vector of the query.
        unified_results: Rows whose first element is the index of a
            retrieved chunk in the module-level ``embeddings`` list.
    """
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')

    # A chunk retrieved for several questions is plotted once; dict.fromkeys
    # removes duplicates while keeping first-seen order.
    chunk_indices = list(dict.fromkeys(row[0] for row in unified_results))

    # Query first, then the retrieved chunks
    embeddings_for_plotting = [query_embedding] + [embeddings[idx] for idx in chunk_indices]
    # Label chunks by their real index so points match the results table
    labels = ["Query"] + [f"Chunk-{idx}" for idx in chunk_indices]

    # Use the first three embedding components as x, y, z
    embeddings_3d = np.array([embedding[:3] for embedding in embeddings_for_plotting])

    ax.scatter(embeddings_3d[:, 0], embeddings_3d[:, 1], embeddings_3d[:, 2], color='b')

    # Annotate each point: the query in red, chunks in green
    for i, label in enumerate(labels):
        color = 'red' if i == 0 else 'green'
        ax.text(embeddings_3d[i, 0], embeddings_3d[i, 1], embeddings_3d[i, 2], label, color=color, fontsize=12)

    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title('3D Visualization of Query and Retrieved Chunks (First 3 Embedding Dimensions)')

    # Show the plot
    plt.show()

# Embed the full, un-split user query; this single vector is what the plot
# labels as "Query".
query_embedding = model.encode([user_query])[0]  # encode() gets a one-item list; take its only row

# Draw the query together with every chunk returned by retrieval
plot_3d_cosine_similarity(query_embedding, unified_retrieved_results)

Original Chunked Data with Embeddings:
+-------+------------+---------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+
| Index |   Label    |                       Chunk                       |                                                Embedding (First 5)                                                |
+-------+------------+---------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+
|   0   |  History   |      Patient reports a history of asthma and      |   [0.044502243399620056, 0.0701993927359581, -0.004177458118647337, 0.04402805492281914, -0.006464092992246151]   |
|       |            |   experiences shortness of breath occasionally.   |                                                                                                                   |
|   1   | Medication |  Patient is currently on a medication regimen of  |   [0.062104154378175735, 0.03556148707866669, -0.01986147277057171, -0.06374531239271164, -0.0810767114162445]    |
|       |            |             inhaled corticosteroids.              |                                                                                                                   |
|   2   |  Allergy   |  The patient has a known allergy to penicillin.   |  [0.02009832113981247, 0.012527728453278542, -0.01742633432149887, -0.02692382223904133, -0.027393842115998268]   |
|   3   | Diagnosis  |  The patient was diagnosed with type 2 diabetes   |     [0.0471331849694252, 0.016681335866451263, -0.0726516842842102, 0.0674281194806099, -0.08404749631881714]     |
|       |            |                 three years ago.                  |                                                                                                                   |
|   4   |  History   |       No history of cardiovascular disease.       |   [-0.007702366448938847, 0.09959591180086136, -0.06302200257778168, 0.04771367087960243, -0.05956929177045822]   |
|   5   | Diagnosis  |      Patient is currently being treated for       | [-0.031138945370912552, 0.06749303638935089, -0.05976958945393562, 0.0012347670271992683, -0.057038385421037674]  |
|       |            |                   hypertension.                   |                                                                                                                   |
|   6   |   Other    | The patient has experienced no recent weight loss |    [0.0429096482694149, 0.07027753442525864, -0.044426169246435165, 0.0680253803730011, -0.011801427230238914]    |
|       |            |              or significant fatigue.              |                                                                                                                   |
|   7   | Medication |   Patient was prescribed metformin for diabetes   |   [-0.04019788280129433, 0.034837037324905396, -0.06247192993760109, 0.06852300465106964, -0.10324154049158096]   |
|       |            |                    management.                    |                                                                                                                   |
|   8   |  History   |     Family history includes heart disease and     |  [-0.04784683138132095, 0.08995139598846436, -0.024482199922204018, 0.05959837883710861, -0.030672835186123848]   |
|       |            |                     diabetes.                     |                                                                                                                   |
|   9   |  Allergy   |          Patient is allergic to peanuts.          | [-0.0015491606900468469, 0.04758473113179207, -0.019133348017930984, 0.023979080840945244, -0.059614747762680054] |
+-------+------------+---------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+

User Query: What allergies does the patient have? and what has been prescribed?

Individual Questions:
Question 1: What allergies does the patient have?
Question 2: what has been prescribed?

Unified Retrieved Results from Vector Database (Sorted by Cosine Similarity):
+-------+--------------------------------------------------------------------------+------------+--------------------+
| Index |                                  Chunk                                   |   Label    | Cosine Similarity  |
+-------+--------------------------------------------------------------------------+------------+--------------------+
|   2   |              The patient has a known allergy to penicillin.              |  Allergy   | 0.6622791290283203 |
|   9   |                     Patient is allergic to peanuts.                      |  Allergy   | 0.747304379940033  |
|   1   | Patient is currently on a medication regimen of inhaled corticosteroids. | Medication | 1.1130704879760742 |
|   5   |           Patient is currently being treated for hypertension.           | Diagnosis  | 1.141774296760559  |
+-------+--------------------------------------------------------------------------+------------+--------------------+

Agentic Chunking