%pip install -q numpy pandas faiss-cpu scikit-learn matplotlib seaborn

Note: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
petastorm 0.12.1 requires pyspark>=2.1.0, which is not installed.
databricks-feature-store 0.14.3 requires pyspark<4,>=3.1.2, which is not installed.
ydata-profiling 4.2.0 requires numpy<1.24,>=1.16.0, but you have numpy 2.1.3 which is incompatible.
ydata-profiling 4.2.0 requires scipy<1.11,>=1.4.1, but you have scipy 1.14.1 which is incompatible.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 2.1.3 which is incompatible.
mleap 0.20.0 requires scikit-learn<0.23.0,>=0.22.0, but you have scikit-learn 1.1.1 which is incompatible.
langchain 0.0.217 requires numpy<2,>=1, but you have numpy 2.1.3 which is incompatible.
databricks-feature-store 0.14.3 requires numpy<2,>=1.19.2, but you have numpy 2.1.3 which is incompatible.
Note: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
import faiss

# Step 1: Create Co-occurrence Matrix Embedding
def create_cooccurrence_matrix(sentence):
    """Build a symmetric adjacent-word co-occurrence matrix for *sentence*.

    The text is tokenised with ``str.split()``, so tokens are separated by
    any whitespace (including newlines) and punctuation stays attached to
    the word it touches ("account." and "account" are different tokens).

    Args:
        sentence: Text to analyse.

    Returns:
        A ``(co_matrix, word_to_idx)`` tuple. ``word_to_idx`` maps each
        distinct token to a row/column index, numbered in order of first
        appearance. ``co_matrix`` is a square float array where entry
        ``[i, j]`` counts how often tokens ``i`` and ``j`` appear next to
        each other, in either order. Empty input yields a ``(0, 0)`` matrix
        and an empty mapping.
    """
    words = sentence.split()

    # dict.fromkeys de-duplicates while keeping first-appearance order, so
    # the index assignment is the same on every run (iterating a set of
    # strings is not, because of hash randomisation).
    word_to_idx = {word: idx for idx, word in enumerate(dict.fromkeys(words))}

    vocab_size = len(word_to_idx)
    co_matrix = np.zeros((vocab_size, vocab_size))

    # Walk over each pair of neighbouring tokens.
    for left, right in zip(words, words[1:]):
        row, col = word_to_idx[left], word_to_idx[right]
        co_matrix[row, col] += 1
        co_matrix[col, row] += 1  # Undirected: mirror the count

    return co_matrix, word_to_idx

# Step 2: Embed using SVD (Co-occurrence Matrix Embedding)
def embed_using_svd(co_matrix, n_components=2, random_state=None):
    """Reduce each row of *co_matrix* to a dense vector with truncated SVD.

    Args:
        co_matrix: 2-D array with one row per word.
        n_components: Number of embedding dimensions to keep.
        random_state: Seed forwarded to ``TruncatedSVD``. Its default solver
            is randomised, so pass an integer to get identical embeddings
            on every run; ``None`` keeps the previous unseeded behaviour.

    Returns:
        Array of shape ``(co_matrix.shape[0], n_components)``.
    """
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    return svd.fit_transform(co_matrix)

# Step 3: Store in FAISS
def store_in_faiss(embeddings):
    """Build a flat L2 FAISS index over unit-normalised copies of *embeddings*.

    The caller's array is not modified: ``astype`` returns a new array and
    the in-place normalisation is applied to that copy. Because the stored
    vectors are unit length, query vectors need the same normalisation for
    the reported distances to be meaningful.

    Args:
        embeddings: 2-D NumPy array with one vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` containing the normalised vectors.
    """
    vectors = embeddings.astype(np.float32)  # FAISS works on float32
    n_dims = vectors.shape[1]

    flat_index = faiss.IndexFlatL2(n_dims)  # L2 distance metric
    faiss.normalize_L2(vectors)  # In place: scale every row to unit length
    flat_index.add(vectors)
    return flat_index

# Step 4: Retrieve Nearest Neighbour
def retrieve_nearest_neighbours(index, query_vector, k=5):
    """Return the *k* stored vectors closest to *query_vector*.

    ``store_in_faiss`` adds L2-normalised vectors to the index, so the query
    is normalised the same way before searching. Without this, a vector
    looked up against its own stored copy reports a non-zero distance and
    the ranking is skewed by the query's length.

    Args:
        index: Object exposing ``search(queries, k) -> (distances, ids)``,
            such as the index built by ``store_in_faiss``.
        query_vector: One query of shape ``(dim,)`` or a batch of shape
            ``(n, dim)``. It is never modified.
        k: Number of neighbours to request per query.

    Returns:
        ``(I, D)``: neighbour ids and their distances, in that order, as
        returned by ``index.search``.
    """
    # Always work on a private, C-contiguous float32 2-D copy so the
    # caller's array is untouched and the index receives the layout it needs.
    query = np.array(query_vector, dtype=np.float32, order="C", ndmin=2)

    # Scale each row to unit length; all-zero rows are left as they are
    # rather than being divided by zero.
    norms = np.linalg.norm(query, axis=1, keepdims=True)
    np.divide(query, norms, out=query, where=norms > 0)

    D, I = index.search(query, k)
    return I, D

# Step 5: Visualizing Co-occurrence Matrix as Heatmap
def plot_cooccurrence_heatmap(co_matrix, word_to_idx):
    """Show *co_matrix* as an annotated heatmap.

    Both axes are labelled with the keys of *word_to_idx* in the mapping's
    iteration order; this assumes that order matches the matrix's row and
    column order.

    Args:
        co_matrix: Square 2-D array of co-occurrence counts.
        word_to_idx: Mapping whose keys are the words to use as tick labels.
    """
    tick_labels = word_to_idx.keys()

    plt.figure(figsize=(6, 6))
    sns.heatmap(
        co_matrix,
        annot=True,  # Print the count inside each cell
        fmt='.2f',
        cmap='coolwarm',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )

    plt.title('Co-occurrence Matrix Heatmap')
    plt.xlabel('Words')
    plt.ylabel('Words')
    plt.show()

# Step 6: Tabular Representation of Embedding Data with Borders
def display_embedding_table(embeddings, word_to_idx):
    """Render the embedding matrix as a bordered table, one row per word.

    Rows are labelled with the keys of *word_to_idx* and columns are named
    "Embedding 1", "Embedding 2", and so on. The table is shown through the
    notebook's ``display`` function.

    Args:
        embeddings: 2-D array with one embedding per row.
        word_to_idx: Mapping whose keys label the rows, in iteration order.
    """
    n_dims = embeddings.shape[1]
    column_names = [f'Embedding {number}' for number in range(1, n_dims + 1)]

    table = pd.DataFrame(embeddings, index=list(word_to_idx.keys()))
    table.columns = column_names

    # Shared CSS fragments for the bordered, centred layout.
    cell_border = ('border', '1px solid black')
    centred = [('text-align', 'center')]
    table_styles = [
        {'selector': 'thead th', 'props': [cell_border, ('background-color', '#f2f2f2')]},  # header style
        {'selector': 'tbody td', 'props': [cell_border]},  # cell border
        {'selector': 'table', 'props': [('border-collapse', 'collapse'), ('width', '100%')]},  # table styles
        {'selector': 'thead', 'props': centred},  # center align header
        {'selector': 'tbody', 'props': centred},  # center align body
    ]

    display(table.style.set_table_styles(table_styles))

# Step 7: Display Nearest Neighbors in a Structured Table
def display_nearest_neighbors(word_to_idx, embeddings, index, words_to_check=(0, 1), k=5):
    """Display the nearest neighbours of selected words as a bordered table.

    For each entry of *words_to_check* the matching row of *embeddings* is
    used as the query. Results are stacked into one table whose row label is
    "Word N Neighbours", where N is the 1-based position of the queried word
    in *words_to_check*.

    Args:
        word_to_idx: Mapping whose keys, in iteration order, give the word
            for each index returned by the search.
        embeddings: 2-D array with one embedding per row.
        index: Search index passed through to ``retrieve_nearest_neighbours``.
        words_to_check: Row indices of the words to query. An empty sequence
            produces an empty table instead of raising.
        k: Number of neighbours requested per word.
    """
    # Build the index -> word lookup once instead of once per neighbour.
    word_list = list(word_to_idx.keys())

    frames = []
    for position, word_idx in enumerate(words_to_check, start=1):
        query_vector = embeddings[word_idx].reshape(1, -1)
        neighbors_idx, neighbors_dist = retrieve_nearest_neighbours(index, query_vector, k)

        rows = [
            {'Word': word_list[idx], 'Index': idx, 'Distance': dist}
            for idx, dist in zip(neighbors_idx[0], neighbors_dist[0])
            # FAISS pads the result with -1 when the index holds fewer than
            # k vectors; -1 would otherwise silently pick the last word.
            if idx >= 0
        ]
        if not rows:
            continue

        frame = pd.DataFrame(rows)
        frame.index = [f'Word {position} Neighbours'] * len(rows)
        frames.append(frame)

    if frames:
        neighbors_df = pd.concat(frames)
    else:
        neighbors_df = pd.DataFrame(columns=['Word', 'Index', 'Distance'])

    # Display the nearest neighbors in tabular format
    styled_neighbors_df = neighbors_df.style.set_table_styles(
        [{'selector': 'thead th', 'props': [('border', '1px solid black'), ('background-color', '#f2f2f2')]},  # header style
         {'selector': 'tbody td', 'props': [('border', '1px solid black')]},  # cell border
         {'selector': 'table', 'props': [('border-collapse', 'collapse'), ('width', '100%')]},  # table styles
         {'selector': 'thead', 'props': [('text-align', 'center')]},  # center align header
         {'selector': 'tbody', 'props': [('text-align', 'center')]},  # center align body
        ])
    display(styled_neighbors_df)

# Step 8: Main function to tie everything together
def main(sentence):
    """Run the whole demo pipeline on *sentence*.

    In order: builds the co-occurrence matrix and shows it as a heatmap,
    derives SVD embeddings, shows them as a table, indexes them, and
    finally shows the nearest neighbours for the default pair of words.

    Args:
        sentence: Text to analyse.
    """
    # Co-occurrence counts plus the word -> index mapping
    matrix, vocab = create_cooccurrence_matrix(sentence)
    plot_cooccurrence_heatmap(matrix, vocab)

    # Low-dimensional embeddings, shown as a bordered table
    vectors = embed_using_svd(matrix)
    display_embedding_table(vectors, vocab)

    # Index the embeddings, then list neighbours for Word 1 and Word 2
    faiss_index = store_in_faiss(vectors)
    display_nearest_neighbors(vocab, vectors, faiss_index)

# Example input for the demo. The commented-out assignments below are
# alternative samples kept for experimentation; to try one, uncomment it and
# comment out the active assignment further down.
#sentence = "apple orange banana apple banana fruit orange"
#sentence = """
#The cat sits on the mat.
#The dog lies on the mat.
#The cat and dog are friends.
#"""

# Active sample. Tokens are split on whitespace only, so punctuation stays
# attached: "account." and "account" are counted as different words.
sentence = """
The customer opened a savings account.
The savings account has a high interest rate.
"""
# Run the pipeline: heatmap, embedding table, nearest-neighbour table.
main(sentence)

	Embedding 1	Embedding 2
customer	0.624027	0.144949
savings	1.423712	-1.162395
account.	0.901047	0.365215
account	0.772586	0.842155
a	1.194755	1.381222
rate.	0.076639	-0.139605
interest	0.211836	0.342796
has	0.711759	-0.905482
high	0.508886	-0.702116
opened	0.658012	-0.621541
The	1.066833	0.265625

	Word	Index	Distance
Word 1 Neighbours	customer	0	0.129140
Word 1 Neighbours	The	10	0.129299
Word 1 Neighbours	account.	2	0.144869
Word 1 Neighbours	account	3	0.353094
Word 1 Neighbours	a	4	0.374677
Word 2 Neighbours	savings	1	0.702188
Word 2 Neighbours	opened	9	0.711767
Word 2 Neighbours	has	7	0.790725
Word 2 Neighbours	high	8	0.824729
Word 2 Neighbours	rate.	5	0.969961

Co-Occurrence Matrix