GloVe (Global Vectors for Word Representation)¶
- Creating GloVe Embeddings: Load pre-trained GloVe word vectors (glove_6B_100d) and use them to embed a set of sentences.
- The load_glove_model function loads the GloVe vectors with gensim's KeyedVectors, and the sentence_to_vector function converts a sentence into a single embedding by averaging the vectors of its words.
- The law firm sentences below form the corpus that is indexed. The dimensionality of the embeddings is fixed by the pre-trained model (100 dimensions here).
- Storing Embeddings in FAISS: After generating the embeddings, we store them in a FAISS index for efficient retrieval.
- The create_faiss_index function takes the GloVe sentence embeddings, normalizes them, and adds them to a FAISS index. This enables efficient nearest-neighbour search.
- Retrieval-Augmented Generation (RAG): Implement the retrieval step of RAG using FAISS to find the top k=3 sentences most similar to a given user query.
- The search_faiss_index function embeds the user query with the GloVe model and retrieves the top k=3 most similar sentences from the index.
- Display the Data: The display_table function shows the stored sentences, the user query, and the predicted completions in a neat tabular format.
- The plot_radial_graph function takes the first 10 components of result_vector, computes equally spaced angles around the circle, and plots them using matplotlib's polar plotting features.
- The polar plot allows us to visualize each component of the vector around a circular graph, and the magnitude of each component is represented by the distance from the center.
This solution demonstrates how to use GloVe embeddings for sentence representation, store them in a FAISS index, and perform retrieval-augmented generation. The word vector representation shows how relationships between words are encoded as vector operations.
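As a quick, hedged illustration of that last point, the sketch below defines a small helper, show_vector_arithmetic (a name introduced here for illustration only, not part of the pipeline above), that uses gensim's KeyedVectors.most_similar to find the nearest neighbours of a positive/negative word combination such as law - services + business. It assumes the same pre-trained vectors loaded by load_glove_model later in the notebook; the exact neighbours returned depend on the embedding file.
In [ ]:
from gensim.models.keyedvectors import KeyedVectors

def show_vector_arithmetic(kv: KeyedVectors, positive, negative, topn=5):
    # most_similar adds the positive word vectors, subtracts the negative ones,
    # and returns the nearest words by cosine similarity
    for word, score in kv.most_similar(positive=positive, negative=negative, topn=topn):
        print(f"{word}: {score:.3f}")

# Example usage once the model has been loaded further below:
# kv = load_glove_model()
# show_vector_arithmetic(kv, positive=['law', 'business'], negative=['services'])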
In [ ]:
%pip install -q gensim faiss-cpu numpy pandas matplotlib
Note: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.
In [ ]:
import numpy as np
import faiss
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from gensim.models.keyedvectors import KeyedVectors
# Load pre-trained GloVe embeddings
def load_glove_model():
    glove_input_file = '/dbfs/FileStore/kumar.palakshappa@au.ey.com/glove_6B_100d.txt'
    # Load the GloVe text file directly; no_header=True handles the missing word2vec header
    # (this replaces the deprecated glove2word2vec conversion step)
    model = KeyedVectors.load_word2vec_format(glove_input_file, binary=False, no_header=True)
    return model
# Sentence to vector using GloVe model
def sentence_to_vector(sentence, model):
    words = sentence.split()
    word_vectors = []
    for word in words:
        if word in model.key_to_index:
            word_vectors.append(model[word])
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        # Fall back to a zero vector (float32, as FAISS expects) for out-of-vocabulary sentences
        return np.zeros(model.vector_size, dtype=np.float32)
# Store sentence embeddings in FAISS
def create_faiss_index(sentences, model):
    sentence_vectors = np.array([sentence_to_vector(sentence, model) for sentence in sentences])
    sentence_vectors = normalize(sentence_vectors).astype(np.float32)  # Normalize so L2 distance mirrors cosine similarity
    dim = sentence_vectors.shape[1]
    index = faiss.IndexFlatL2(dim)  # L2 (Euclidean) distance; same ranking as cosine on unit vectors
    index.add(sentence_vectors)
    return index, sentence_vectors
# Query the FAISS index for top k nearest neighbors
def search_faiss_index(query, index, k, model):
    query_vector = sentence_to_vector(query, model).reshape(1, -1)
    query_vector = normalize(query_vector).astype(np.float32)  # Normalize the query to match the indexed vectors
    distances, indices = index.search(query_vector, k)
    return distances, indices
# Display results in tabular form with borders
def display_table(data, title):
    df = pd.DataFrame(data, columns=['Index', 'Sentence'])
    print(f"\n{title}")
    print(df.to_markdown(index=False))  # Markdown output gives a bordered, readable table
# Function to plot result_vector[:10] as a circular graph
def plot_radial_graph(result_vector, words):
    # Take the first 10 dimensions of the result vector
    result_vector_10d = result_vector[:10]
    # Create angles for each component (equally spaced around a circle)
    angles = np.linspace(0, 2 * np.pi, len(result_vector_10d), endpoint=False)
    # Plot the radial graph
    fig, ax = plt.subplots(subplot_kw={'projection': 'polar'}, figsize=(4.5, 4.5))
    ax.set_theta_offset(np.pi / 2)  # Start the first component at the top of the circle
    ax.set_theta_direction(-1)      # Plot the components clockwise
    # Plot each value in the vector on the circle
    ax.plot(angles, result_vector_10d, color='b', linewidth=2, label='result_vector[:10]')
    # Fill the area under the curve to make the shape easier to read
    ax.fill(angles, result_vector_10d, color='b', alpha=0.3)
    # Label each component with the corresponding word
    ax.set_xticks(angles)
    ax.set_xticklabels(words, fontsize=12)
    # Title and display
    ax.set_title("Radial Plot of Word Representation (first 10 dimensions)")
    plt.show()
# Main Function: Integration of the Process
def main():
    # Step 1: Load GloVe Model
    model = load_glove_model()
    # Step 2: Example Law Firm Sentences
    sentences = [
        "Our law firm specializes in corporate law and mergers.",
        "We offer expertise in intellectual property litigation.",
        "Legal consulting for international businesses is our key strength.",
        "Our attorneys have extensive experience in criminal defense.",
        "We represent clients in matters of real estate law.",
        "Our firm is known for handling complex commercial disputes.",
        "We are committed to providing excellent legal advice.",
        "Our team includes experts in family and estate law.",
        "We offer trusted legal services to small businesses.",
        "We are leaders in the field of tax law."
    ]
    # Step 3: Store sentences in FAISS
    index, sentence_vectors = create_faiss_index(sentences, model)
    # Step 4: Display stored sentences in a table
    stored_sentences_data = [(i, sentences[i]) for i in range(len(sentences))]
    display_table(stored_sentences_data, "Stored Sentences")
    # Step 5: User Query
    user_query = "What legal services do you offer for businesses?"
    # Step 6: Search for the top-k similar sentences
    k = 3
    distances, indices = search_faiss_index(user_query, index, k, model)
    # Step 7: Display the query and top-k completions
    print(f"\nUser Query: {user_query}\n")
    predicted_sentences_data = [(indices[0][i], sentences[indices[0][i]]) for i in range(k)]
    display_table(predicted_sentences_data, f"Predicted Sentence Completions (Top {k})")
    # Step 8: Word Representation Example (e.g., law - services + business)
    word_rep_example = "law - services + business"
    vec_1 = model['law']
    vec_2 = model['services']
    vec_3 = model['business']
    result_vector = vec_1 - vec_2 + vec_3
    print(f"\nWord Representation Example: {word_rep_example}")
    print(f"Vector Result for {word_rep_example}:")
    print(result_vector[:10])  # Display first 10 dimensions for brevity
    # Word labels for the 10 plotted dimensions (repeating the three words from the example)
    words = ['law', 'services', 'business', 'services', 'law', 'business', 'law', 'services', 'business', 'law']
    # Plot the radial graph for result_vector[:10] with word labels
    plot_radial_graph(result_vector, words)

if __name__ == "__main__":
    main()
Stored Sentences

|   Index | Sentence                                                           |
|--------:|:-------------------------------------------------------------------|
|       0 | Our law firm specializes in corporate law and mergers.             |
|       1 | We offer expertise in intellectual property litigation.            |
|       2 | Legal consulting for international businesses is our key strength. |
|       3 | Our attorneys have extensive experience in criminal defense.       |
|       4 | We represent clients in matters of real estate law.                |
|       5 | Our firm is known for handling complex commercial disputes.        |
|       6 | We are committed to providing excellent legal advice.              |
|       7 | Our team includes experts in family and estate law.                |
|       8 | We offer trusted legal services to small businesses.               |
|       9 | We are leaders in the field of tax law.                            |

User Query: What legal services do you offer for businesses?

Predicted Sentence Completions (Top 3)

|   Index | Sentence                                                           |
|--------:|:-------------------------------------------------------------------|
|       8 | We offer trusted legal services to small businesses.               |
|       6 | We are committed to providing excellent legal advice.              |
|       2 | Legal consulting for international businesses is our key strength. |

Word Representation Example: law - services + business
Vector Result for law - services + business:
[ 0.919277   -0.85179806  0.34988397 -0.44417    -0.415442    0.6492
 -0.35244995 -0.96955    -0.31828     0.275139  ]
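A note on the similarity metric: because create_faiss_index normalizes the sentence vectors, the L2 distance used by IndexFlatL2 ranks results exactly as cosine similarity would (for unit vectors, ||u - v||² = 2 - 2·u·v). The sketch below shows an equivalent alternative that ranks by inner product directly; create_cosine_index is a hypothetical helper for illustration, not part of the pipeline above.
In [ ]:
import numpy as np
import faiss
from sklearn.preprocessing import normalize

def create_cosine_index(vectors: np.ndarray) -> faiss.IndexFlatIP:
    # Unit-normalize so the inner product equals cosine similarity,
    # then index with IndexFlatIP (dot-product search)
    vecs = normalize(vectors).astype(np.float32)
    index = faiss.IndexFlatIP(vecs.shape[1])
    index.add(vecs)
    return index

# Example usage with the sentence vectors built in main():
# cosine_index = create_cosine_index(sentence_vectors)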