Dense Vector

  • Bar Chart → Each bar represents one dimension of the embedding vector (768D for bert-base).
    • The height of the bar indicates the magnitude of the value in that dimension.
    • Because dense vectors are packed with meaningful values, you'll see variation across nearly all dimensions, which visually supports the idea that they hold rich semantic information.
  • PCA 2D Plot → Visually compare how semantically similar or different multiple texts are. (This version uses PCA from sklearn.decomposition to reduce the high-dimensional 768D BERT embeddings to 2D for intuitive comparison.)
  • KMeans Clustering → Group similar embeddings together; clusters are computed on the original 768D vectors and then shown on the 2D PCA plot.
In [ ]:
!pip install -q transformers torch matplotlib scikit-learn
In [ ]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to generate dense vector for a given text
def generate_dense_vector(text):
    """
    Tokenizes the input text and extracts the [CLS] token embedding
    from BERT as the dense representation.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    cls_embedding = outputs.last_hidden_state[0][0]
    return cls_embedding.numpy()
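
# An optional alternative (a sketch, not part of the original flow): mean-pool
# all token embeddings instead of taking only the [CLS] token. Mean pooling
# often yields more robust sentence-level vectors for similarity comparisons.
# The function name is illustrative; it reuses the tokenizer/model loaded above.
def generate_mean_pooled_vector(text):
    """
    Averages all token embeddings (excluding padding) into one dense vector.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    # Zero out padding positions, then average over the sequence dimension
    mask = inputs['attention_mask'].unsqueeze(-1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1)
    return (summed / counts)[0].numpy()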

# Function to visualize individual dense vector as a bar chart
def plot_dense_vector(vector, text):
    """
    Displays a bar chart of the 768-dimensional BERT embedding.
    """
    plt.figure(figsize=(16, 4))
    plt.bar(range(len(vector)), vector, color='skyblue')
    plt.title(f"Dense Vector Representation of full 768 dimension for Text:\n\"{text}\"", fontsize=14)
    plt.xlabel("Vector Dimensions")
    plt.ylabel("Value")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

# Function to compare and cluster multiple texts
def plot_texts_pca_with_clustering(texts, num_clusters=2):
    """
    Reduces text embeddings to 2D using PCA, clusters them with KMeans,
    and plots the result with color-coded clusters and text annotations.
    """
    # Generate embeddings
    vectors = [generate_dense_vector(text) for text in texts]

    # Run KMeans clustering on original 768D vectors
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)  # explicit n_init avoids version-dependent defaults
    cluster_labels = kmeans.fit_predict(vectors)

    # Reduce dimensions for plotting
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(vectors)

    # Plot
    plt.figure(figsize=(10, 7))
    colors = plt.cm.get_cmap('tab10', num_clusters)

    seen_labels = set()
    for i, (x, y) in enumerate(reduced):
        label = f'Cluster {cluster_labels[i]}'
        # Add each cluster label to the legend only once
        plt.scatter(x, y, color=colors(cluster_labels[i]), s=100,
                    label=label if label not in seen_labels else "")
        seen_labels.add(label)
        plt.text(x + 0.01, y + 0.01, texts[i], fontsize=9)

    plt.title("Clustering of BERT Dense Vectors with PCA", fontsize=14)
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    plt.tight_layout()
    plt.show()
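
# Optional sketch (an addition, not from the original notebook): quantify
# similarity numerically with cosine similarity on the raw 768D vectors,
# complementing the visual 2D PCA view. Uses scikit-learn's
# cosine_similarity; the helper name is illustrative.
from sklearn.metrics.pairwise import cosine_similarity

def print_most_similar_pair(texts):
    """
    Prints the pair of texts whose dense vectors have the highest
    cosine similarity.
    """
    vectors = np.array([generate_dense_vector(text) for text in texts])
    sims = cosine_similarity(vectors)
    np.fill_diagonal(sims, -1.0)  # ignore self-similarity on the diagonal
    i, j = np.unravel_index(np.argmax(sims), sims.shape)
    print(f'Most similar pair (cosine = {sims[i, j]:.3f}):')
    print(f'  "{texts[i]}"')
    print(f'  "{texts[j]}"')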

# Example texts
texts = [
    "Hello, this is a sample text.",
    "This is another example sentence.",
    "Artificial intelligence is transforming the world.",
    "I love watching science fiction movies.",
    "Machine learning powers many AI applications.",
    "My dog loves playing fetch in the park.",
    "Deep learning has revolutionized computer vision.",
    "The quick brown fox jumps over the lazy dog."
]

# Show bar chart for first example
dense_vector = generate_dense_vector(texts[0])
#print("Dense Vector Representation for Text:")
#print(dense_vector)
plot_dense_vector(dense_vector, texts[0])

# Plot clustering and PCA for all texts
plot_texts_pca_with_clustering(texts, num_clusters=3)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[Output: bar chart of the 768-dimensional dense vector for the first text]
[Output: 2D PCA scatter plot of the eight texts, color-coded by KMeans cluster]
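Note that projecting 768D embeddings down to 2D necessarily discards information, so apparent cluster separation in the plot can understate or overstate the true distances. A quick follow-up (a hypothetical cell, reusing the helpers and texts defined above) reports how much variance the two plotted components actually capture:
In [ ]:
# Hypothetical follow-up: how much of the embeddings' variance do the two
# plotted PCA components actually retain for these eight example texts?
vectors = [generate_dense_vector(text) for text in texts]
pca = PCA(n_components=2)
pca.fit(vectors)
print("Explained variance ratio per component:", pca.explained_variance_ratio_)
print("Total variance captured in 2D:", pca.explained_variance_ratio_.sum())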