!pip install -q scikit-learn matplotlib

WARNING: You are using pip version 21.2.4; however, version 25.0.1 is available.
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-63702505-ff59-4935-bdaa-11477688f897/bin/python -m pip install --upgrade pip' command.

## version 0.3
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import numpy as np

# Small training corpus: the vectorizer's vocabulary is learned from
# these sentences only.
corpus = [
    "Hello, this is a sample text.",
    "The kangaroo sprinted from the bush in dawn towards the highway.",
    "This text is another example.",
    "Text data can be represented as sparse vectors.",
]

# Build the TF-IDF vectorizer and fit it in one expression; fit() returns
# the fitted estimator itself, so the result can be bound directly.
vectorizer = TfidfVectorizer().fit(corpus)

# Function to generate and plot a sparse vector
def visualize_sparse_vector(text, label=""):
    """Plot and print the TF-IDF vector of ``text``.

    Transforms ``text`` with the module-level ``vectorizer``, draws a bar
    chart of the non-zero TF-IDF scores keyed by vocabulary term, and then
    prints the sparse vector for inspection.

    Args:
        text: The sentence to vectorize.
        label: Caption used in the plot title and in the printed header.
    """
    sparse_vector = vectorizer.transform([text])
    dense_array = sparse_vector.toarray().flatten()

    # Get feature names (words).
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back to the legacy method
    # only on old releases that do not provide it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    # Only show non-zero values
    non_zero_indices = np.where(dense_array > 0)[0]
    non_zero_values = dense_array[non_zero_indices]
    non_zero_words = feature_names[non_zero_indices]

    # Plotting
    plt.figure(figsize=(12, 4))
    plt.bar(non_zero_words, non_zero_values, color='skyblue')
    plt.xticks(rotation=45, ha='right')
    plt.title(f"{label}: {text}", fontsize=14)
    plt.xlabel("Words")
    plt.ylabel("TF-IDF Score")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

    # For inspection
    print(f"Sparse Vector ({label}):\nText: {text}")
    print(sparse_vector)

# Two sentences containing the same words in a different order
text1 = "The kangaroo sprinted from the bush in dawn towards the highway."
text2 = "In dawn the kangaroo sprinted from the highway towards the bush."

# Render each sentence's sparse vector in turn, original first
for _label, _sentence in (
    ("Original Sentence", text1),
    ("Reordered Sentence", text2),
):
    visualize_sparse_vector(_sentence, label=_label)

Sparse Vector (Original Sentence):
Text: The kangaroo sprinted from the bush in dawn towards the highway.
  (0, 21)	0.2425356250363329
  (0, 19)	0.7276068751089987
  (0, 17)	0.2425356250363329
  (0, 13)	0.2425356250363329
  (0, 11)	0.2425356250363329
  (0, 10)	0.2425356250363329
  (0, 8)	0.2425356250363329
  (0, 6)	0.2425356250363329
  (0, 3)	0.2425356250363329

Sparse Vector (Reordered Sentence):
Text: In dawn the kangaroo sprinted from the highway towards the bush.
  (0, 21)	0.2425356250363329
  (0, 19)	0.7276068751089987
  (0, 17)	0.2425356250363329
  (0, 13)	0.2425356250363329
  (0, 11)	0.2425356250363329
  (0, 10)	0.2425356250363329
  (0, 8)	0.2425356250363329
  (0, 6)	0.2425356250363329
  (0, 3)	0.2425356250363329

Sparse Vectors Embedding