Agentic Chunking
Task-Driven Approach: Agentic chunking focuses on creating chunks that are directly relevant to the specific tasks an AI agent needs to perform, such as answering questions or summarizing text.
Dynamic Segmentation: Unlike fixed-size chunking, it uses AI to identify natural breakpoints in the text, like paragraph breaks or topic transitions, ensuring each chunk retains contextual meaning.
LLM-Driven: It leverages the power of Large Language Models (LLMs) to understand the text and make intelligent decisions about where to split it.
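To make the contrast concrete, here is a minimal sketch of dynamic segmentation versus fixed-size chunking. The helpers (`fixed_size_chunks`, `split_on_paragraphs`) are illustrative names, not part of any library, and splitting on blank lines stands in for the richer breakpoint detection an LLM would perform:

```python
def fixed_size_chunks(text, size=40):
    # Naive approach: cut every `size` characters, even mid-sentence.
    return [text[i:i + size] for i in range(0, len(text), size)]

def split_on_paragraphs(text):
    # Dynamic approach: split at natural breakpoints (blank lines here),
    # so each chunk keeps its contextual meaning intact.
    return [p.strip() for p in text.split("\n\n") if p.strip()]

doc = "The defendant faces a theft charge.\n\nA separate fraud charge was also filed."
print(fixed_size_chunks(doc)[0])   # a cut that can land mid-sentence
print(split_on_paragraphs(doc))    # two self-contained chunks
```

An agentic chunker replaces the blank-line rule with an LLM's judgment about where topics shift, but the output shape is the same: a list of self-contained chunks.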
Example:
Imagine you have a long legal document, and you want to use it to answer questions about specific charges. Instead of splitting the document into fixed-size chunks, agentic chunking would:
- Identify Charges: Use an LLM to identify the different charges mentioned in the document.
- Create Chunks: Create a chunk for each charge, ensuring that all relevant information about that charge is included in one chunk.
- Enhance Retrieval: When a user asks a question about a specific charge, the AI can easily retrieve the relevant chunk, improving the accuracy and coherence of the answer.
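The first step above is the one the sample code later only simulates. A hedged sketch of how an LLM call could drive it is shown below: the prompt wording is an assumption, and `call_llm` is a placeholder for any chat/completion client, so the function can be exercised here with a stub:

```python
def identify_charges_with_llm(document, call_llm):
    # `call_llm` is a stand-in for any chat/completion client:
    # it takes a prompt string and returns the model's text response.
    prompt = (
        "List each distinct criminal charge mentioned in the document below, "
        "one per line, with no extra commentary.\n\n" + document
    )
    response = call_llm(prompt)
    # Parse one charge name per line, skipping blanks.
    return [line.strip() for line in response.splitlines() if line.strip()]

# Usage with a stubbed model (a real client would call an LLM API here):
fake_llm = lambda prompt: "Theft\nFraud\nAssault"
print(identify_charges_with_llm("...", fake_llm))  # ['Theft', 'Fraud', 'Assault']
```

Injecting the client as an argument keeps the chunking logic testable without network access.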
- Identifying Charges:
The identify_charges function simulates charge detection in a legal document. In a real-world scenario, you would use Named Entity Recognition (NER) or an LLM to extract charges from the document. For simplicity, we define three charges (Theft, Fraud, Assault).
- Creating Chunks:
The create_chunks_from_charges function creates text chunks based on the extracted charges. Each chunk corresponds to one charge, ensuring all relevant information about that charge is contained in a single chunk.
- Embedding and Storing in FAISS:
The SentenceTransformer is used to convert each chunk into a dense embedding vector.
These embeddings are added to a FAISS vector database (IndexFlatL2), which is optimized for fast similarity searches.
- Retrieving Relevant Chunks:
The retrieve_relevant_chunks function takes a user query, embeds it, and performs a nearest neighbor search in the FAISS index. It retrieves the top-k most similar chunks to the query.
- Unified Response (RAG):
The retrieved chunks are combined into a unified response using the generate_response_from_chunks function. This approach helps provide a coherent answer based on the relevant information retrieved from the chunks.
- Visualization:
We use PCA (Principal Component Analysis) to reduce the embeddings to 2D space and visualize them using matplotlib. This helps to show the relative positioning of chunks and how the query relates to them in the embedding space.
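Under the hood, FAISS's `IndexFlatL2` performs an exact (brute-force) L2 search. The retrieval step above can be sketched with plain NumPy, using toy 3-dimensional vectors in place of real sentence embeddings:

```python
import numpy as np

# Toy "embeddings" for three chunks (real ones come from SentenceTransformer).
chunk_vecs = np.array([[1.0, 0.0, 0.0],
                       [0.0, 1.0, 0.0],
                       [0.0, 0.0, 1.0]])

def l2_search(query_vec, vectors, k=1):
    # Exact nearest-neighbor search by squared L2 distance,
    # which is what faiss.IndexFlatL2 computes internally.
    dists = ((vectors - query_vec) ** 2).sum(axis=1)
    idx = np.argsort(dists)[:k]
    return idx, dists[idx]

query = np.array([0.9, 0.1, 0.0])  # closest to chunk 0
idx, dists = l2_search(query, chunk_vecs, k=2)
print(idx)  # [0 1]
```

FAISS adds optimized data structures on top, but for a handful of chunks the exact search above is equivalent.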
pip install transformers sentence-transformers faiss-cpu matplotlib
pip install scikit-learn
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import textwrap  # Used to wrap long chunk text inside the plot
# Step 1: Define the function to identify charges in the legal document
# For simplicity, simulate the detection of charges. Note: this can be refined using NLP techniques.
def identify_charges(document):
    # In a real-world scenario, charges would typically be extracted using
    # Named Entity Recognition (NER) or a Large Language Model (LLM).
    # For illustration, simulate this process with a few sample charges.
    charges = [
        {"charge": "Theft", "text": "John Doe is accused of stealing electronics from a retail store. The items include laptops and smartphones."},
        {"charge": "Fraud", "text": "John Doe allegedly submitted false tax returns, inflating his income to avoid paying taxes."},
        {"charge": "Assault", "text": "John Doe is accused of assaulting another person in a public space. The victim suffered minor injuries."}
    ]
    return charges
# Step 2: Chunk the document based on identified charges
def create_chunks_from_charges(charges):
    chunks = []
    for charge in charges:
        chunk = charge["text"]
        chunks.append(chunk)
    return chunks
# Example legal document (simulated as charges for simplicity)
document = """
John Doe is accused of stealing electronics from a retail store. The items include laptops and smartphones.
John Doe allegedly submitted false tax returns, inflating his income to avoid paying taxes.
John Doe is accused of assaulting another person in a public space. The victim suffered minor injuries.
"""
# Step 3: Embedding chunks using Sentence Transformers
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Identify charges in the document
charges = identify_charges(document)
# Create chunks based on the charges
chunks = create_chunks_from_charges(charges)
# Generate embeddings for the chunks
chunk_embeddings = embedding_model.encode(chunks)
# Step 4: Storing embeddings in FAISS (Vector Database)
index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
index.add(np.array(chunk_embeddings))
# Step 5: Retrieval-Augmented Generation (RAG) - retrieve relevant chunks for a query
def retrieve_relevant_chunks(query, k=3):
    query_embedding = embedding_model.encode([query])
    D, I = index.search(np.array(query_embedding), k)
    relevant_chunks = [chunks[i] for i in I[0]]
    return relevant_chunks
# Step 6: Query the system
user_query = "Tell me about the theft charge."
retrieved_chunks = retrieve_relevant_chunks(user_query, k=1)
print("\nLegal Document: ", document)
print("\nUser Query: ", user_query)
print("\nTop-k Retrieved Chunks for the Query:")
print(retrieved_chunks)
# Step 7: Recombine Chunks into a Unified Response
def generate_response_from_chunks(chunks):
    return " ".join(chunks)
unified_response = generate_response_from_chunks(retrieved_chunks)
#print("\nUnified Response:")
#print(unified_response)
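In a fuller RAG pipeline, the joined chunks would typically be placed into a prompt for a generator model rather than returned verbatim. A minimal sketch of that assembly step (the template wording is an assumption, and any generator LLM could complete the resulting prompt):

```python
def build_rag_prompt(query, retrieved_chunks):
    # Assemble the retrieved context and the user question into a single
    # prompt string for a downstream generator model.
    context = "\n".join(f"- {c}" for c in retrieved_chunks)
    return f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

prompt = build_rag_prompt("Tell me about the theft charge.",
                          ["John Doe is accused of stealing electronics."])
print(prompt)
```

Because each chunk covers one whole charge, the context section stays coherent instead of mixing fragments of unrelated charges.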
# Step 8: Visualization of embedding data and query results
def plot_embeddings():
    # Reduce the chunk embeddings (and the query embedding) to 2D with PCA
    pca = PCA(n_components=2)
    reduced_embeddings = pca.fit_transform(chunk_embeddings)
    query_embedding = embedding_model.encode([user_query])
    reduced_query_embedding = pca.transform(query_embedding)

    plt.figure(figsize=(12, 6))

    # Left: bar chart of chunk sizes in the vector database
    plt.subplot(1, 2, 1)
    word_counts = [len(chunk.split()) for chunk in chunks]
    plt.bar(range(len(chunks)), word_counts, color='lightblue')
    plt.title('Chunked Data in Vector Database')
    plt.xlabel('Chunk Index')
    plt.ylabel('Word Count')
    plt.xticks(range(len(chunks)), range(len(chunks)))
    # Wrap each chunk's text so it fits neatly inside its bar
    for i, chunk in enumerate(chunks):
        wrapped_text = textwrap.fill(chunk, width=20)  # Adjust width as needed
        plt.text(i, word_counts[i] / 2, wrapped_text, ha='center', va='center', fontsize=8, color='black')

    # Right: 2D PCA projection of the chunk embeddings and the query
    plt.subplot(1, 2, 2)
    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], color='lightblue', label='Chunks')
    plt.scatter(reduced_query_embedding[:, 0], reduced_query_embedding[:, 1], color='red', marker='x', label='Query')
    for i in range(len(chunks)):
        plt.annotate(f"{i}", (reduced_embeddings[i, 0], reduced_embeddings[i, 1]))
    plt.title('Chunk and Query Embeddings (PCA)')
    plt.legend()

    plt.tight_layout()
    plt.show()
# Plot the embeddings and the query results
plot_embeddings()
Legal Document:
John Doe is accused of stealing electronics from a retail store. The items include laptops and smartphones.
John Doe allegedly submitted false tax returns, inflating his income to avoid paying taxes.
John Doe is accused of assaulting another person in a public space. The victim suffered minor injuries.

User Query: Tell me about the theft charge.

Top-k Retrieved Chunks for the Query:
['John Doe is accused of stealing electronics from a retail store. The items include laptops and smartphones.']