Skip to main content

LLM KRL English-to-Tamil, Version 1 – BERT model used

 

Published here as a Kaggle notebook



 

import pandas as pd

import torch

from sklearn.model_selection import train_test_split, KFold

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_squared_error, roc_auc_score, precision_recall_curve, auc, roc_curve

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

import numpy as np

import matplotlib.pyplot as plt

import random


# Seed corpus: 12 parallel English/Tamil sentences with a "respect" tag.
# The three lists are index-aligned: entry i of each list describes the same
# sentence.  A tag of "respect" marks a respectful sentence; None marks the
# rest.
#
# The Tamil literals were previously stored as mojibake (UTF-8 bytes decoded
# as cp1252, e.g. "à®®" instead of "ம"); they are restored to real Tamil text
# here so the strings contain only characters from the Tamil Unicode block.
sample_data = {
    "text_english": [
        "I am very happy to meet you", "I am disappointed with your work",
        "You have done an excellent job, well done", "This is not good, I expected better",
        "Thank you very much for your support", "I don't like your attitude",
        "I'm grateful for your guidance", "Your work lacks quality",
        "Well done, you've made us proud", "I appreciate your effort",
        "You are an inspiration", "I regret working with you"
    ],
    "text_tamil": [
        "உங்களை சந்திப்பதில் மிகவும் மகிழ்ச்சி", "உங்கள் வேலைக்கு நான் வருத்தப்படுகிறேன்",
        "நீங்கள் சிறந்த வேலை செய்தீர்கள், நல்லது", "இது நல்லதல்ல, நான் நல்லவை எதிர்பார்த்தேன்",
        "உங்கள் உதவிக்காக மிகவும் நன்றி", "உங்கள் அணுகுமுறை எனக்கு பிடிக்கவில்லை",
        "உங்கள் வழிகாட்டலுக்கு நான் நன்றி கூறுகிறேன்", "உங்கள் வேலை தரம் குறைவாக உள்ளது",
        "நல்ல செய்தி, நீங்கள் எங்களை பெருமைப்படுத்தினீர்கள்", "உங்கள் முயற்சியை நான் பாராட்டுகிறேன்",
        "நீங்கள் ஒரு பேரனுபவம்", "உங்களுடன் பணியாற்றியது வருத்தமாக உள்ளது"
    ],
    "respect": [
        "respect", None, "respect", None, "respect",
        None, "respect", None, "respect", "respect",
        "respect", None
    ]
}


# Generate 100 samples by duplicating and varying the sentences

def generate_data(sample_data, n_samples=100):
    """Expand the seed sentences into exactly ``n_samples`` rows.

    The seed rows are walked in order and repeated cyclically until
    ``n_samples`` rows have been produced.  Each English sentence gets the
    suffix ``", thank you!"`` appended with probability of about one half
    (one ``random.random()`` draw per row); the Tamil text and the respect
    tag are copied unchanged.

    The previous implementation repeated the seed list
    ``n_samples // len(seed)`` whole times, which silently returned fewer
    rows than requested whenever ``n_samples`` was not a multiple of the
    seed size (96 instead of 100 for the 12-sentence corpus) and returned
    nothing at all when ``n_samples`` was smaller than the seed size.

    Args:
        sample_data: dict with index-aligned lists under the keys
            ``"text_english"``, ``"text_tamil"`` and ``"respect"``.
        n_samples: number of rows to generate; values <= 0 give empty lists.

    Returns:
        A dict with the same three keys, each holding ``n_samples`` items.

    Raises:
        ValueError: if rows are requested but the seed lists are empty
            (previously this surfaced as a ``ZeroDivisionError``).
    """
    english_seed = sample_data["text_english"]
    tamil_seed = sample_data["text_tamil"]
    respect_seed = sample_data["respect"]

    expanded_data = {"text_english": [], "text_tamil": [], "respect": []}
    if n_samples <= 0:
        return expanded_data

    seed_size = len(english_seed)
    if seed_size == 0:
        raise ValueError("sample_data has no seed sentences to expand")

    for row in range(n_samples):
        i = row % seed_size  # wrap around the seed list
        suffix = ", thank you!" if random.random() > 0.5 else ""
        expanded_data["text_english"].append(english_seed[i] + suffix)
        expanded_data["text_tamil"].append(tamil_seed[i])
        expanded_data["respect"].append(respect_seed[i])
    return expanded_data


# Expand the seed corpus and load it into a DataFrame.
data = generate_data(sample_data, 100)
df = pd.DataFrame(data)

# Modelling table: 1-based row ID, source sentence, its translation, and a
# binary target that is 1 where the tag equals 'respect' and 0 otherwise
# (a None tag therefore maps to 0).
input_dataset = pd.DataFrame({
    "ID": range(1, len(df) + 1),
    "LanguageSource": df['text_english'],
    "Language_Translated": df['text_tamil'],
    "RespectContext": (df['respect'] == 'respect').astype('int64'),
})

# Single model input string: English sentence, one space, Tamil sentence.
input_dataset['combined_text'] = input_dataset['LanguageSource'].str.cat(
    input_dataset['Language_Translated'], sep=" "
)


# Torch Dataset that pairs tokenizer encodings with their labels
class RespectDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one dict of tensors per sample.

    ``encodings`` is a mapping whose values are indexable per sample (in this
    script it is the tokenizer output); ``labels`` is an index-aligned
    sequence of targets.  Each item holds every encoding field converted to a
    tensor plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Initialize tokenizer and model

# One checkpoint name shared by tokenizer and model so the two cannot drift.
# NOTE(review): 'bert-base-uncased' is presumably an English-centric
# vocabulary; confirm how much of the Tamil half of each input survives
# tokenization before trusting the scores.
CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=2)

# Training configuration: evaluate and checkpoint once per epoch (the two
# strategies are kept identical) and reload the best checkpoint when
# training ends.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Early stopping configured with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)


# K-Fold Cross Validation

# 5-fold cross-validation of the respect classifier.
#
# Each fold trains a FRESH model with its own early-stopping callback.
# Re-using one model object across folds (as before) let folds 2..5 start
# from weights already fitted on rows that are now in their validation
# split, which inflates every later fold's score; re-using one callback
# instance risks carrying its patience counter from one fold into the next.
#
# NOTE(review): the rows are repeats of the 12 seed sentences produced by
# generate_data, so a shuffled KFold can place the same sentence in both the
# training and validation split.  Scores are optimistic; confirm whether a
# grouped split is wanted.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores, f1_scores, rmse_scores, auc_scores, pr_auc_scores = [], [], [], [], []
mean_fpr = np.linspace(0, 1, 100)  # Common FPR grid for averaging ROC curves across folds
tprs = []
fold_rocs = []  # (fpr, tpr, auc) per fold, drawn together after the loop

for train_index, val_index in kf.split(input_dataset):
    train_rows = input_dataset.iloc[train_index]
    val_rows = input_dataset.iloc[val_index]
    train_texts = train_rows['combined_text'].tolist()
    val_texts = val_rows['combined_text'].tolist()
    train_labels = train_rows['RespectContext'].tolist()
    val_labels = val_rows['RespectContext'].tolist()

    # Tokenize the data
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

    # Create datasets
    train_dataset = RespectDataset(train_encodings, train_labels)
    val_dataset = RespectDataset(val_encodings, val_labels)

    # Fresh model and callback for this fold, using the same checkpoint and
    # settings as the module-level `model` / `early_stopping` objects.
    fold_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    trainer = Trainer(
        model=fold_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # Train and evaluate the model
    trainer.train()
    predictions = trainer.predict(val_dataset)
    pred_probs = torch.nn.functional.softmax(torch.tensor(predictions.predictions), dim=-1).numpy()
    pred_labels = np.argmax(pred_probs, axis=1)

    # Calculate metrics
    accuracy = accuracy_score(val_labels, pred_labels)
    f1 = f1_score(val_labels, pred_labels)
    rmse = np.sqrt(mean_squared_error(val_labels, pred_labels))

    # Ranking metrics are only computed when the fold holds both classes.
    if len(np.unique(val_labels)) == 2:
        auc_score = roc_auc_score(val_labels, pred_probs[:, 1])
        precision, recall, _ = precision_recall_curve(val_labels, pred_probs[:, 1])
        pr_auc = auc(recall, precision)
        fpr, tpr, _ = roc_curve(val_labels, pred_probs[:, 1])

        # Interpolate onto the shared grid; pin the curve to the origin so
        # every fold's curve starts at (0, 0) before averaging.
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        auc_scores.append(auc_score)
        pr_auc_scores.append(pr_auc)
        fold_rocs.append((fpr, tpr, auc_score))
    else:
        auc_scores.append(np.nan)
        pr_auc_scores.append(np.nan)

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)
    rmse_scores.append(rmse)

    # Binary Classification Scatter Plot
    plt.figure()
    plt.scatter(val_labels, pred_probs[:, 1], alpha=0.5, color='blue', label="Predicted Probability")
    plt.xlabel("True Label")
    plt.ylabel("Predicted Probability")
    plt.title("Binary Classification Scatter Plot")
    plt.legend()
    plt.show()

# Per-fold ROC curves, drawn on one dedicated figure.  Plotting them inside
# the loop put each line on an implicit figure that the scatter plot's
# plt.figure()/plt.show() immediately abandoned, without axes labels or legend.
if fold_rocs:
    plt.figure()
    for fold_fpr, fold_tpr, fold_auc in fold_rocs:
        plt.plot(fold_fpr, fold_tpr, alpha=0.3, label=f'Fold ROC AUC={fold_auc:.2f}')
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random classifier
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve Per Fold')
    plt.legend(loc='best')
    plt.show()


# Output cross-validation results

# Report the NaN-ignoring mean of every per-fold metric, four decimals each.
print("Cross-Validation Results:")
for metric_name, fold_values in (
    ("Accuracy", accuracy_scores),
    ("F1 Score", f1_scores),
    ("RMSE", rmse_scores),
    ("AUC-ROC", auc_scores),
    ("Precision-Recall AUC", pr_auc_scores),
):
    print(f"Mean {metric_name}: {np.nanmean(fold_values):.4f}")


# Plot average ROC curve across folds

# A truncated duplicate of this section (an extra plt.figure() plus
# `mean_tpr = np.mean`) used to precede it, which opened an empty figure;
# only the complete version is kept.
if tprs:
    plt.figure()
    mean_tpr = np.mean(tprs, axis=0)  # Point-wise mean over the shared FPR grid
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color='b', label=f'Average ROC (AUC={mean_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random classifier
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Average ROC Curve Across Folds')
    plt.legend(loc='best')
    plt.show()
else:
    # tprs only receives a curve for folds whose validation split contains
    # both classes; with none there is nothing to average.
    print("Average ROC curve skipped: no fold produced a ROC curve.")


Comments

Popular posts from this blog

Guidewire Rating detailed explanations

Guidewire Rating is a critical component of Guidewire PolicyCenter that handles the calculation of insurance premiums based on various factors, such as risk characteristics, coverage options, and discounts. Here's a detailed explanation of how Guidewire Rating works, its components, and how it can be configured and extended. 1. Overview of Guidewire Rating Guidewire Rating is responsible for determining the price of an insurance policy by applying rating logic, rules, and algorithms to the insured's data. The rating process involves evaluating factors like the type of coverage, the insured risk (e.g., the driver's record, vehicle type in auto insurance), and the chosen limits and deductibles. The output is a premium amount that the policyholder must pay. 2. Key Components of Guidewire Rating a. Rating Engine The Rating Engine is the core system within PolicyCenter that processes rating inputs and outputs premium calculations. It interprets rating formulas, applies them to speci...

Java Swing MySql JDBC: insert data into database

Program import javax.swing.*; import java.awt.*; import java.awt.event.*; import java.sql.*; public class insertswing implements ActionListener {   JFrame fr;JPanel po;   JLabel l1,l2,main;   JTextField tf1,tf2;   GridBagConstraints gbc;   GridBagLayout go;   JButton ok,exit; public insertswing(){ fr=new JFrame("New User Data "); Font f=new Font("Verdana",Font.BOLD,24); po=new JPanel(); fr.getContentPane().add(po); fr.setVisible(true); fr.setSize(1024,768); fr.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); po.setBackground(Color.WHITE); go=new GridBagLayout(); gbc=new GridBagConstraints(); po.setLayout(go); main=new JLabel("Enter User Details "); main.setFont(f); l1=new JLabel("Name  :");tf1=new JTextField(20); l2=new JLabel("User Name  :");tf2=new JTextField(20); ok=new JButton("Accept"); exit=new JButton("Exit"); gbc.anchor=GridBagConstraints.NORTH;gbc.gridx=5;gbc.gridy=0; go.s...

JSP and Servlet Form Submission without page refresh show results on the same page using Jquery AJAX

Code Snippet HTML Form  <form id='ajaxform' name='ajaxform' action='ajaxformexample' method='post'>  First Name: <input type='text' id='firstname' name='firstname' size='30' required/><br/>  Last Name: <input type='text' id='lastname' name='lastname' size='30'required/><br/>  Email:  <input type='email' id='emailid' name='emailid' size='30'required/><br/>  Password:  <input type='password' id='pwd' name='pwd' size='30'required/><br/>  <input type='Submit'/>   <div id='content'> </div> </form> the above HTML Form uses post method and url servlet redirect to " ajaxformexample " Javascript Code  var form = $('#ajaxform'); // id of form tag  form.submit(function () {  $.ajax({  ...