LLM KRL English to Tamil, Version 1 - BERT model used

Published here as a Kaggle Notebook.

import pandas as pd

import torch

from sklearn.model_selection import train_test_split, KFold

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_squared_error, roc_auc_score, precision_recall_curve, auc, roc_curve

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

import numpy as np

import matplotlib.pyplot as plt

import random


# Expand the dataset to roughly 100 samples by duplicating the seed sentences and adding slight variations

sample_data = {

    "text_english": [

        "I am very happy to meet you", "I am disappointed with your work",

        "You have done an excellent job, well done", "This is not good, I expected better",

        "Thank you very much for your support", "I don't like your attitude",

        "I'm grateful for your guidance", "Your work lacks quality",

        "Well done, you've made us proud", "I appreciate your effort",

        "You are an inspiration", "I regret working with you"

    ],

    "text_tamil": [

        "உங்களை சந்திப்பதில் மிகவும் மகிழ்ச்சி", "உங்கள் வேலைக்கு நான் வருத்தப்படுகிறேன்",

        "நீங்கள் சிறந்த வேலை செய்தீர்கள், நல்லது", "இது நல்லதல்ல, நான் நல்லவை எதிர்பார்த்தேன்",

        "உங்கள் உதவிக்காக மிகவும் நன்றி", "உங்கள் அணுகுமுறை எனக்கு பிடிக்கவில்லை",

        "உங்கள் வழிகாட்டலுக்கு நான் நன்றி கூறுகிறேன்", "உங்கள் வேலை தரம் குறைவாக உள்ளது",

        "நல்ல செய்தி, நீங்கள் எங்களை பெருமைப்படுத்தினீர்கள்", "உங்கள் முயற்சியை நான் பாராட்டுகிறேன்",

        "நீங்கள் ஒரு பேரனுபவம்", "உங்களுடன் பணியாற்றியது வருத்தமாக உள்ளது"

    ],

    "respect": [

        "respect", None, "respect", None, "respect",

        None, "respect", None, "respect", "respect",

        "respect", None

    ]

}


# Generate ~100 samples by duplicating and varying the sentences (integer division below gives 8 passes over 12 sentences)

def generate_data(sample_data, n_samples=100):

    expanded_data = {"text_english": [], "text_tamil": [], "respect": []}

    for _ in range(n_samples // len(sample_data["text_english"])):

        for i in range(len(sample_data["text_english"])):

            # The random ", thank you!" suffix is applied to the English side only;
            # the paired Tamil sentence is reused verbatim
            english_text = sample_data["text_english"][i] + (", thank you!" if random.random() > 0.5 else "")

            tamil_text = sample_data["text_tamil"][i]

            respect_label = sample_data["respect"][i]

            expanded_data["text_english"].append(english_text)

            expanded_data["text_tamil"].append(tamil_text)

            expanded_data["respect"].append(respect_label)

    return expanded_data


data = generate_data(sample_data, 100)

df = pd.DataFrame(data)
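
# Sanity check (a sketch, not in the original notebook): with 12 base sentences,
# 100 // 12 gives 8 full passes, so the expanded frame holds 96 rows rather than 100.
print(len(df))                                    # expected: 96
print(df['respect'].value_counts(dropna=False))   # 7 "respect" vs 5 None per pass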


# Prepare input dataset with ID and respect classification

input_dataset = pd.DataFrame({

    "ID": range(1, len(df) + 1),

    "LanguageSource": df['text_english'],

    "Language_Translated": df['text_tamil'],

    "RespectContext": df['respect'].apply(lambda x: 1 if x == 'respect' else 0)

})


# Combine text features for model input

input_dataset['combined_text'] = input_dataset['LanguageSource'] + " " + input_dataset['Language_Translated']
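
# Observation (not from the original post): bert-base-uncased has essentially no Tamil
# coverage in its vocabulary, so the Tamil half of combined_text will largely tokenize
# to [UNK]; a multilingual checkpoint such as bert-base-multilingual-cased would likely
# represent these bilingual inputs better. A quick way to inspect the effect:
_tok = AutoTokenizer.from_pretrained('bert-base-uncased')
print(_tok.tokenize(input_dataset.loc[0, 'combined_text']))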


# Dataset wrapper that converts tokenized text into the format the transformers Trainer expects

class RespectDataset(torch.utils.data.Dataset):

    def __init__(self, encodings, labels):

        self.encodings = encodings

        self.labels = labels


    def __getitem__(self, idx):

        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

        item['labels'] = torch.tensor(self.labels[idx])

        return item


    def __len__(self):

        return len(self.labels)


# Initialize the tokenizer; the model itself is re-initialized inside each fold below,
# so that no fold starts from weights already fine-tuned on another fold's data

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
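
# Quick check (a sketch, not in the original notebook): each dataset item should be a
# dict of tensors with input_ids, attention_mask (and token_type_ids for BERT) plus a
# scalar 'labels' entry, which is the shape Trainer expects.
_check = RespectDataset(tokenizer(["a quick test"], truncation=True, padding=True), [1])
print(_check[0].keys())   # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
print(len(_check))        # 1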


# Define training arguments (early stopping itself is added via a callback below)

training_args = TrainingArguments(

    output_dir='./results',

    eval_strategy="epoch",             # Updated for consistency with save strategy

    save_strategy="epoch",             # Ensure saving happens at the end of each epoch

    num_train_epochs=10,

    per_device_train_batch_size=4,

    per_device_eval_batch_size=4,

    logging_dir='./logs',

    logging_steps=10,

    load_best_model_at_end=True        # Restore the best checkpoint (lowest eval_loss by default) when training ends

)


# Early stopping callback: stop if the monitored metric fails to improve for 3 consecutive evaluations

early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
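
# A sketch (not in the original notebook): with no compute_metrics function, early
# stopping and load_best_model_at_end both track eval_loss. To select on F1 instead,
# a function like this could be passed to Trainer via compute_metrics=compute_metrics,
# together with metric_for_best_model="f1" and greater_is_better=True in TrainingArguments:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds), "f1": f1_score(labels, preds)}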


# K-Fold Cross Validation

kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracy_scores, f1_scores, rmse_scores, auc_scores, pr_auc_scores = [], [], [], [], []

mean_fpr = np.linspace(0, 1, 100)  # For averaging ROC curve across folds

tprs = []


for train_index, val_index in kf.split(input_dataset):

    train_texts = input_dataset.iloc[train_index]['combined_text'].tolist()

    val_texts = input_dataset.iloc[val_index]['combined_text'].tolist()

    train_labels = input_dataset.iloc[train_index]['RespectContext'].tolist()

    val_labels = input_dataset.iloc[val_index]['RespectContext'].tolist()

    

    # Tokenize the data

    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)

    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

    

    # Create datasets

    train_dataset = RespectDataset(train_encodings, train_labels)

    val_dataset = RespectDataset(val_encodings, val_labels)

    

    # Re-initialize the model from the pretrained checkpoint for every fold,
    # so weights fine-tuned on earlier folds do not leak into later ones
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Trainer
    trainer = Trainer(

        model=model,

        args=training_args,

        train_dataset=train_dataset,

        eval_dataset=val_dataset,

        callbacks=[early_stopping]

    )

    

    # Train and evaluate the model

    trainer.train()

    predictions = trainer.predict(val_dataset)

    pred_probs = torch.nn.functional.softmax(torch.tensor(predictions.predictions), dim=-1).numpy()

    pred_labels = np.argmax(pred_probs, axis=1)

    

    # Calculate metrics

    accuracy = accuracy_score(val_labels, pred_labels)

    f1 = f1_score(val_labels, pred_labels)

    rmse = np.sqrt(mean_squared_error(val_labels, pred_labels))

    

    if len(np.unique(val_labels)) == 2:

        auc_score = roc_auc_score(val_labels, pred_probs[:, 1])

        precision, recall, _ = precision_recall_curve(val_labels, pred_probs[:, 1])

        pr_auc = auc(recall, precision)

        fpr, tpr, _ = roc_curve(val_labels, pred_probs[:, 1])

        

        # Interpolate to mean_fpr for average ROC curve

        tprs.append(np.interp(mean_fpr, fpr, tpr))

        auc_scores.append(auc_score)

        pr_auc_scores.append(pr_auc)

        

        # Plot ROC Curve for each fold

        plt.plot(fpr, tpr, alpha=0.3, label=f'Fold ROC AUC={auc_score:.2f}')

    else:

        auc_scores.append(np.nan)

        pr_auc_scores.append(np.nan)

    

    accuracy_scores.append(accuracy)

    f1_scores.append(f1)

    rmse_scores.append(rmse)

    

    # Binary Classification Scatter Plot

    plt.figure()

    plt.scatter(val_labels, pred_probs[:, 1], alpha=0.5, color='blue', label="Predicted Probability")

    plt.xlabel("True Label")

    plt.ylabel("Predicted Probability")

    plt.title("Binary Classification Scatter Plot")

    plt.legend()

    plt.show()


# Output cross-validation results

print("Cross-Validation Results:")

print(f"Mean Accuracy: {np.nanmean(accuracy_scores):.4f}")

print(f"Mean F1 Score: {np.nanmean(f1_scores):.4f}")

print(f"Mean RMSE: {np.nanmean(rmse_scores):.4f}")

print(f"Mean AUC-ROC: {np.nanmean(auc_scores):.4f}")

print(f"Mean Precision-Recall AUC: {np.nanmean(pr_auc_scores):.4f}")


# Plot average ROC curve across folds

plt.figure()

mean_tpr = np.mean(tprs, axis=0)

mean_auc = auc(mean_fpr, mean_tpr)

plt.plot(mean_fpr, mean_tpr, color='b', label=f'Average ROC (AUC={mean_auc:.2f})')

plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random classifier

plt.xlabel('False Positive Rate')

plt.ylabel('True Positive Rate')

plt.title('Average ROC Curve Across Folds')

plt.legend(loc='best')

plt.show()
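
Once the final fold finishes, the fine-tuned model left in memory can be spot-checked on a new input. A minimal sketch, not part of the original notebook, reusing one of the training pairs:

sample = "I appreciate your effort உங்கள் முயற்சியை நான் பாராட்டுகிறேன்"
inputs = tokenizer(sample, truncation=True, max_length=128, return_tensors="pt")
model.eval()
with torch.no_grad():
    logits = model(**{k: v.to(model.device) for k, v in inputs.items()}).logits
print(f"P(respect) = {torch.nn.functional.softmax(logits, dim=-1)[0, 1].item():.3f}")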

