Skip to main content

LLM KRL English to Tamil, Version 1 - BERT model used

 

In Kaggle Notebook Published Here



 

import pandas as pd

import torch

from sklearn.model_selection import train_test_split, KFold

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_squared_error, roc_auc_score, precision_recall_curve, auc, roc_curve

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

import numpy as np

import matplotlib.pyplot as plt

import random


# Base set of 12 English/Tamil sentence pairs. "respect" holds the string
# "respect" for respectful sentences and None otherwise; the three lists are
# index-aligned. The Tamil literals had been corrupted by a UTF-8 -> cp1252
# decoding round trip (e.g. "à®®" in place of "ம"); they are restored here.
sample_data = {
    "text_english": [
        "I am very happy to meet you", "I am disappointed with your work",
        "You have done an excellent job, well done", "This is not good, I expected better",
        "Thank you very much for your support", "I don't like your attitude",
        "I'm grateful for your guidance", "Your work lacks quality",
        "Well done, you've made us proud", "I appreciate your effort",
        "You are an inspiration", "I regret working with you"
    ],
    "text_tamil": [
        "உங்களை சந்திப்பதில் மிகவும் மகிழ்ச்சி", "உங்கள் வேலைக்கு நான் வருத்தப்படுகிறேன்",
        "நீங்கள் சிறந்த வேலை செய்தீர்கள், நல்லது", "இது நல்லதல்ல, நான் நல்லவை எதிர்பார்த்தேன்",
        "உங்கள் உதவிக்காக மிகவும் நன்றி", "உங்கள் அணுகுமுறை எனக்கு பிடிக்கவில்லை",
        "உங்கள் வழிகாட்டலுக்கு நான் நன்றி கூறுகிறேன்", "உங்கள் வேலை தரம் குறைவாக உள்ளது",
        "நல்ல செய்தி, நீங்கள் எங்களை பெருமைப்படுத்தினீர்கள்", "உங்கள் முயற்சியை நான் பாராட்டுகிறேன்",
        "நீங்கள் ஒரு பேரனுபவம்", "உங்களுடன் பணியாற்றியது வருத்தமாக உள்ளது"
    ],
    "respect": [
        "respect", None, "respect", None, "respect",
        None, "respect", None, "respect", "respect",
        "respect", None
    ]
}


# Generate 100 samples by duplicating and varying the sentences

def generate_data(sample_data, n_samples=100):
    """Expand the base sentence pairs to exactly ``n_samples`` rows.

    The base rows are repeated in order, wrapping around as often as needed.
    Each emitted English sentence gets the suffix ", thank you!" with
    probability 0.5; the Tamil sentence and the label are copied unchanged.

    The previous implementation repeated the base set
    ``n_samples // len(base)`` times, silently dropping the remainder
    (12 base rows and ``n_samples=100`` produced only 96 rows).

    Args:
        sample_data: Mapping with index-aligned lists under the keys
            "text_english", "text_tamil" and "respect".
        n_samples: Number of rows to generate. Values <= 0 yield empty lists.

    Returns:
        A dict with the same three keys, each holding ``n_samples`` items.

    Raises:
        ValueError: If rows are requested but ``sample_data`` has no
            sentences to draw from.
    """
    english_texts = sample_data["text_english"]
    tamil_texts = sample_data["text_tamil"]
    respect_labels = sample_data["respect"]

    expanded_data = {"text_english": [], "text_tamil": [], "respect": []}
    if n_samples <= 0:
        return expanded_data

    n_base = len(english_texts)
    if n_base == 0:
        raise ValueError("sample_data must contain at least one sentence pair")

    for position in range(n_samples):
        i = position % n_base  # wrap around the base set
        suffix = ", thank you!" if random.random() > 0.5 else ""
        expanded_data["text_english"].append(english_texts[i] + suffix)
        expanded_data["text_tamil"].append(tamil_texts[i])
        expanded_data["respect"].append(respect_labels[i])

    return expanded_data


# Expand the base sentence pairs and load them into a DataFrame.
data = generate_data(sample_data, 100)
df = pd.DataFrame(data)

# Model-facing table: a 1-based row ID, the English source text, the Tamil
# translation, and a binary label (1 when the "respect" column holds the
# string 'respect', 0 otherwise).
input_dataset = pd.DataFrame({"ID": range(1, len(df) + 1)})
input_dataset["LanguageSource"] = df["text_english"]
input_dataset["Language_Translated"] = df["text_tamil"]
input_dataset["RespectContext"] = (df["respect"] == "respect").astype(int)

# Single text feature for the classifier: English and Tamil joined by a space.
input_dataset["combined_text"] = input_dataset["LanguageSource"].str.cat(
    input_dataset["Language_Translated"], sep=" "
)


# Define function to convert text data to dataset for transformers

class RespectDataset(torch.utils.data.Dataset):
    """Torch dataset pairing per-sample encodings with their labels.

    ``encodings`` is a mapping whose values are indexable by sample position
    (in this file it is the tokenizer's output); ``labels`` is a sequence
    with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, label under 'labels'."""
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Initialize tokenizer and model

# Tokenizer and two-class sequence classifier come from the same checkpoint.
# NOTE(review): the checkpoint name suggests an English, lower-cased BERT while
# the inputs also contain Tamil text — confirm its vocabulary handles Tamil.
_CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT, num_labels=2)

# Trainer configuration. Evaluation and checkpointing both happen once per
# epoch so that the best checkpoint can be restored when training ends.
_training_config = {
    "output_dir": './results',
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "num_train_epochs": 10,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "logging_dir": './logs',
    "logging_steps": 10,
    "load_best_model_at_end": True,
}
training_args = TrainingArguments(**_training_config)

# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)


# K-Fold Cross Validation

# 5-fold cross-validation of the respect classifier.
#
# Fixes relative to the previous version of this loop:
#   * Folds are formed over the distinct base sentences, not over rows. The
#     expanded dataset repeats every sentence several times, so a row-level
#     split placed copies of each validation sentence in the training set.
#   * A fresh model is loaded for every fold. Reusing the module-level `model`
#     meant later folds started from weights already trained on rows that
#     were now in their validation split.
#   * Each Trainer gets its own EarlyStoppingCallback instance, so no patience
#     state can be shared between folds.
#   * Per-fold ROC curves are collected and drawn on one labeled figure after
#     the loop instead of onto whatever figure happened to be current.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores, f1_scores, rmse_scores, auc_scores, pr_auc_scores = [], [], [], [], []
mean_fpr = np.linspace(0, 1, 100)  # Common FPR grid for averaging ROC curves across folds
tprs = []
fold_roc_curves = []  # (fold number, fpr, tpr, auc) for the combined per-fold plot

# The Tamil column is copied unchanged for every repetition of a base
# sentence, so it identifies which rows are duplicates of one another.
sentence_keys = input_dataset['Language_Translated']
unique_sentences = sentence_keys.unique()

for fold_number, (_, val_sentence_idx) in enumerate(kf.split(unique_sentences), start=1):
    # Map the held-out sentences back to row positions.
    val_mask = sentence_keys.isin(unique_sentences[val_sentence_idx]).to_numpy()
    train_index = np.flatnonzero(~val_mask)
    val_index = np.flatnonzero(val_mask)

    train_texts = input_dataset.iloc[train_index]['combined_text'].tolist()
    val_texts = input_dataset.iloc[val_index]['combined_text'].tolist()
    train_labels = input_dataset.iloc[train_index]['RespectContext'].tolist()
    val_labels = input_dataset.iloc[val_index]['RespectContext'].tolist()

    # Tokenize the data
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

    # Create datasets
    train_dataset = RespectDataset(train_encodings, train_labels)
    val_dataset = RespectDataset(val_encodings, val_labels)

    # Untrained-on-this-task copy of the classifier, loaded from the same
    # checkpoint and with the same label count as the module-level model.
    fold_model = AutoModelForSequenceClassification.from_pretrained(
        model.config.name_or_path,
        num_labels=model.config.num_labels,
    )

    # Trainer with its own early-stopping callback mirroring the shared settings
    trainer = Trainer(
        model=fold_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[
            EarlyStoppingCallback(
                early_stopping_patience=early_stopping.early_stopping_patience,
                early_stopping_threshold=early_stopping.early_stopping_threshold,
            )
        ],
    )

    # Train and evaluate the model
    trainer.train()
    predictions = trainer.predict(val_dataset)
    pred_probs = torch.nn.functional.softmax(torch.tensor(predictions.predictions), dim=-1).numpy()
    pred_labels = np.argmax(pred_probs, axis=1)
    positive_probs = pred_probs[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(val_labels, pred_labels)
    # zero_division=0 keeps the score at 0 (the default outcome) without a
    # warning when a fold has no positive labels or predictions.
    f1 = f1_score(val_labels, pred_labels, zero_division=0)
    rmse = np.sqrt(mean_squared_error(val_labels, pred_labels))

    # Ranking metrics are only defined when both classes are present.
    if len(np.unique(val_labels)) == 2:
        auc_score = roc_auc_score(val_labels, positive_probs)
        precision, recall, _ = precision_recall_curve(val_labels, positive_probs)
        pr_auc = auc(recall, precision)
        fpr, tpr, _ = roc_curve(val_labels, positive_probs)

        # Interpolate to mean_fpr for the average ROC curve. fpr can contain
        # several zeros, which makes the interpolated value at 0 ambiguous,
        # so pin the curve to the origin explicitly.
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        auc_scores.append(auc_score)
        pr_auc_scores.append(pr_auc)
        fold_roc_curves.append((fold_number, fpr, tpr, auc_score))
    else:
        auc_scores.append(np.nan)
        pr_auc_scores.append(np.nan)

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)
    rmse_scores.append(rmse)

    # Binary Classification Scatter Plot
    plt.figure()
    plt.scatter(val_labels, positive_probs, alpha=0.5, color='blue', label="Predicted Probability")
    plt.xlabel("True Label")
    plt.ylabel("Predicted Probability")
    plt.title("Binary Classification Scatter Plot")
    plt.legend()
    plt.show()

# Plot ROC Curve for each fold on a single labeled figure
if fold_roc_curves:
    plt.figure()
    for curve_fold, curve_fpr, curve_tpr, curve_auc in fold_roc_curves:
        plt.plot(curve_fpr, curve_tpr, alpha=0.3, label=f'Fold {curve_fold} ROC AUC={curve_auc:.2f}')
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random classifier
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve Per Fold')
    plt.legend(loc='best')
    plt.show()


# Output cross-validation results

# Summary metrics; nanmean skips folds whose AUC values were recorded as NaN.
print("Cross-Validation Results:")
print(f"Mean Accuracy: {np.nanmean(accuracy_scores):.4f}")
print(f"Mean F1 Score: {np.nanmean(f1_scores):.4f}")
print(f"Mean RMSE: {np.nanmean(rmse_scores):.4f}")
print(f"Mean AUC-ROC: {np.nanmean(auc_scores):.4f}")
print(f"Mean Precision-Recall AUC: {np.nanmean(pr_auc_scores):.4f}")

# Plot average ROC curve across folds.
# A stray duplicated fragment used to sit here: it opened an extra empty
# figure and bound `mean_tpr` to the np.mean function itself.
if tprs:
    plt.figure()
    mean_tpr = np.mean(tprs, axis=0)
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color='b', label=f'Average ROC (AUC={mean_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random classifier
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Average ROC Curve Across Folds')
    plt.legend(loc='best')
    plt.show()
else:
    # tprs only receives an entry for folds whose validation labels contain
    # both classes; with none there is nothing to average.
    print("Average ROC curve skipped: no fold had both classes in its validation split.")


Comments

Popular posts from this blog

"How to maintain or retain tabs in same tab after button click events or postback?" using JQuery in ASP.NET C#

In this post I'll share details about "How to maintain or retain tabs in same tab after button click events or postback?" Step 1: You need to download the jQuery and jQuery UI JavaScript libraries from this site: http://jqueryui.com/ Step 2: As usual, create an ASP.NET website from the Visual Studio IDE and add the jQuery and jQuery UI plugins in the header section of the aspx page. Step 3: Add a HiddenField control inside the aspx page, which is very useful for retaining the tab on the same page. Step 4: Use the HiddenField ID in the jQuery code to indicate the current tab index. Step 5: In the code-behind, use an enumeration to give the tab index values as user-defined variables. Step 6: Use the enum values in every button click event on the different tabs to check that the tab is retained in the same tab. Further, here I'll give the code details and snapshot pictures: 1. Default.aspx: Design Page First Second Third ...

Login and Registration forms in C# windows application with Back end Microsoft SQL Server for data access

In this article, I'm going to share how to make login and registration forms with an MS SQL database: 1. Flow Chart Logic 2. Normal Features 3. Form Designs Login Form Design Sign in Form Design Password Retrieve Form 4. Database Design and SQL queries and Stored Procedure Create new Database as "schooldata" create table registerdata (  ID int identity,  Username nvarchar(100),  Password nvarchar(100),  Fullname  nvarchar(100),  MobileNO nvarchar(100),  EmailID nvarchar(100)  ) select * from registerdata create procedure regis (  @Username as nvarchar(100),  @Password as nvarchar(100),  @Fullname as nvarchar(100),  @MobileNO as nvarchar(100),  @EmailID as nvarchar(100)  ) as begin insert into registerdata (Username, Password, Fullname, MobileNO,EmailID) values (@Username, @Password, @Fullname, @MobileNO, @EmailID) ...

Guidewire Related Interview Question and answers part 1

common Guidewire questions and answers 20 Guidewire BC Q&A Top 100 Guidewire Interview FAQ Guidewire Claimcenter 20 Interview Questions Guidewire Rating concepts