LLM KRL English to Tamil Version 1

LLM KRL English to Tamil Version 1 - Bert model used

import pandas as pd
import torch
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_squared_error, roc_auc_score, precision_recall_curve, auc, roc_curve
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import numpy as np
import matplotlib.pyplot as plt
import random

# Seed corpus for the expansion step below. Each row keeps one example
# together as (English sentence, Tamil translation, respect label); a label of
# None marks a sentence without a respect context. The rows are transposed
# into the three parallel lists the rest of the script reads.
sample_data = dict(
    zip(
        ("text_english", "text_tamil", "respect"),
        map(
            list,
            zip(
                ("I am very happy to meet you", "உங்களை சந்திப்பதில் மிகவும் மகிழ்ச்சி", "respect"),
                ("I am disappointed with your work", "உங்கள் வேலைக்கு நான் வருத்தப்படுகிறேன்", None),
                ("You have done an excellent job, well done", "நீங்கள் சிறந்த வேலை செய்தீர்கள், நல்லது", "respect"),
                ("This is not good, I expected better", "இது நல்லதல்ல, நான் நல்லவை எதிர்பார்த்தேன்", None),
                ("Thank you very much for your support", "உங்கள் உதவிக்காக மிகவும் நன்றி", "respect"),
                ("I don't like your attitude", "உங்கள் அணுகுமுறை எனக்கு பிடிக்கவில்லை", None),
                ("I'm grateful for your guidance", "உங்கள் வழிகாட்டலுக்கு நான் நன்றி கூறுகிறேன்", "respect"),
                ("Your work lacks quality", "உங்கள் வேலை தரம் குறைவாக உள்ளது", None),
                ("Well done, you've made us proud", "நல்ல செய்தி, நீங்கள் எங்களை பெருமைப்படுத்தினீர்கள்", "respect"),
                ("I appreciate your effort", "உங்கள் முயற்சியை நான் பாராட்டுகிறேன்", "respect"),
                ("You are an inspiration", "நீங்கள் ஒரு பேரனுபவம்", "respect"),
                ("I regret working with you", "உங்களுடன் பணியாற்றியது வருத்தமாக உள்ளது", None),
            ),
        ),
    )
)

# Generate 100 samples by duplicating and varying the sentences
def generate_data(sample_data, n_samples=100):
    """Expand the seed sentences into exactly ``n_samples`` rows.

    The seed rows are cycled in order until ``n_samples`` rows have been
    produced. Each English sentence independently gets ", thank you!"
    appended with probability 0.5 (one ``random.random()`` draw per row);
    the Tamil text and the respect label are copied unchanged.

    Args:
        sample_data: Mapping with the parallel lists "text_english",
            "text_tamil" and "respect".
        n_samples: Number of rows to generate. Values <= 0 give empty lists.

    Returns:
        A dict with the same three keys, each holding ``n_samples`` items.

    Raises:
        ValueError: If rows are requested but ``sample_data`` has no
            sentences to draw from.
    """
    expanded_data = {"text_english": [], "text_tamil": [], "respect": []}
    if n_samples <= 0:
        return expanded_data

    seed_count = len(sample_data["text_english"])
    if seed_count == 0:
        raise ValueError("sample_data must contain at least one sentence")

    # Cycle with a modulo rather than repeating whole passes: a floor-divided
    # pass count silently drops the remainder (100 rows over 12 seeds -> 96).
    for position in range(n_samples):
        i = position % seed_count
        suffix = ", thank you!" if random.random() > 0.5 else ""
        expanded_data["text_english"].append(sample_data["text_english"][i] + suffix)
        expanded_data["text_tamil"].append(sample_data["text_tamil"][i])
        expanded_data["respect"].append(sample_data["respect"][i])
    return expanded_data

# Expand the seed sentences and load the result into a DataFrame.
data = generate_data(sample_data, 100)
df = pd.DataFrame(data)

# Model-facing table: a 1-based ID, the source and translated sentences, and a
# binary target that is 1 where the label equals 'respect' and 0 otherwise.
input_dataset = pd.DataFrame(
    {
        "ID": range(1, len(df) + 1),
        "LanguageSource": df["text_english"],
        "Language_Translated": df["text_tamil"],
        "RespectContext": df["respect"].eq("respect").astype("int64"),
    }
)

# Single text feature: English sentence, one space, Tamil sentence.
input_dataset["combined_text"] = input_dataset["LanguageSource"].str.cat(
    input_dataset["Language_Translated"], sep=" "
)

# Define function to convert text data to dataset for transformers
class RespectDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values; this script passes the tokenizer output.
        labels: Per-example labels, indexable in the same order as the
            encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, label under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

# Two-class sequence classifier and its tokenizer, loaded from the same
# pretrained checkpoint.
# NOTE(review): 'bert-base-uncased' is published as an English checkpoint,
# while combined_text also carries Tamil text - confirm this choice is intended.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Training configuration: evaluation and checkpointing both run once per epoch
# (the two strategies are kept identical), and the best checkpoint is restored
# when training finishes.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores, f1_scores, rmse_scores, auc_scores, pr_auc_scores = [], [], [], [], []
mean_fpr = np.linspace(0, 1, 100)  # Common FPR grid for averaging ROC curves across folds
tprs = []
fold_roc_curves = []  # (fold number, fpr, tpr, auc) kept for the per-fold ROC figure

# Snapshot the initial weights so that every fold starts from the same state.
# Without the reset, the one shared model keeps training from fold to fold and
# is evaluated on rows it was already fitted on in earlier folds.
initial_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

# NOTE(review): input_dataset is built by generate_data, which repeats the seed
# sentences, so a row-level KFold can place copies of a validation sentence in
# the training split - consider grouping the split by seed sentence.
for fold_number, (train_index, val_index) in enumerate(kf.split(input_dataset), start=1):
    model.load_state_dict(initial_state)

    train_texts = input_dataset.iloc[train_index]['combined_text'].tolist()
    val_texts = input_dataset.iloc[val_index]['combined_text'].tolist()
    train_labels = input_dataset.iloc[train_index]['RespectContext'].tolist()
    val_labels = input_dataset.iloc[val_index]['RespectContext'].tolist()

    # Tokenize the data
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

    # Create datasets
    train_dataset = RespectDataset(train_encodings, train_labels)
    val_dataset = RespectDataset(val_encodings, val_labels)

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[early_stopping]
    )

    # Train and evaluate the model
    trainer.train()
    predictions = trainer.predict(val_dataset)
    # Logits -> class probabilities; column 1 is the positive ("respect") class.
    pred_probs = torch.nn.functional.softmax(torch.tensor(predictions.predictions), dim=-1).numpy()
    pred_labels = np.argmax(pred_probs, axis=1)

    # Calculate metrics
    accuracy = accuracy_score(val_labels, pred_labels)
    f1 = f1_score(val_labels, pred_labels)
    rmse = np.sqrt(mean_squared_error(val_labels, pred_labels))

    # Ranking metrics are only defined when the fold contains both classes.
    if len(np.unique(val_labels)) == 2:
        auc_score = roc_auc_score(val_labels, pred_probs[:, 1])
        precision, recall, _ = precision_recall_curve(val_labels, pred_probs[:, 1])
        pr_auc = auc(recall, precision)
        fpr, tpr, _ = roc_curve(val_labels, pred_probs[:, 1])

        # Interpolate to mean_fpr for average ROC curve; pin the first point
        # to TPR 0 so every interpolated curve starts at the origin.
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        auc_scores.append(auc_score)
        pr_auc_scores.append(pr_auc)

        # Defer drawing: plotting here would land on an implicit figure that
        # the scatter plot's plt.show() below flushes without a legend.
        fold_roc_curves.append((fold_number, fpr, tpr, auc_score))
    else:
        auc_scores.append(np.nan)
        pr_auc_scores.append(np.nan)

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)
    rmse_scores.append(rmse)

    # Binary Classification Scatter Plot
    plt.figure()
    plt.scatter(val_labels, pred_probs[:, 1], alpha=0.5, color='blue', label="Predicted Probability")
    plt.xlabel("True Label")
    plt.ylabel("Predicted Probability")
    plt.title("Binary Classification Scatter Plot")
    plt.legend()
    plt.show()

# Plot ROC Curve for each fold on one shared, labelled figure
if fold_roc_curves:
    plt.figure()
    for curve_fold, curve_fpr, curve_tpr, curve_auc in fold_roc_curves:
        plt.plot(curve_fpr, curve_tpr, alpha=0.3, label=f'Fold {curve_fold} ROC AUC={curve_auc:.2f}')
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random classifier
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve per Fold')
    plt.legend(loc='best')
    plt.show()

# Output cross-validation results (nanmean skips folds whose AUC metrics were
# recorded as NaN).
print("Cross-Validation Results:")
print(f"Mean Accuracy: {np.nanmean(accuracy_scores):.4f}")
print(f"Mean F1 Score: {np.nanmean(f1_scores):.4f}")
print(f"Mean RMSE: {np.nanmean(rmse_scores):.4f}")
print(f"Mean AUC-ROC: {np.nanmean(auc_scores):.4f}")
print(f"Mean Precision-Recall AUC: {np.nanmean(pr_auc_scores):.4f}")

# Plot average ROC curve across folds. tprs only holds curves for folds that
# produced one, so skip the figure when it is empty instead of failing.
if tprs:
    mean_tpr = np.mean(tprs, axis=0)
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.figure()
    plt.plot(mean_fpr, mean_tpr, color='b', label=f'Average ROC (AUC={mean_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random classifier
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Average ROC Curve Across Folds')
    plt.legend(loc='best')
    plt.show()
else:
    print("Average ROC curve skipped: no fold produced a ROC curve.")

"How to maintain or retain tabs in same tab after button click events or postback?" using JQuery in ASP.NET C#

In this post I'll share details about "How to maintain or retain tabs in same tab after button click events or postback?" Step 1: Download the jQuery and jQuery UI JavaScript libraries from http://jqueryui.com/ Step 2: As usual, create an ASP.NET website in the Visual Studio IDE and add the jQuery and jQuery UI plugins in the header section of the aspx page. Step 3: Add a HiddenField control inside the aspx page; it is what lets the page retain the current tab. Step 4: Use the HiddenField ID in the jQuery code to hold the current tab index. Step 5: In the code-behind, use an enumeration to give each tab index a user-defined name. Step 6: Use the enum values in every button click event on the different tabs so that the same tab stays selected after the click. Further, here are the code details and snapshot pictures: 1. Default.aspx: Design Page First Second Third ...

Kumaran1987

Search This Blog

LLM KRL English to Tamil Version 1 - Bert model used

Labels

Comments

Post a Comment

Popular posts from this blog

"How to maintain or retain tabs in same tab after button click events or postback?" using JQuery in ASP.NET C#

Login and Registration forms in C# windows application with Back end Microsoft SQL Server for data access

JSP and Servlet Form Submission without page refresh show results on the same page using Jquery AJAX