# LLM KRL English-to-Tamil, Version 1 — BERT model used for respect classification.
# Originally published as a Kaggle notebook.
import pandas as pd
import torch
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_squared_error, roc_auc_score, precision_recall_curve, auc, roc_curve
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import numpy as np
import matplotlib.pyplot as plt
import random
# Expand dataset with 100 samples by duplicating and adding slight variations
# Seed data: 12 parallel English/Tamil sentence pairs. The "respect" list holds
# "respect" for respectful sentences and None for disrespectful ones; downstream
# code maps this to a binary label (1 = respect, 0 = otherwise).
sample_data = {
    "text_english": [
        "I am very happy to meet you", "I am disappointed with your work",
        "You have done an excellent job, well done", "This is not good, I expected better",
        "Thank you very much for your support", "I don't like your attitude",
        "I'm grateful for your guidance", "Your work lacks quality",
        "Well done, you've made us proud", "I appreciate your effort",
        "You are an inspiration", "I regret working with you"
    ],
    # Tamil translations, index-aligned with text_english.
    "text_tamil": [
        "உங்களை சந்திப்பதில் மிகவும் மகிழ்ச்சி", "உங்கள் வேலைக்கு நான் வருத்தப்படுகிறேன்",
        "நீங்கள் சிறந்த வேலை செய்தீர்கள், நல்லது", "இது நல்லதல்ல, நான் நல்லவை எதிர்பார்த்தேன்",
        "உங்கள் உதவிக்காக மிகவும் நன்றி", "உங்கள் அணுகுமுறை எனக்கு பிடிக்கவில்லை",
        "உங்கள் வழிகாட்டலுக்கு நான் நன்றி கூறுகிறேன்", "உங்கள் வேலை தரம் குறைவாக உள்ளது",
        "நல்ல செய்தி, நீங்கள் எங்களை பெருமைப்படுத்தினீர்கள்", "உங்கள் முயற்சியை நான் பாராட்டுகிறேன்",
        "நீங்கள் ஒரு பேரனுபவம்", "உங்களுடன் பணியாற்றியது வருத்தமாக உள்ளது"
    ],
    # Label per sentence pair: "respect" or None, index-aligned with the texts.
    "respect": [
        "respect", None, "respect", None, "respect",
        None, "respect", None, "respect", "respect",
        "respect", None
    ]
}
# Generate n_samples samples by duplicating and varying the seed sentences
def generate_data(sample_data, n_samples=100):
    """Expand the seed dataset to exactly ``n_samples`` rows.

    Cycles through the seed sentence pairs in order, randomly appending
    ", thank you!" to the English text roughly half the time to create
    slight variations. The Tamil text and respect label are copied as-is.

    Args:
        sample_data: dict with index-aligned lists under the keys
            "text_english", "text_tamil" and "respect".
        n_samples: number of rows to produce (default 100).

    Returns:
        dict with the same three keys, each holding ``n_samples`` entries.
    """
    expanded_data = {"text_english": [], "text_tamil": [], "respect": []}
    n_seed = len(sample_data["text_english"])
    # BUG FIX: the original ran n_samples // n_seed complete passes, which
    # silently produced fewer rows than requested (96 instead of 100 for the
    # 12 seed sentences). Cycling with modulo yields exactly n_samples rows.
    for k in range(n_samples):
        i = k % n_seed
        english_text = sample_data["text_english"][i] + (", thank you!" if random.random() > 0.5 else "")
        expanded_data["text_english"].append(english_text)
        expanded_data["text_tamil"].append(sample_data["text_tamil"][i])
        expanded_data["respect"].append(sample_data["respect"][i])
    return expanded_data
# Materialize the expanded samples as a DataFrame.
data = generate_data(sample_data, 100)
df = pd.DataFrame(data)

# Assemble the model-input table: sequential IDs, both language columns, and a
# binary respect flag (1 where the label equals "respect", 0 otherwise — None
# rows therefore become 0).
input_dataset = pd.DataFrame({
    "ID": range(1, len(df) + 1),
    "LanguageSource": df["text_english"],
    "Language_Translated": df["text_tamil"],
    "RespectContext": (df["respect"] == "respect").astype(int),
})

# Concatenate the English and Tamil text into one string so the classifier
# sees both languages in a single input sequence.
input_dataset["combined_text"] = (
    input_dataset["LanguageSource"] + " " + input_dataset["Language_Translated"]
)
# Dataset wrapper that feeds tokenizer output to the transformers Trainer.
class RespectDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with integer labels for Trainer consumption."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example sequences
        # (e.g. input_ids, attention_mask); labels: index-aligned ints.
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert each encoding field for this example to a tensor and attach
        # the target under "labels", the key the Trainer expects.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
# Initialize tokenizer and model
# Binary sequence classification head (num_labels=2) on top of BERT.
# NOTE(review): 'bert-base-uncased' is an English-only checkpoint, but half of
# each combined input is Tamil — a multilingual checkpoint would likely
# tokenize the Tamil text far better; confirm this choice is intentional.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Define training arguments with Early Stopping
training_args = TrainingArguments(
    output_dir='./results',          # checkpoints are written here
    eval_strategy="epoch",           # Updated for consistency with save strategy
    save_strategy="epoch",           # Ensure saving happens at the end of each epoch
    num_train_epochs=10,             # upper bound; early stopping may end sooner
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True      # Keep the best model based on evaluation metric
    # NOTE(review): no metric_for_best_model is set, so "best" defaults to the
    # lowest evaluation loss — confirm that is the intended selection criterion.
)
# Early stopping callback: abort training after 3 consecutive evaluations
# without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
# K-Fold Cross Validation
# 5 shuffled folds with a fixed seed for reproducible splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Per-fold metric accumulators; NaN is appended for AUC metrics when a fold's
# validation split does not contain both classes.
accuracy_scores, f1_scores, rmse_scores, auc_scores, pr_auc_scores = [], [], [], [], []
mean_fpr = np.linspace(0, 1, 100) # For averaging ROC curve across folds
tprs = []
# Cross-validation loop: train/evaluate a fresh model on each fold and collect
# accuracy, F1, RMSE, ROC-AUC and PR-AUC per fold.
for train_index, val_index in kf.split(input_dataset):
    # Split combined text and binary labels for this fold.
    train_texts = input_dataset.iloc[train_index]['combined_text'].tolist()
    val_texts = input_dataset.iloc[val_index]['combined_text'].tolist()
    train_labels = input_dataset.iloc[train_index]['RespectContext'].tolist()
    val_labels = input_dataset.iloc[val_index]['RespectContext'].tolist()

    # Tokenize the data (pad/truncate to 128 tokens).
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

    # Create datasets
    train_dataset = RespectDataset(train_encodings, train_labels)
    val_dataset = RespectDataset(val_encodings, val_labels)

    # BUG FIX: re-initialize the model for every fold. The original reused one
    # model instance across all folds, so each fold after the first continued
    # training weights that had already been fit on (and validated against)
    # earlier folds — cross-fold information leakage that inflates CV scores.
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Trainer for this fold, with early stopping.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[early_stopping]
    )

    # Train, then predict on the held-out fold.
    trainer.train()
    predictions = trainer.predict(val_dataset)
    # Softmax over logits -> class probabilities; argmax -> hard labels.
    pred_probs = torch.nn.functional.softmax(torch.tensor(predictions.predictions), dim=-1).numpy()
    pred_labels = np.argmax(pred_probs, axis=1)

    # Calculate fold metrics.
    accuracy = accuracy_score(val_labels, pred_labels)
    f1 = f1_score(val_labels, pred_labels)
    rmse = np.sqrt(mean_squared_error(val_labels, pred_labels))

    # ROC/PR metrics are only defined when both classes appear in the fold.
    if len(np.unique(val_labels)) == 2:
        auc_score = roc_auc_score(val_labels, pred_probs[:, 1])
        precision, recall, _ = precision_recall_curve(val_labels, pred_probs[:, 1])
        pr_auc = auc(recall, precision)
        fpr, tpr, _ = roc_curve(val_labels, pred_probs[:, 1])
        # Interpolate onto the shared FPR grid so fold curves can be averaged.
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        auc_scores.append(auc_score)
        pr_auc_scores.append(pr_auc)
        # Plot this fold's ROC curve.
        plt.plot(fpr, tpr, alpha=0.3, label=f'Fold ROC AUC={auc_score:.2f}')
    else:
        auc_scores.append(np.nan)
        pr_auc_scores.append(np.nan)

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)
    rmse_scores.append(rmse)
# Binary Classification Scatter Plot
# NOTE(review): as placed after the CV loop, this plots val_labels/pred_probs
# from the LAST fold only — if a per-fold plot was intended, this belongs
# inside the loop body. Confirm intent.
plt.figure()
plt.scatter(val_labels, pred_probs[:, 1], alpha=0.5, color='blue', label="Predicted Probability")
plt.xlabel("True Label")
plt.ylabel("Predicted Probability")
plt.title("Binary Classification Scatter Plot")
plt.legend()
plt.show()
# Output cross-validation results: the mean of each metric across folds,
# ignoring NaN entries from single-class folds.
print("Cross-Validation Results:")
metric_table = (
    ("Accuracy", accuracy_scores),
    ("F1 Score", f1_scores),
    ("RMSE", rmse_scores),
    ("AUC-ROC", auc_scores),
    ("Precision-Recall AUC", pr_auc_scores),
)
for metric_name, fold_scores in metric_table:
    print(f"Mean {metric_name}: {np.nanmean(fold_scores):.4f}")
# Plot average ROC curve across folds
# BUG FIX: the original contained a truncated duplicate of this section —
# `plt.figure()` followed by `mean_tpr = np.mean` with no call. That opened a
# stray empty figure and bound the np.mean function object (not a result) to
# mean_tpr before the real computation overwrote it. Only the complete,
# correct section is kept.
plt.figure()
mean_tpr = np.mean(tprs, axis=0)           # average interpolated TPR over folds
mean_auc = auc(mean_fpr, mean_tpr)         # AUC of the averaged curve
plt.plot(mean_fpr, mean_tpr, color='b', label=f'Average ROC (AUC={mean_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--') # Diagonal line for random classifier
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Average ROC Curve Across Folds')
plt.legend(loc='best')
plt.show()
# Comments
# Post a Comment
# Share this with your friends