Below is a Python implementation of a machine learning pipeline class that trains LightGBM, XGBoost, CatBoost, and AdaBoost classifiers and uses RandomizedSearchCV to find the best hyperparameters for each. The output, including the best hyperparameters and test accuracy for each model, is saved in JSON format.
Prerequisites
Install required libraries:
pip install lightgbm xgboost catboost scikit-learn pandas numpy
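To confirm the installation worked, a quick sanity check (each of these packages exposes a __version__ attribute):

import lightgbm, xgboost, catboost, sklearn

# If any of these imports fails, the corresponding package is missing or broken.
print("lightgbm:", lightgbm.__version__)
print("xgboost:", xgboost.__version__)
print("catboost:", catboost.__version__)
print("scikit-learn:", sklearn.__version__)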
Code: Machine Learning Pipeline Class
import json
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
class MLBoostPipeline:
    def __init__(self, random_state=42, n_iter=20, cv=5):
        self.random_state = random_state
        self.n_iter = n_iter
        self.cv = cv
        self.models = {
            # verbose=-1 silences LightGBM's per-iteration logging
            "LightGBM": LGBMClassifier(random_state=self.random_state, verbose=-1),
            # use_label_encoder was deprecated and later removed from XGBoost, so it is omitted here
            "XGBoost": XGBClassifier(random_state=self.random_state, eval_metric='logloss'),
            "CatBoost": CatBoostClassifier(random_state=self.random_state, verbose=0),
            "AdaBoost": AdaBoostClassifier(random_state=self.random_state)
        }
        self.param_grids = {
            "LightGBM": {
                "num_leaves": [20, 31, 40],
                "learning_rate": [0.01, 0.05, 0.1],
                "n_estimators": [50, 100, 200]
            },
            "XGBoost": {
                "n_estimators": [50, 100, 200],
                "max_depth": [3, 6, 10],
                "learning_rate": [0.01, 0.1, 0.2]
            },
            "CatBoost": {
                "iterations": [50, 100, 200],
                "learning_rate": [0.01, 0.1, 0.2],
                "depth": [4, 6, 10]
            },
            "AdaBoost": {
                "n_estimators": [50, 100, 200],
                "learning_rate": [0.5, 1.0, 1.5]
            }
        }
        self.best_params = {}
        self.best_estimators = {}
    def fit_and_tune(self, X_train, y_train):
        for model_name, model in self.models.items():
            print(f"Optimizing {model_name}...")
            param_grid = self.param_grids[model_name]
            # Note: if n_iter exceeds the number of grid combinations,
            # scikit-learn warns and falls back to an exhaustive search.
            search = RandomizedSearchCV(
                estimator=model,
                param_distributions=param_grid,
                n_iter=self.n_iter,
                cv=self.cv,
                scoring='accuracy',
                random_state=self.random_state,
                verbose=1,
                n_jobs=-1
            )
            search.fit(X_train, y_train)
            self.best_params[model_name] = search.best_params_
            # best_estimator_ is already refit on the full training set (refit=True by default),
            # so it can be reused directly at evaluation time
            self.best_estimators[model_name] = search.best_estimator_
            print(f"Best parameters for {model_name}: {search.best_params_}")
        return self.best_params
    def evaluate_models(self, X_test, y_test):
        results = {}
        # Use the tuned estimators stored by fit_and_tune; the original version
        # refit here with X_train/y_train, which are not in scope in this method.
        for model_name, model in self.best_estimators.items():
            print(f"Evaluating {model_name}...")
            y_pred = model.predict(X_test)
            # Cast to a built-in float so the value is JSON-serializable
            accuracy = round(float(accuracy_score(y_test, y_pred)), 4)
            results[model_name] = {
                "best_params": self.best_params[model_name],
                "accuracy": accuracy
            }
        return results
    def save_results(self, results, output_file="model_results.json"):
        with open(output_file, "w") as f:
            json.dump(results, f, indent=4)
        print(f"Results saved to {output_file}")
# Example usage
if __name__ == "__main__":
    # Replace with your dataset
    from sklearn.datasets import load_iris
    data = load_iris()
    X = data.data
    y = data.target

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the pipeline
    pipeline = MLBoostPipeline(random_state=42, n_iter=10, cv=3)

    # Optimize hyperparameters
    best_params = pipeline.fit_and_tune(X_train, y_train)

    # Evaluate models (uses the tuned estimators stored during fit_and_tune)
    results = pipeline.evaluate_models(X_test, y_test)

    # Save results to JSON
    pipeline.save_results(results)
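The example above uses the iris toy dataset. To run the pipeline on your own tabular data instead, here is a minimal sketch along the same lines, assuming the MLBoostPipeline class defined above is in scope; the CSV filename and the 'target' column name are placeholders, not part of the original code:

import pandas as pd
from sklearn.model_selection import train_test_split

# Hypothetical dataset: a CSV with numeric feature columns and a 'target' label column.
df = pd.read_csv("your_dataset.csv")
X = df.drop(columns=["target"]).values
y = df["target"].values

# stratify=y keeps the class proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipeline = MLBoostPipeline(random_state=42, n_iter=10, cv=3)
pipeline.fit_and_tune(X_train, y_train)
results = pipeline.evaluate_models(X_test, y_test)
pipeline.save_results(results, output_file="custom_results.json")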
Features of the Code
- Supported Models:
  - LightGBM
  - XGBoost
  - CatBoost
  - AdaBoost
- Hyperparameter Optimization:
  - Uses RandomizedSearchCV for hyperparameter tuning.
  - Configurable parameter grids for each model.
- Evaluation:
  - Evaluates models on test data.
  - Returns the accuracy and best hyperparameters.
- JSON Output:
  - Saves the results, including the best hyperparameters and test accuracies, in a JSON file.
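Because the results file is plain JSON, it can be reloaded later to compare runs; a minimal sketch:

import json

# Reload a previously saved results file and print each model's test accuracy.
with open("model_results.json") as f:
    results = json.load(f)

for model_name, info in results.items():
    print(f"{model_name}: accuracy={info['accuracy']}, best_params={info['best_params']}")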
Example Output (JSON Format)
{
    "LightGBM": {
        "best_params": {
            "num_leaves": 31,
            "learning_rate": 0.05,
            "n_estimators": 100
        },
        "accuracy": 0.9667
    },
    "XGBoost": {
        "best_params": {
            "n_estimators": 200,
            "max_depth": 6,
            "learning_rate": 0.1
        },
        "accuracy": 0.9667
    },
    "CatBoost": {
        "best_params": {
            "iterations": 200,
            "learning_rate": 0.1,
            "depth": 6
        },
        "accuracy": 0.9667
    },
    "AdaBoost": {
        "best_params": {
            "n_estimators": 100,
            "learning_rate": 1.0
        },
        "accuracy": 0.9333
    }
}
Customization Options
- Parameter Grids:
  - Update the self.param_grids dictionary with additional parameters or wider value ranges (see the sketch after this list).
- Scoring Metrics:
  - Change scoring='accuracy' in fit_and_tune to another metric such as 'f1' or 'roc_auc'; for a multiclass dataset like iris, use a multiclass variant such as 'f1_macro' or 'roc_auc_ovr'.
- Ensemble Methods:
  - Add more models or custom ensembles if needed (also shown in the sketch below).
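As a concrete illustration of the first and third points, here is a sketch that widens the LightGBM grid and registers an extra model. The GradientBoostingClassifier entry and the specific values are illustrative choices, not part of the original pipeline:

from sklearn.ensemble import GradientBoostingClassifier

pipeline = MLBoostPipeline(random_state=42, n_iter=10, cv=3)

# Widen the LightGBM grid with extra values and an additional parameter.
pipeline.param_grids["LightGBM"]["num_leaves"] = [15, 31, 63, 127]
pipeline.param_grids["LightGBM"]["min_child_samples"] = [5, 20, 50]

# Register another scikit-learn-compatible model with its own grid;
# fit_and_tune will pick it up automatically since it iterates over self.models.
pipeline.models["GradientBoosting"] = GradientBoostingClassifier(random_state=42)
pipeline.param_grids["GradientBoosting"] = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [2, 3, 4],
}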