
Tuning Multiple Ensemble Models to Find the Best Hyperparameters

Below is a Python implementation of a machine learning pipeline class that supports LightGBM, XGBoost, CatBoost, and AdaBoost and uses RandomizedSearchCV to find the best hyperparameters for each model. The tuned models are then evaluated on held-out data, and the best hyperparameters and test accuracies are saved in JSON format.



Prerequisites

Install required libraries:

pip install lightgbm xgboost catboost scikit-learn pandas numpy

Code: Machine Learning Pipeline Class

import json
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

class MLBoostPipeline:
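    """Tune LightGBM, XGBoost, CatBoost, and AdaBoost with RandomizedSearchCV,
    store each model's best hyperparameters, and evaluate the refitted
    estimators on held-out test data."""
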
    def __init__(self, random_state=42, n_iter=20, cv=5):
        self.random_state = random_state
        self.n_iter = n_iter
        self.cv = cv
        self.models = {
            "LightGBM": LGBMClassifier(random_state=self.random_state),
            "XGBoost": XGBClassifier(random_state=self.random_state, use_label_encoder=False, eval_metric='logloss'),
            "CatBoost": CatBoostClassifier(random_state=self.random_state, verbose=0),
            "AdaBoost": AdaBoostClassifier(random_state=self.random_state)
        }
        self.param_grids = {
            "LightGBM": {
                "num_leaves": [20, 31, 40],
                "learning_rate": [0.01, 0.05, 0.1],
                "n_estimators": [50, 100, 200]
            },
            "XGBoost": {
                "n_estimators": [50, 100, 200],
                "max_depth": [3, 6, 10],
                "learning_rate": [0.01, 0.1, 0.2]
            },
            "CatBoost": {
                "iterations": [50, 100, 200],
                "learning_rate": [0.01, 0.1, 0.2],
                "depth": [4, 6, 10]
            },
            "AdaBoost": {
                "n_estimators": [50, 100, 200],
                "learning_rate": [0.5, 1.0, 1.5]
            }
        }
        self.best_params = {}
        self.best_estimators = {}

    def fit_and_tune(self, X_train, y_train):
        for model_name, model in self.models.items():
            print(f"Optimizing {model_name}...")
            param_grid = self.param_grids[model_name]
            search = RandomizedSearchCV(
                estimator=model,
                param_distributions=param_grid,
                n_iter=self.n_iter,
                cv=self.cv,
                scoring='accuracy',
                random_state=self.random_state,
                verbose=1,
                n_jobs=-1
            )
            search.fit(X_train, y_train)
            self.best_params[model_name] = search.best_params_
            # RandomizedSearchCV (with the default refit=True) refits the best
            # configuration on the full training set; keep that estimator so
            # evaluate_models does not need to retrain.
            self.best_estimators[model_name] = search.best_estimator_
            print(f"Best parameters for {model_name}: {search.best_params_}")
        return self.best_params

    def evaluate_models(self, X_test, y_test):
        results = {}
        # Use the estimators refitted by RandomizedSearchCV in fit_and_tune;
        # the original version retrained here and referenced X_train/y_train,
        # which are not defined in this method's scope.
        for model_name, model in self.best_estimators.items():
            print(f"Evaluating {model_name}...")
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            results[model_name] = {
                "best_params": self.best_params[model_name],
                "accuracy": accuracy
            }
        return results

    def save_results(self, results, output_file="model_results.json"):
        with open(output_file, "w") as f:
            json.dump(results, f, indent=4)
        print(f"Results saved to {output_file}")

# Example usage
if __name__ == "__main__":
    # Replace with your dataset
    from sklearn.datasets import load_iris
    data = load_iris()
    X = data.data
    y = data.target

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the pipeline
    pipeline = MLBoostPipeline(random_state=42, n_iter=10, cv=3)

    # Optimize hyperparameters
    best_params = pipeline.fit_and_tune(X_train, y_train)

    # Evaluate models
    results = pipeline.evaluate_models(X_test, y_test)

    # Save results to JSON
    pipeline.save_results(results)
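
Note on n_iter: every grid above is list-valued, so RandomizedSearchCV samples parameter combinations without replacement. LightGBM, XGBoost, and CatBoost each have 27 combinations and AdaBoost only 9, so when n_iter exceeds a grid's size, scikit-learn emits a warning and simply evaluates the full grid. A minimal sketch for checking grid sizes up front, using scikit-learn's ParameterGrid:

from sklearn.model_selection import ParameterGrid

pipeline = MLBoostPipeline()
for name, grid in pipeline.param_grids.items():
    # ParameterGrid enumerates every combination of a list-valued grid
    print(f"{name}: {len(ParameterGrid(grid))} combinations")
# Expected: LightGBM: 27, XGBoost: 27, CatBoost: 27, AdaBoost: 9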

Features of the Code

  1. Supported Models:

    • LightGBM
    • XGBoost
    • CatBoost
    • AdaBoost
  2. Hyperparameter Optimization:

    • Uses RandomizedSearchCV for hyperparameter tuning.
    • Configurable parameter grids for each model.
  3. Evaluation:

    • Evaluates models on test data.
    • Returns the accuracy and best hyperparameters.
  4. JSON Output:

    • Saves the results, including the best hyperparameters and test accuracies, to a JSON file (a sketch for reloading them follows this list).
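
Because the results are plain JSON, they can be reloaded later, for example to pick the best-performing model. A minimal sketch, assuming model_results.json was produced by save_results above:

import json

# Load the saved results and report the model with the highest test accuracy
with open("model_results.json") as f:
    results = json.load(f)

best_model = max(results, key=lambda name: results[name]["accuracy"])
print(f"Best model: {best_model}")
print(f"Accuracy:   {results[best_model]['accuracy']}")
print(f"Params:     {results[best_model]['best_params']}")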

Example Output (JSON Format)

{
    "LightGBM": {
        "best_params": {
            "num_leaves": 31,
            "learning_rate": 0.05,
            "n_estimators": 100
        },
        "accuracy": 0.9667
    },
    "XGBoost": {
        "best_params": {
            "n_estimators": 200,
            "max_depth": 6,
            "learning_rate": 0.1
        },
        "accuracy": 0.9667
    },
    "CatBoost": {
        "best_params": {
            "iterations": 200,
            "learning_rate": 0.1,
            "depth": 6
        },
        "accuracy": 0.9667
    },
    "AdaBoost": {
        "best_params": {
            "n_estimators": 100,
            "learning_rate": 1.0
        },
        "accuracy": 0.9333
    }
}

Customization Options

  1. Parameter Grids:

    • Update the self.param_grids dictionary with additional parameters or ranges.
  2. Scoring Metrics:

    • Change scoring='accuracy' in fit_and_tune to another metric, e.g. f1_macro or roc_auc_ovr for multiclass data.
  3. Ensemble Methods:

    • Add more models or custom ensembles as needed (a combined sketch follows this list).
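
As one illustration of these options, the sketch below extends the XGBoost grid and registers an extra scikit-learn ensemble. The added grid values and the choice of GradientBoostingClassifier are illustrative, not part of the pipeline above:

from sklearn.ensemble import GradientBoostingClassifier

pipeline = MLBoostPipeline(random_state=42, n_iter=10, cv=3)

# 1. Extend an existing parameter grid (values are illustrative)
pipeline.param_grids["XGBoost"].update({
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
})

# 2. Register an additional model together with its own grid
pipeline.models["GradientBoosting"] = GradientBoostingClassifier(random_state=42)
pipeline.param_grids["GradientBoosting"] = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.05, 0.1, 0.2]
}

# 3. The scoring metric is hardcoded as scoring='accuracy' in fit_and_tune;
#    a natural refactor is to accept it as a constructor argument.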
