8 Model Selection & Hyperparameter Tuning
Finding the best model and its optimal parameters is crucial for performance.
8.1 The Workflow
- Choose candidate algorithms (Random Forest, SVM, etc.)
- Define hyperparameter search space
- Use cross-validation to evaluate each combination
- Select best model + parameters
- Evaluate on test set (once!)
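Compressed into code, the workflow looks roughly like the sketch below (it assumes the X_train/X_test split created in the next section; the candidate model, grid, and variable names are placeholders). The rest of the chapter expands each step.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Steps 1-2: pick a candidate algorithm and define a (placeholder) search space
candidate = RandomForestClassifier(random_state=42)
space = {'n_estimators': [100, 200], 'max_depth': [5, 10, None]}
# Steps 3-4: cross-validation evaluates every combination and selects the best
search = GridSearchCV(candidate, space, cv=5, scoring='accuracy', n_jobs=-1)
search.fit(X_train, y_train)
# Step 5: touch the test set exactly once, at the very end
print(search.best_params_, search.best_estimator_.score(X_test, y_test))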
8.2 Sample Dataset
For this chapter, we’ll use a synthetic classification dataset to demonstrate hyperparameter tuning techniques. This controlled dataset allows us to focus on the tuning process itself without getting distracted by data cleaning or feature engineering. The principles you learn here apply to any real-world dataset.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Create classification dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42
)
# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Class distribution: {pd.Series(y_train).value_counts().to_dict()}")
8.3 Grid Search
Concept: Exhaustively try all combinations of hyperparameters.
8.3.1 Basic Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
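# 3 * 4 * 3 * 3 = 108 combinations; with cv=5 this means 540 model fits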
# Create model
rf = RandomForestClassifier(random_state=42)
# Grid search with cross-validation
grid_search = GridSearchCV(
estimator=rf,
param_grid=param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1,
verbose=1
)
# Fit (this will take a moment)
print("Starting grid search...")
grid_search.fit(X_train, y_train)
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")8.3.2 Analyze Grid Search Results
# Convert results to DataFrame
results_df = pd.DataFrame(grid_search.cv_results_)
# Show top 10 configurations
top_results = results_df.nsmallest(10, 'rank_test_score')[
['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
]
print("Top 10 configurations:")
print(top_results.to_string(index=False))
# Evaluate on test set
test_score = grid_search.best_estimator_.score(X_test, y_test)
print(f"\nTest set accuracy: {test_score:.3f}")8.3.3 Visualize Grid Search
import matplotlib.pyplot as plt
# Plot scores for specific hyperparameter
# Group by max_depth
depth_scores = results_df.groupby(
results_df['params'].apply(lambda x: str(x['max_depth']))  # str() so the None group is not dropped
)['mean_test_score'].agg(['mean', 'std'])
# Put depths in increasing order, with None last
depth_scores = depth_scores.reindex(['5', '10', '15', 'None'])
plt.figure(figsize=(10, 6))
plt.errorbar(
depth_scores.index.astype(str),
depth_scores['mean'],
yerr=depth_scores['std'],
marker='o',
linewidth=2,
markersize=8,
capsize=5
)
plt.xlabel('max_depth')
plt.ylabel('Mean CV Accuracy')
plt.title('Grid Search: max_depth vs Accuracy')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
8.4 Randomized Search
Concept: Sample random combinations (faster than grid search).
Use when: Large search space or limited time.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
# Define distributions to sample from
param_distributions = {
'n_estimators': randint(50, 300),  # randint(low, high): integers in [50, 300)
'max_depth': [5, 10, 15, 20, None],
'min_samples_split': randint(2, 20),
'min_samples_leaf': randint(1, 10),
'max_features': uniform(0.1, 0.9)  # uniform(loc, scale): floats in [0.1, 1.0]
}
# Randomized search
random_search = RandomizedSearchCV(
estimator=RandomForestClassifier(random_state=42),
param_distributions=param_distributions,
n_iter=50, # Number of random combinations to try
cv=5,
scoring='accuracy',
n_jobs=-1,
verbose=1,
random_state=42
)
print("Starting randomized search...")
random_search.fit(X_train, y_train)
print(f"\nBest parameters: {random_search.best_params_}")
print(f"Best CV score: {random_search.best_score_:.3f}")
# Test set
test_score_rs = random_search.best_estimator_.score(X_test, y_test)
print(f"Test set accuracy: {test_score_rs:.3f}")8.5 Comparing Multiple Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
# Define models and their parameter grids
models = {
'Logistic Regression': {
'model': LogisticRegression(random_state=42, max_iter=1000),
'params': {
'C': [0.01, 0.1, 1, 10],
'penalty': ['l1', 'l2'],
'solver': ['liblinear']
}
},
'Random Forest': {
'model': RandomForestClassifier(random_state=42),
'params': {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5]
}
},
'SVM': {
'model': SVC(random_state=42),
'params': {
'C': [0.1, 1, 10],
'kernel': ['rbf', 'linear'],
'gamma': ['scale', 'auto']
}
},
'Gradient Boosting': {
'model': GradientBoostingClassifier(random_state=42),
'params': {
'n_estimators': [50, 100],
'learning_rate': [0.01, 0.1, 0.5],
'max_depth': [3, 5, 7]
}
}
}
# Search for best model
best_models = {}
for name, model_dict in models.items():
    print(f"\nTuning {name}...")
    grid = GridSearchCV(
        estimator=model_dict['model'],
        param_grid=model_dict['params'],
        cv=3,  # Faster for comparison
        scoring='accuracy',
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    best_models[name] = {
        'estimator': grid.best_estimator_,
        'cv_score': grid.best_score_,
        'params': grid.best_params_
    }
    print(f" Best CV score: {grid.best_score_:.3f}")
8.5.1 Compare Best Models
# Evaluate all on test set
comparison = []
for name, model_info in best_models.items():
    test_score = model_info['estimator'].score(X_test, y_test)
    comparison.append({
        'Model': name,
        'CV Score': model_info['cv_score'],
        'Test Score': test_score,
        'Best Params': str(model_info['params'])
    })
comparison_df = pd.DataFrame(comparison).sort_values('Test Score', ascending=False)
print("\nModel Comparison:")
print(comparison_df.to_string(index=False))
# Visualize
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(comparison_df))
width = 0.35
ax.bar(x - width/2, comparison_df['CV Score'], width, label='CV Score', alpha=0.8)
ax.bar(x + width/2, comparison_df['Test Score'], width, label='Test Score', alpha=0.8)
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_title('Model Comparison: CV vs Test Score')
ax.set_xticks(x)
ax.set_xticklabels(comparison_df['Model'], rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()
8.6 Advanced: Nested Cross-Validation
Problem: Using the same data both to tune hyperparameters and to estimate performance gives an optimistically biased estimate.
Solution: Outer CV for evaluation, inner CV for tuning.
from sklearn.model_selection import cross_val_score
# Nested CV: Outer loop evaluates, inner loop tunes
def nested_cv(model, param_grid, X, y, outer_cv=5, inner_cv=3):
    """Perform nested cross-validation."""
    # Inner loop: GridSearchCV tunes hyperparameters within each outer training fold
    inner_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=inner_cv,
        scoring='accuracy'
    )
    # Outer loop: each outer fold is held out and the tuned model is scored on it
    outer_scores = cross_val_score(
        inner_search, X, y, cv=outer_cv, scoring='accuracy'
    )
    return outer_scores
# Run nested CV
param_grid_rf = {
'n_estimators': [50, 100],
'max_depth': [5, 10, 15]
}
nested_scores = nested_cv(
RandomForestClassifier(random_state=42),
param_grid_rf,
X, y,
outer_cv=5,
inner_cv=3
)
print(f"Nested CV scores: {nested_scores}")
print(f"Mean: {nested_scores.mean():.3f} (+/- {nested_scores.std():.3f})")8.7 Common Hyperparameters by Algorithm
8.7.1 Random Forest
- n_estimators: Number of trees (50-500)
- max_depth: Tree depth (3-20 or None)
- min_samples_split: Min samples to split (2-20)
- min_samples_leaf: Min samples in leaf (1-10)
- max_features: Features per split ('sqrt', 'log2', or float)
8.7.2 Logistic Regression
- C: Inverse regularization strength (0.001-100)
- penalty: 'l1', 'l2', or 'elasticnet'
- solver: 'liblinear', 'lbfgs', or 'saga'
8.7.3 SVM
- C: Regularization (0.1-100)
- kernel: 'linear', 'rbf', or 'poly'
- gamma: Kernel coefficient ('scale', 'auto', or float)
8.7.4 Gradient Boosting
- n_estimators: Number of boosting stages (50-500)
- learning_rate: Shrinks contribution (0.01-0.5)
- max_depth: Tree depth (3-10)
- subsample: Fraction of samples (0.5-1.0)
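These reference ranges translate directly into search spaces. As a quick illustration (values taken from the Gradient Boosting list above, the dictionary name and exact bounds are arbitrary choices), the following could be passed to RandomizedSearchCV:
gb_param_distributions = {
    'n_estimators': randint(50, 501),      # integers 50-500
    'learning_rate': uniform(0.01, 0.49),  # floats in [0.01, 0.5]
    'max_depth': randint(3, 11),           # integers 3-10
    'subsample': uniform(0.5, 0.5)         # floats in [0.5, 1.0]
}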
8.8 Tips for Efficient Tuning
- Start coarse, then refine (see the sketch after this list)
  - First: Wide range, few values
  - Then: Narrow range around best value
- Use RandomizedSearchCV for initial exploration
  - Faster than GridSearchCV
  - Good for large search spaces
- Monitor for overfitting
  - Compare train, CV, and test scores
  - Large gaps indicate overfitting
- Use appropriate CV folds
  - 5-fold is standard
  - 10-fold for small datasets
  - 3-fold for large datasets (faster)
- Parallelize with n_jobs=-1
  - Uses all CPU cores
  - Significant speedup
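Here is a minimal sketch of the coarse-then-refine tip, reusing the chapter's dataset and imports; the specific ranges, n_iter, step sizes, and variable names (coarse, refine) are illustrative choices, not fixed rules.
# Stage 1 (coarse): randomized search over wide ranges
coarse = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions={
        'n_estimators': randint(50, 500),
        'max_depth': randint(3, 20)
    },
    n_iter=20, cv=3, scoring='accuracy', n_jobs=-1, random_state=42
)
coarse.fit(X_train, y_train)
best_n = coarse.best_params_['n_estimators']
best_depth = coarse.best_params_['max_depth']

# Stage 2 (refine): small grid centered on the best coarse values
refine = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid={
        'n_estimators': sorted(set([max(50, best_n - 50), best_n, best_n + 50])),
        'max_depth': sorted(set([max(3, best_depth - 2), best_depth, best_depth + 2]))
    },
    cv=5, scoring='accuracy', n_jobs=-1
)
refine.fit(X_train, y_train)
print(f"Refined best params: {refine.best_params_}")
print(f"Refined CV score: {refine.best_score_:.3f}")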
8.9 Scoring Metrics for Different Problems
# For classification
scoring_options = [
'accuracy', # Overall correctness
'precision', # Positive predictive value
'recall', # Sensitivity
'f1', # Harmonic mean of precision/recall
'roc_auc', # Area under ROC curve
]
# For regression
scoring_options_regression = [
'neg_mean_absolute_error', # MAE, negated so that higher is better
'neg_mean_squared_error', # MSE
'neg_root_mean_squared_error', # RMSE
'r2', # R² score
]
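# Pass any of these strings to GridSearchCV / RandomizedSearchCV via the
# scoring parameter, e.g. scoring='f1' or scoring='neg_mean_squared_error'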
print("Classification scoring options:", scoring_options)
print("\nRegression scoring options:", scoring_options_regression)8.10 Summary
- Grid Search: Exhaustive, guaranteed to find best in grid
- Randomized Search: Faster, good for large spaces
- Nested CV: Unbiased performance estimate
- Compare multiple algorithms before deep tuning
- Start simple: Baseline → tune → ensemble
- Watch for overfitting: CV score >> test score is bad
- Use appropriate scoring metric for your problem
Next: Real-world considerations and production tips!