11 Practice Exercises
This chapter contains hands-on exercises covering chapters 1-10. Each exercise includes the problem, starter code, and a hidden solution that you can reveal to check your work.
11.1 Exercise 1: Data Loading & Exploration
Task: Load the Iris flower dataset from sklearn, convert it to a pandas DataFrame, and perform basic exploration.
Requirements:
1. Load the Iris dataset
2. Create a DataFrame with proper column names
3. Display the first 5 rows
4. Check for missing values
5. Display basic statistics
# Your code here
# Hint: from sklearn.datasets import load_iris
Show Solution
from sklearn.datasets import load_iris
import pandas as pd
# Load dataset
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
# Display first 5 rows
print("First 5 rows:")
print(df.head())
# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())
# Basic statistics
print("\nBasic statistics:")
print(df.describe())
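A shortcut worth knowing (not required by the exercise): newer scikit-learn versions can return the data as a pandas DataFrame directly, which replaces the manual `pd.DataFrame(...)` construction above. A minimal sketch, assuming scikit-learn ≥ 0.23:

```python
from sklearn.datasets import load_iris

# as_frame=True returns pandas objects instead of NumPy arrays
iris_frame = load_iris(as_frame=True)
df_alt = iris_frame.frame  # features plus a 'target' column in one DataFrame
print(df_alt.head())
```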
11.2 Exercise 2: Train-Test Split & Scaling
Task: Using the Iris dataset, split the data and apply standard scaling.
Requirements:
1. Split data into 70% train, 30% test (random_state=42)
2. Apply StandardScaler to features only (not target)
3. Print shapes of all splits
4. Verify that scaling worked (mean≈0, std≈1)
# Your code here
# Hint: Use train_test_split and StandardScaler
Show Solution
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
# Prepare data (reusing the iris object loaded in Exercise 1)
X = iris.data
y = iris.target
# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Print shapes
print(f"X_train shape: {X_train_scaled.shape}")
print(f"X_test shape: {X_test_scaled.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
# Verify scaling
print(f"\nTrain set - Mean: {X_train_scaled.mean():.4f}, Std: {X_train_scaled.std():.4f}")
print(f"Test set - Mean: {X_test_scaled.mean():.4f}, Std: {X_test_scaled.std():.4f}")11.3 Exercise 3: Logistic Regression Classifier
11.3 Exercise 3: Logistic Regression Classifier
Task: Train a logistic regression model and evaluate its performance.
Requirements:
1. Train a LogisticRegression model (max_iter=1000)
2. Make predictions on test set
3. Calculate accuracy, precision, recall, and F1-score
4. Display confusion matrix
# Your code here
# Hint: Use LogisticRegression and classification metrics
Show Solution
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
# Train model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)
# Predictions
y_pred = log_reg.predict(X_test_scaled)
# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-Score: {f1:.3f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))
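Beyond hard class labels, logistic regression also exposes per-class probabilities via `predict_proba`, which is useful when you care about confidence rather than just the argmax. A short sketch reusing `log_reg` and `y_pred` from the solution:

```python
import numpy as np

# One probability per class (setosa, versicolor, virginica) for each sample
probs = log_reg.predict_proba(X_test_scaled[:5])
for row, pred in zip(np.round(probs, 3), y_pred[:5]):
    print(f"probabilities: {row} -> predicted class: {pred}")
```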
11.4 Exercise 4: Cross-Validation Comparison
Task: Compare multiple classifiers using cross-validation.
Requirements:
1. Compare: LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
2. Use 5-fold cross-validation
3. Calculate mean and std of accuracy for each model
4. Visualize results with a bar plot showing error bars
# Your code here
# Hint: Use cross_val_score
Show Solution
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
}
# Cross-validation
cv_results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    cv_results[name] = {
        'mean': scores.mean(),
        'std': scores.std()
    }
    print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")
# Visualization
fig, ax = plt.subplots(figsize=(10, 6))
names = list(cv_results.keys())
means = [cv_results[name]['mean'] for name in names]
stds = [cv_results[name]['std'] for name in names]
ax.bar(names, means, yerr=stds, capsize=5, alpha=0.7, color=['blue', 'green', 'red'])
ax.set_ylabel('Accuracy')
ax.set_title('Model Comparison with 5-Fold Cross-Validation')
ax.set_ylim([0.8, 1.0])
ax.axhline(y=0.9, color='gray', linestyle='--', alpha=0.5)
plt.xticks(rotation=15, ha='right')
plt.tight_layout()
plt.show()
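`cross_val_score` reports a single metric; if you want several metrics from the same folds, `cross_validate` accepts a list of scorers. A sketch reusing the `models` dict from the solution:

```python
from sklearn.model_selection import cross_validate

# One pass over the folds, two metrics
cv = cross_validate(models['Random Forest'], X_train_scaled, y_train,
                    cv=5, scoring=['accuracy', 'f1_weighted'])
print(f"accuracy:    {cv['test_accuracy'].mean():.3f}")
print(f"f1_weighted: {cv['test_f1_weighted'].mean():.3f}")
```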
11.5 Exercise 5: Hyperparameter Tuning with GridSearchCV
Task: Tune a Random Forest classifier using GridSearchCV.
Requirements:
1. Tune: n_estimators [50, 100, 200], max_depth [3, 5, 7, 10], min_samples_split [2, 5, 10]
2. Use 3-fold CV
3. Find best parameters and best score
4. Evaluate best model on test set
# Your code here
# Hint: Use GridSearchCV
Show Solution
from sklearn.model_selection import GridSearchCV
# Parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10]
}
# GridSearch
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1
)
print("Running GridSearchCV...")
grid_search.fit(X_train_scaled, y_train)
# Best parameters
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.3f}")
# Evaluate on test set
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test_scaled, y_test)
print(f"Test accuracy: {test_accuracy:.3f}")11.6 Exercise 6: K-Means Clustering
11.6 Exercise 6: K-Means Clustering
Task: Apply K-Means clustering to the Iris dataset (unsupervised).
Requirements:
1. Use K-Means with k=3 clusters
2. Compare predicted clusters to actual labels using adjusted_rand_score
3. Visualize clusters using first 2 principal components
4. Color points by predicted clusters
# Your code here
# Hint: Use KMeans and PCA for visualization
Show Solution
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt
# Standardize full dataset
X_scaled = StandardScaler().fit_transform(iris.data)
# K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
# Compare to actual labels
ari = adjusted_rand_score(iris.target, clusters)
print(f"Adjusted Rand Index: {ari:.3f}")
# PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# Predicted clusters
ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.6)
# Project the 4-D cluster centers into the same 2-D PCA space before plotting
# (taking the first two raw coordinates would not match the PCA axes)
centers_pca = pca.transform(kmeans.cluster_centers_)
ax1.scatter(centers_pca[:, 0], centers_pca[:, 1],
            c='red', marker='X', s=200, edgecolors='black', label='Centroids')
ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
ax1.set_title('K-Means Clustering (Predicted)')
ax1.legend()
# Actual labels
scatter = ax2.scatter(X_pca[:, 0], X_pca[:, 1], c=iris.target, cmap='viridis', alpha=0.6)
ax2.set_xlabel('First Principal Component')
ax2.set_ylabel('Second Principal Component')
ax2.set_title('Actual Iris Species')
plt.colorbar(scatter, ax=ax2)
plt.tight_layout()
plt.show()
print(f"\nExplained variance by 2 components: {pca.explained_variance_ratio_.sum():.2%}")11.7 Exercise 7: Feature Engineering & Pipelines
11.7 Exercise 7: Feature Engineering & Pipelines
Task: Create a complete ML pipeline with feature engineering.
Requirements:
1. Load the California housing dataset (the Boston housing dataset was removed from scikit-learn in version 1.2)
2. Create new features: total_rooms_per_household, bedrooms_per_room (if applicable)
3. Build a pipeline: StandardScaler → LinearRegression
4. Evaluate using cross_val_score with neg_mean_squared_error
5. Calculate RMSE and R²
# Your code here
# Hint: Use Pipeline and make_pipeline
Show Solution
from sklearn.datasets import fetch_california_housing
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Load dataset
housing = fetch_california_housing()
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = housing.target
# Feature engineering
# Note: in this dataset AveRooms and AveOccup are already per-household
# averages, so the first and third features are aliases of existing columns;
# bedrooms_per_room is the genuinely new ratio feature
X['rooms_per_household'] = X['AveRooms']
X['bedrooms_per_room'] = X['AveBedrms'] / X['AveRooms']
X['population_per_household'] = X['AveOccup']
print("Features after engineering:")
print(X.columns.tolist())
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])
# Train
pipeline.fit(X_train, y_train)
# Cross-validation
cv_scores = cross_val_score(
    pipeline, X_train, y_train,
    cv=5, scoring='neg_mean_squared_error'
)
cv_rmse = np.sqrt(-cv_scores)
print(f"\nCV RMSE: {cv_rmse.mean():.3f} (+/- {cv_rmse.std():.3f})")
# Test evaluation
y_pred = pipeline.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)
print(f"\nTest RMSE: {test_rmse:.3f}")
print(f"Test R²: {test_r2:.3f}")11.8 Exercise 8: Handling Imbalanced Data
11.8 Exercise 8: Handling Imbalanced Data
Task: Create an imbalanced dataset and apply techniques to handle it.
Requirements:
1. Create imbalanced binary classification dataset (class ratio 1:9)
2. Train baseline model and note performance
3. Apply SMOTE to balance the dataset
4. Train new model on balanced data
5. Compare precision, recall, F1 for minority class
# Your code here
# Hint: Use make_classification with weights parameter
# Note: You'll need to install imbalanced-learn: pip install imbalanced-learn
Show Solution
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
# Create imbalanced dataset
X_imb, y_imb = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    weights=[0.9, 0.1],  # 90% class 0, 10% class 1
    random_state=42
)
print(f"Class distribution: {np.bincount(y_imb)}")
# Split
X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(
    X_imb, y_imb, test_size=0.3, random_state=42, stratify=y_imb
)
# Baseline model
baseline_model = LogisticRegression(random_state=42)
baseline_model.fit(X_train_imb, y_train_imb)
y_pred_baseline = baseline_model.predict(X_test_imb)
print("\n=== Baseline Model (Imbalanced) ===")
print(classification_report(y_test_imb, y_pred_baseline))
# Apply SMOTE (if available)
try:
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_imb, y_train_imb)
    print(f"\nAfter SMOTE: {np.bincount(y_train_balanced)}")
    # Train on balanced data
    balanced_model = LogisticRegression(random_state=42)
    balanced_model.fit(X_train_balanced, y_train_balanced)
    y_pred_balanced = balanced_model.predict(X_test_imb)
    print("\n=== Model with SMOTE (Balanced) ===")
    print(classification_report(y_test_imb, y_pred_balanced))
except ImportError:
    print("\nSMOTE not available. Install with: pip install imbalanced-learn")
    print("Alternative: Use class_weight='balanced' in LogisticRegression")
    balanced_model = LogisticRegression(class_weight='balanced', random_state=42)
    balanced_model.fit(X_train_imb, y_train_imb)
    y_pred_balanced = balanced_model.predict(X_test_imb)
    print("\n=== Model with class_weight='balanced' ===")
    print(classification_report(y_test_imb, y_pred_balanced))
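Requirement 5 asks you to compare the minority class specifically; the classification reports above contain those numbers, but you can also pull them out programmatically with `precision_recall_fscore_support`. A sketch reusing the predictions from both models:

```python
from sklearn.metrics import precision_recall_fscore_support

# Metrics for the minority class (label 1) only
for name, preds in [("baseline", y_pred_baseline), ("balanced", y_pred_balanced)]:
    p, r, f, _ = precision_recall_fscore_support(
        y_test_imb, preds, labels=[1], average=None
    )
    print(f"{name}: precision={p[0]:.3f} recall={r[0]:.3f} f1={f[0]:.3f}")
```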
11.9 Exercise 9: Feature Importance & Selection
Task: Identify and select the most important features.
Requirements:
1. Use the Iris dataset with all features
2. Train a Random Forest and extract feature importances
3. Plot feature importances (sorted)
4. Select top 2 features and retrain model
5. Compare performance: all features vs top 2 features
# Your code here
# Hint: Use feature_importances_ attribute
Show Solution
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
# Use Iris dataset (already loaded from Exercise 1)
# Create fresh train/test split for this exercise
X_iris = iris.data
y_iris = iris.target
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, test_size=0.3, random_state=42, stratify=y_iris
)
# Scale
scaler_iris = StandardScaler()
X_train_iris_scaled = scaler_iris.fit_transform(X_train_iris)
X_test_iris_scaled = scaler_iris.transform(X_test_iris)
# Train Random Forest with all features
rf_full = RandomForestClassifier(n_estimators=100, random_state=42)
rf_full.fit(X_train_iris_scaled, y_train_iris)
# Get feature importances
importances = rf_full.feature_importances_
feature_names = iris.feature_names
indices = np.argsort(importances)[::-1]
# Plot
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances[indices])
plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=45, ha='right')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance - Random Forest (Iris Dataset)')
plt.tight_layout()
plt.show()
# Print all features
print("Feature importances:")
for i in range(len(indices)):
    print(f"{i+1}. {feature_names[indices[i]]}: {importances[indices[i]]:.4f}")
# Select top 2 features
top_2_indices = indices[:2]
X_train_top2 = X_train_iris_scaled[:, top_2_indices]
X_test_top2 = X_test_iris_scaled[:, top_2_indices]
# Train model with top 2 features
rf_top2 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_top2.fit(X_train_top2, y_train_iris)
# Compare performance
acc_full = rf_full.score(X_test_iris_scaled, y_test_iris)
acc_top2 = rf_top2.score(X_test_top2, y_test_iris)
print(f"\nAccuracy with all features: {acc_full:.3f}")
print(f"Accuracy with top 2 features: {acc_top2:.3f}")
print(f"Difference: {acc_full - acc_top2:.3f}")11.10 Exercise 10: Complete ML Project
11.10 Exercise 10: Complete ML Project
Task: Build a complete ML solution from scratch.
Requirements:
1. Load digits dataset (image classification)
2. Perform EDA: check shapes, visualize samples
3. Split data and apply preprocessing
4. Train multiple models and compare
5. Select best model and tune hyperparameters
6. Create final evaluation report with confusion matrix and classification report
7. Visualize some predictions (correct and incorrect)
# Your code here
# This is a comprehensive exercise - take your time!
Show Solution
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
print("=== DIGITS CLASSIFICATION PROJECT ===\n")
# 1. Load data
digits = load_digits()
X = digits.data
y = digits.target
print(f"Dataset shape: {X.shape}")
print(f"Number of classes: {len(np.unique(y))}")
print(f"Image size: {digits.images[0].shape}")
# 2. EDA - Visualize samples
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='gray')
    ax.set_title(f'Label: {digits.target[i]}')
    ax.axis('off')
plt.suptitle('Sample Digits from Dataset')
plt.tight_layout()
plt.show()
# 3. Split and preprocess
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 4. Compare models
print("\n=== Model Comparison ===")
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42)
}
best_score = 0
best_model_name = None
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    score = model.score(X_test_scaled, y_test)
    print(f"{name}: {score:.3f}")
    if score > best_score:
        best_score = score
        best_model_name = name
print(f"\nBest model: {best_model_name}")
# 5. Tune the best model (here the SVM, which typically wins the comparison above)
print("\n=== Hyperparameter Tuning (SVM) ===")
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1],
    'kernel': ['rbf']
}
grid = GridSearchCV(SVC(random_state=42), param_grid, cv=3, n_jobs=-1, verbose=1)
grid.fit(X_train_scaled, y_train)
print(f"\nBest parameters: {grid.best_params_}")
print(f"Best CV score: {grid.best_score_:.3f}")
# 6. Final evaluation
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test_scaled)
print("\n=== Final Evaluation ===")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=digits.target_names)
fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(ax=ax, cmap='Blues', values_format='d')
plt.title('Confusion Matrix - Digits Classification')
plt.tight_layout()
plt.show()
# 7. Visualize predictions
print("\n=== Sample Predictions ===")
sample_indices = np.random.choice(len(X_test), 10, replace=False)
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for idx, ax in zip(sample_indices, axes.flat):
    image = X_test[idx].reshape(8, 8)
    pred = y_pred[idx]
    true = y_test[idx]
    ax.imshow(image, cmap='gray')
    color = 'green' if pred == true else 'red'
    ax.set_title(f'True: {true}, Pred: {pred}', color=color)
    ax.axis('off')
plt.suptitle('Sample Predictions (Green=Correct, Red=Incorrect)')
plt.tight_layout()
plt.show()
# Find misclassified examples
misclassified = np.where(y_pred != y_test)[0]
print(f"\nNumber of misclassified samples: {len(misclassified)} / {len(y_test)}")11.11 Summary
11.11 Summary
You’ve completed exercises covering:
- ✅ Data loading and exploration
- ✅ Data preprocessing and scaling
- ✅ Classification and regression models
- ✅ Cross-validation and model comparison
- ✅ Hyperparameter tuning
- ✅ Clustering and dimensionality reduction
- ✅ Feature engineering and pipelines
- ✅ Handling imbalanced data
- ✅ Feature importance and selection
- ✅ Complete end-to-end ML project
Next Steps:
- Try these exercises with different datasets
- Modify parameters and observe the effects
- Combine techniques from multiple exercises
- Apply these skills to your own projects
Challenge: Create your own dataset or find one on Kaggle and build a complete ML pipeline from scratch!