11  Practice Exercises

This chapter contains hands-on exercises covering chapters 1-10. Each exercise includes the problem, starter code, and a hidden solution that you can reveal to check your work.

11.1 Exercise 1: Data Loading & Exploration

Task: Load the Iris flower dataset from sklearn, convert it to a pandas DataFrame, and perform basic exploration.

Requirements: 1. Load the Iris dataset 2. Create a DataFrame with proper column names 3. Display the first 5 rows 4. Check for missing values 5. Display basic statistics

# Your code here
# Hint: from sklearn.datasets import load_iris
Show Solution
from sklearn.datasets import load_iris
import pandas as pd

# Fetch the bundled Iris data and wrap the measurements in a DataFrame,
# using the dataset's own feature names as the column labels.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target  # append the class labels as an extra column

# Compute the three exploration views up front, then print them in order:
# a 5-row preview, per-column null counts, and summary statistics.
preview = df.head(5)
null_counts = df.isna().sum()
summary = df.describe()

print("First 5 rows:")
print(preview)

print("\nMissing values:")
print(null_counts)

print("\nBasic statistics:")
print(summary)

11.2 Exercise 2: Train-Test Split & Scaling

Task: Using the Iris dataset, split the data and apply standard scaling.

Requirements: 1. Split data into 70% train, 30% test (random_state=42) 2. Apply StandardScaler to features only (not target) 3. Print shapes of all splits 4. Verify that scaling worked (mean≈0, std≈1)

# Your code here
# Hint: Use train_test_split and StandardScaler
Show Solution
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Features and labels come straight from the Iris bunch loaded in Exercise 1.
X, y = iris.data, iris.target

# Hold out 30% of the rows for testing, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions. The target is never scaled.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the shape of every split.
for split_name, split in (
    ("X_train", X_train_scaled),
    ("X_test", X_test_scaled),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split.shape}")

# Verify scaling: the training set should show mean ~0 and std ~1; the test
# set is only approximately so because it reuses the training statistics.
train_mean, train_std = X_train_scaled.mean(), X_train_scaled.std()
test_mean, test_std = X_test_scaled.mean(), X_test_scaled.std()
print(f"\nTrain set - Mean: {train_mean:.4f}, Std: {train_std:.4f}")
print(f"Test set - Mean: {test_mean:.4f}, Std: {test_std:.4f}")

11.3 Exercise 3: Logistic Regression Classifier

Task: Train a logistic regression model and evaluate its performance.

Requirements: 1. Train a LogisticRegression model (max_iter=1000) 2. Make predictions on test set 3. Calculate accuracy, precision, recall, and F1-score 4. Display confusion matrix

# Your code here
# Hint: Use LogisticRegression and classification metrics
Show Solution
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

# Fit the classifier on the standardized training features and predict the
# held-out test rows.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)
y_pred = log_reg.predict(X_test_scaled)

# Headline metrics. Precision, recall and F1 are averaged across classes
# with 'weighted' averaging because this is a multi-class problem.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{metric_name}: {metric_value:.3f}")

# Per-class breakdown: raw confusion counts, then the full text report
# labelled with the species names.
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=iris.target_names)

print("\nConfusion Matrix:")
print(conf_matrix)

print("\nClassification Report:")
print(report)

11.4 Exercise 4: Cross-Validation Comparison

Task: Compare multiple classifiers using cross-validation.

Requirements: 1. Compare: LogisticRegression, DecisionTreeClassifier, RandomForestClassifier 2. Use 5-fold cross-validation 3. Calculate mean and std of accuracy for each model 4. Visualize results with a bar plot showing error bars

# Your code here
# Hint: Use cross_val_score
Show Solution
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Candidate classifiers, keyed by the label used in the printout and plot.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
}

# Score each candidate with 5-fold cross-validation on the training split
# and keep the mean and standard deviation of the fold accuracies.
cv_results = {}
for model_name, estimator in models.items():
    fold_scores = cross_val_score(estimator, X_train_scaled, y_train, cv=5)
    mean_acc, std_acc = fold_scores.mean(), fold_scores.std()
    cv_results[model_name] = {'mean': mean_acc, 'std': std_acc}
    print(f"{model_name}: {mean_acc:.3f} (+/- {std_acc:.3f})")

# Bar chart of mean accuracy with the fold standard deviation as error bars.
names = list(cv_results)
means = [stats['mean'] for stats in cv_results.values()]
stds = [stats['std'] for stats in cv_results.values()]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    names,
    means,
    yerr=stds,
    capsize=5,
    alpha=0.7,
    color=['blue', 'green', 'red'],
)
ax.set_ylabel('Accuracy')
ax.set_title('Model Comparison with 5-Fold Cross-Validation')
ax.set_ylim([0.8, 1.0])  # zoom in so small differences stay visible
ax.axhline(y=0.9, color='gray', linestyle='--', alpha=0.5)  # 90% reference line
plt.xticks(rotation=15, ha='right')
plt.tight_layout()
plt.show()

11.5 Exercise 5: Hyperparameter Tuning with GridSearchCV

Task: Tune a Random Forest classifier using GridSearchCV.

Requirements: 1. Tune: n_estimators [50, 100, 200], max_depth [3, 5, 7, 10], min_samples_split [2, 5, 10] 2. Use 3-fold CV 3. Find best parameters and best score 4. Evaluate best model on test set

# Your code here
# Hint: Use GridSearchCV
Show Solution
from sklearn.model_selection import GridSearchCV

# Search space: 3 x 4 x 3 = 36 hyperparameter combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search with 3-fold CV, scored by accuracy, using all cores.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

print("Running GridSearchCV...")
grid_search.fit(X_train_scaled, y_train)

# Winning configuration and its cross-validated accuracy.
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print(f"\nBest parameters: {best_params}")
print(f"Best CV score: {best_cv_score:.3f}")

# Score the best estimator found by the search on the untouched test split.
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test_scaled, y_test)
print(f"Test accuracy: {test_accuracy:.3f}")

11.6 Exercise 6: K-Means Clustering

Task: Apply K-Means clustering to the Iris dataset (unsupervised).

Requirements: 1. Use K-Means with k=3 clusters 2. Compare predicted clusters to actual labels using adjusted_rand_score 3. Visualize clusters using first 2 principal components 4. Color points by predicted clusters

# Your code here
# Hint: Use KMeans and PCA for visualization
Show Solution
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

# Standardize full dataset (clustering is unsupervised, so there is no
# train/test split here and the scaler sees every row)
X_scaled = StandardScaler().fit_transform(iris.data)

# K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)

# Compare to actual labels
ari = adjusted_rand_score(iris.target, clusters)
print(f"Adjusted Rand Index: {ari:.3f}")

# PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# The centroids live in the standardized feature space, not in PCA space.
# Project them with the same fitted PCA so they land in the plane the points
# are drawn in; slicing their first two feature columns would misplace them.
centroids_pca = pca.transform(kmeans.cluster_centers_)

# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Predicted clusters
ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.6)
ax1.scatter(centroids_pca[:, 0],
            centroids_pca[:, 1],
            c='red', marker='X', s=200, edgecolors='black', label='Centroids')
ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
ax1.set_title('K-Means Clustering (Predicted)')
ax1.legend()

# Actual labels
scatter = ax2.scatter(X_pca[:, 0], X_pca[:, 1], c=iris.target, cmap='viridis', alpha=0.6)
ax2.set_xlabel('First Principal Component')
ax2.set_ylabel('Second Principal Component')
ax2.set_title('Actual Iris Species')
plt.colorbar(scatter, ax=ax2)

plt.tight_layout()
plt.show()

print(f"\nExplained variance by 2 components: {pca.explained_variance_ratio_.sum():.2%}")

11.7 Exercise 7: Feature Engineering & Pipelines

Task: Create a complete ML pipeline with feature engineering.

Requirements: 1. Load the California housing dataset (the Boston housing dataset has been removed from recent scikit-learn releases) 2. Create new features: total_rooms_per_household, bedrooms_per_room (if applicable) 3. Build a pipeline: StandardScaler → LinearRegression 4. Evaluate using cross_val_score with neg_mean_squared_error 5. Calculate RMSE and R²

# Your code here
# Hint: Use Pipeline or make_pipeline
Show Solution
from sklearn.datasets import fetch_california_housing
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load dataset
housing = fetch_california_housing()
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = housing.target

# Feature engineering
# In scikit-learn's California housing data, AveRooms, AveBedrms and AveOccup
# are already per-household averages, so "rooms per household" and
# "population per household" exist as AveRooms and AveOccup. Copying them
# under new names would only add perfectly collinear columns, so derive
# ratios that are not already present instead.
X['bedrooms_per_room'] = X['AveBedrms'] / X['AveRooms']
X['rooms_per_person'] = X['AveRooms'] / X['AveOccup']
# Population / AveOccup recovers the number of households per block group.
X['households'] = X['Population'] / X['AveOccup']

print("Features after engineering:")
print(X.columns.tolist())

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create pipeline (the scaler is refit inside every CV fold, so no
# statistics leak from validation rows into training)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Train
pipeline.fit(X_train, y_train)

# Cross-validation
# scikit-learn reports MSE negated so that "greater is better"; flip the
# sign back before taking the square root.
cv_scores = cross_val_score(
    pipeline, X_train, y_train,
    cv=5, scoring='neg_mean_squared_error'
)
cv_rmse = np.sqrt(-cv_scores)
print(f"\nCV RMSE: {cv_rmse.mean():.3f} (+/- {cv_rmse.std():.3f})")

# Test evaluation
y_pred = pipeline.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

print(f"\nTest RMSE: {test_rmse:.3f}")
print(f"Test R²: {test_r2:.3f}")

11.8 Exercise 8: Handling Imbalanced Data

Task: Create an imbalanced dataset and apply techniques to handle it.

Requirements: 1. Create imbalanced binary classification dataset (class ratio 1:9) 2. Train baseline model and note performance 3. Apply SMOTE to balance the dataset 4. Train new model on balanced data 5. Compare precision, recall, F1 for minority class

# Your code here
# Hint: Use make_classification with weights parameter
# Note: You'll need to install imbalanced-learn: pip install imbalanced-learn
Show Solution
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report

# Create imbalanced dataset
X_imb, y_imb = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    weights=[0.9, 0.1],  # 90% class 0, 10% class 1
    random_state=42
)

print(f"Class distribution: {np.bincount(y_imb)}")

# Split (stratified so the minority class keeps its share in both parts)
X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(
    X_imb, y_imb, test_size=0.3, random_state=42, stratify=y_imb
)

# Baseline model
baseline_model = LogisticRegression(random_state=42)
baseline_model.fit(X_train_imb, y_train_imb)
y_pred_baseline = baseline_model.predict(X_test_imb)

print("\n=== Baseline Model (Imbalanced) ===")
print(classification_report(y_test_imb, y_pred_baseline))

# Apply SMOTE (if available)
# Only the import is guarded: an ImportError raised later, inside the
# resampling or training code, must surface as a real error instead of being
# misreported as "SMOTE not available".
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("\nSMOTE not available. Install with: pip install imbalanced-learn")
    print("Alternative: Use class_weight='balanced' in LogisticRegression")

    # Fallback: reweight the classes instead of resampling the data.
    balanced_model = LogisticRegression(class_weight='balanced', random_state=42)
    balanced_model.fit(X_train_imb, y_train_imb)
    y_pred_balanced = balanced_model.predict(X_test_imb)

    print("\n=== Model with class_weight='balanced' ===")
    print(classification_report(y_test_imb, y_pred_balanced))
else:
    # Oversample the training split only; the test split keeps its original
    # imbalance so the evaluation reflects the real class distribution.
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_imb, y_train_imb)

    print(f"\nAfter SMOTE: {np.bincount(y_train_balanced)}")

    # Train on balanced data
    balanced_model = LogisticRegression(random_state=42)
    balanced_model.fit(X_train_balanced, y_train_balanced)
    y_pred_balanced = balanced_model.predict(X_test_imb)

    print("\n=== Model with SMOTE (Balanced) ===")
    print(classification_report(y_test_imb, y_pred_balanced))

11.9 Exercise 9: Feature Importance & Selection

Task: Identify and select the most important features.

Requirements: 1. Use the Iris dataset with all features 2. Train a Random Forest and extract feature importances 3. Plot feature importances (sorted) 4. Select top 2 features and retrain model 5. Compare performance: all features vs top 2 features

# Your code here
# Hint: Use feature_importances_ attribute
Show Solution
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Reuse the Iris bunch from Exercise 1, but build a fresh split under
# dedicated names: earlier exercises rebound X_train / X_test to other data.
X_iris, y_iris = iris.data, iris.target

X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris,
    y_iris,
    test_size=0.3,
    stratify=y_iris,
    random_state=42,
)

# Standardize using statistics learned from the training rows only.
scaler_iris = StandardScaler()
scaler_iris.fit(X_train_iris)
X_train_iris_scaled = scaler_iris.transform(X_train_iris)
X_test_iris_scaled = scaler_iris.transform(X_test_iris)

# Reference model trained on every feature.
rf_full = RandomForestClassifier(n_estimators=100, random_state=42)
rf_full.fit(X_train_iris_scaled, y_train_iris)

# Rank the features from most to least important.
importances = rf_full.feature_importances_
feature_names = iris.feature_names
indices = np.argsort(importances)[::-1]
sorted_importances = importances[indices]
sorted_names = [feature_names[i] for i in indices]

# Bar chart of the importances in ranked order.
positions = range(len(importances))
plt.figure(figsize=(10, 6))
plt.bar(positions, sorted_importances)
plt.xticks(positions, sorted_names, rotation=45, ha='right')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance - Random Forest (Iris Dataset)')
plt.tight_layout()
plt.show()

# Same ranking as text, numbered from 1.
print("Feature importances:")
for rank, feat_idx in enumerate(indices, start=1):
    print(f"{rank}. {feature_names[feat_idx]}: {importances[feat_idx]:.4f}")

# Keep only the columns of the two highest-ranked features.
top_2_indices = indices[:2]
X_train_top2 = X_train_iris_scaled[:, top_2_indices]
X_test_top2 = X_test_iris_scaled[:, top_2_indices]

# Identical forest configuration, trained on the reduced feature set.
rf_top2 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_top2.fit(X_train_top2, y_train_iris)

# Test accuracy of both models, side by side.
acc_full = rf_full.score(X_test_iris_scaled, y_test_iris)
acc_top2 = rf_top2.score(X_test_top2, y_test_iris)
accuracy_gap = acc_full - acc_top2

print(f"\nAccuracy with all features: {acc_full:.3f}")
print(f"Accuracy with top 2 features: {acc_top2:.3f}")
print(f"Difference: {accuracy_gap:.3f}")

11.10 Exercise 10: Complete ML Project

Task: Build a complete ML solution from scratch.

Requirements: 1. Load digits dataset (image classification) 2. Perform EDA: check shapes, visualize samples 3. Split data and apply preprocessing 4. Train multiple models and compare 5. Select best model and tune hyperparameters 6. Create final evaluation report with confusion matrix and classification report 7. Visualize some predictions (correct and incorrect)

# Your code here
# This is a comprehensive exercise - take your time!
Show Solution
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

print("=== DIGITS CLASSIFICATION PROJECT ===\n")

# 1. Load data: `data` holds the flattened pixel rows, `images` the same
#    samples as 2-D arrays, and `target` the labels.
digits = load_digits()
X, y = digits.data, digits.target

n_classes = len(np.unique(y))
print(f"Dataset shape: {X.shape}")
print(f"Number of classes: {n_classes}")
print(f"Image size: {digits.images[0].shape}")

# 2. EDA - show the first ten images in a 2x5 grid with their labels.
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for ax, image, label in zip(axes.flat, digits.images, digits.target):
    ax.imshow(image, cmap='gray')
    ax.set_title(f'Label: {label}')
    ax.axis('off')
plt.suptitle('Sample Digits from Dataset')
plt.tight_layout()
plt.show()

# 3. Split and preprocess: stratified 80/20 split, then standardize with
#    statistics learned from the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Compare models
# Candidates are ranked by 5-fold cross-validated accuracy on the TRAINING
# split. Ranking them by test-set accuracy would leak the test set into
# model selection and make the final test score optimistically biased.
from sklearn.model_selection import cross_val_score

print("\n=== Model Comparison ===")
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42)
}

best_score = 0
best_model_name = None

for name, model in models.items():
    # cross_val_score fits clones, so the estimators in `models` stay unfitted.
    score = cross_val_score(model, X_train_scaled, y_train, cv=5).mean()
    print(f"{name}: {score:.3f}")
    if score > best_score:
        best_score = score
        best_model_name = name

print(f"\nBest model: {best_model_name}")

# 5. Tune best model (SVM)
# The search is fixed to an RBF-kernel SVM; it sweeps the regularization
# strength C and the kernel width gamma (3 x 3 = 9 combinations).
print("\n=== Hyperparameter Tuning (SVM) ===")
param_grid = dict(
    C=[0.1, 1, 10],
    gamma=[0.001, 0.01, 0.1],
    kernel=['rbf'],
)

svm_base = SVC(random_state=42)
grid = GridSearchCV(svm_base, param_grid, cv=3, n_jobs=-1, verbose=1)
grid.fit(X_train_scaled, y_train)

tuned_params = grid.best_params_
tuned_cv_score = grid.best_score_
print(f"\nBest parameters: {tuned_params}")
print(f"Best CV score: {tuned_cv_score:.3f}")

# 6. Final evaluation
# Take the best estimator found by the grid search and score it once on
# the held-out test split.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test_scaled)

final_accuracy = accuracy_score(y_test, y_pred)
final_report = classification_report(y_test, y_pred)

print("\n=== Final Evaluation ===")
print(f"Test Accuracy: {final_accuracy:.3f}")
print("\nClassification Report:")
print(final_report)

# Confusion matrix: rows are true digits, columns are predicted digits.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=digits.target_names,
)
disp.plot(ax=ax, cmap='Blues', values_format='d')  # 'd' -> integer counts
plt.title('Confusion Matrix - Digits Classification')
plt.tight_layout()
plt.show()

# 7. Visualize predictions
print("\n=== Sample Predictions ===")

# Find misclassified (and correctly classified) examples
misclassified = np.flatnonzero(y_pred != y_test)
correct = np.flatnonzero(y_pred == y_test)

# The task asks for both correct and incorrect predictions. With a highly
# accurate model a purely random draw almost never contains a mistake, so
# reserve up to half of the panels for misclassified samples and fill the
# rest with correct ones. A seeded generator keeps the figure reproducible,
# in line with the random_state=42 used everywhere else.
rng = np.random.default_rng(42)
n_panels = 10
n_wrong = min(len(misclassified), n_panels // 2)
n_right = min(len(correct), n_panels - n_wrong)

picks = []
if n_wrong:
    picks.append(rng.choice(misclassified, size=n_wrong, replace=False))
if n_right:
    picks.append(rng.choice(correct, size=n_right, replace=False))
sample_indices = np.concatenate(picks) if picks else np.array([], dtype=int)

fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for ax in axes.flat:
    ax.axis('off')  # also blanks any panel left without a sample

for idx, ax in zip(sample_indices, axes.flat):
    # Draw the unscaled pixels; the standardized features are not meaningful
    # as an image.
    image = X_test[idx].reshape(8, 8)
    pred = y_pred[idx]
    true = y_test[idx]

    ax.imshow(image, cmap='gray')
    color = 'green' if pred == true else 'red'
    ax.set_title(f'True: {true}, Pred: {pred}', color=color)

plt.suptitle('Sample Predictions (Green=Correct, Red=Incorrect)')
plt.tight_layout()
plt.show()

print(f"\nNumber of misclassified samples: {len(misclassified)} / {len(y_test)}")

11.11 Summary

You’ve completed exercises covering:

  • ✅ Data loading and exploration
  • ✅ Data preprocessing and scaling
  • ✅ Classification and regression models
  • ✅ Cross-validation and model comparison
  • ✅ Hyperparameter tuning
  • ✅ Clustering and dimensionality reduction
  • ✅ Feature engineering and pipelines
  • ✅ Handling imbalanced data
  • ✅ Feature importance and selection
  • ✅ Complete end-to-end ML project

Next Steps: - Try these exercises with different datasets - Modify parameters and observe the effects - Combine techniques from multiple exercises - Apply these skills to your own projects

Challenge: Create your own dataset or find one on Kaggle and build a complete ML pipeline from scratch!