10  Visualization for Machine Learning

Effective visualizations help understand data, debug models, and communicate results.

10.1 Libraries

This chapter relies on three core visualization libraries: Matplotlib (the foundational plotting library), Seaborn (built on Matplotlib with prettier defaults and statistical functions), and scikit-learn’s built-in visualization tools. We’ll configure them with sensible defaults for clean, publication-ready plots.

# Core numerics and dataframes.
import numpy as np
import pandas as pd
# Plotting stack: Matplotlib primitives plus Seaborn's statistical defaults.
import matplotlib.pyplot as plt
import seaborn as sns
# Synthetic datasets, splitting, models, and metrics used throughout the chapter.
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, roc_curve, auc

# Set style: white-grid background and a 10x6-inch default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

10.2 1. Exploratory Data Analysis (EDA)

Exploratory Data Analysis is the first step in any machine learning project. Visualizing distributions, relationships, and correlations helps you understand your data’s characteristics, spot anomalies, and guide feature engineering decisions. Let’s explore the most useful EDA visualizations.

10.2.1 Distribution Plots

# Reproducible synthetic dataset: three differently-shaped distributions
# (roughly symmetric, right-skewed, and left-skewed respectively).
np.random.seed(42)
data = pd.DataFrame({
    'age': np.random.normal(40, 15, 1000),
    'income': np.random.lognormal(10, 1, 1000),
    'score': np.random.beta(5, 2, 1000) * 100
})

# One histogram per column, drawn side by side.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, col in zip(axes, data.columns):
    ax.hist(data[col], bins=30, edgecolor='black', alpha=0.7)
    ax.set_xlabel(col.capitalize())
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution of {col.capitalize()}')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

10.2.2 Box Plots (Detecting Outliers)

# Box plots expose medians, quartiles, and outliers for each feature.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, col in zip(axes, data.columns):
    ax.boxplot(data[col])
    ax.set_ylabel(col.capitalize())
    ax.set_title(f'Box Plot: {col.capitalize()}')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

10.2.3 Correlation Heatmap

# Derive a target as a noisy linear combination of the three features,
# so that each feature has a known correlation with it.
noise = np.random.normal(0, 20, len(data))
data['target'] = data['age'] * 0.5 + data['income'] / 1000 + data['score'] * 2 + noise

# Pairwise Pearson correlations between all numeric columns.
correlation = data.corr()

# Annotated heatmap with a diverging palette centered at zero correlation.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0,
            fmt='.2f', square=True, linewidths=1)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()

print("Strong correlations with target:")
print(correlation['target'].sort_values(ascending=False))

10.2.4 Pair Plot

# Pair plot: all pairwise scatter plots with per-feature KDEs on the diagonal.
# Seed the subsample so the figure is reproducible, matching the
# np.random.seed(42) convention used throughout this chapter.
sample_data = data.sample(200, random_state=42)  # Use subset for clarity

sns.pairplot(sample_data, diag_kind='kde', plot_kws={'alpha': 0.6})
plt.suptitle('Pair Plot of Features', y=1.01)
plt.tight_layout()
plt.show()

10.3 2. Feature Importance Visualization

# Turn the continuous target into a binary label: above/below its median.
X = data.drop('target', axis=1)
y = (data['target'] > data['target'].median()).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a forest; its impurity-based importances rank the features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# One row per feature, sorted from most to least important.
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})
    .sort_values('Importance', ascending=False)
)

# Horizontal bars, most important feature at the top.
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance (Random Forest)')
plt.gca().invert_yaxis()
plt.grid(alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

print(importance_df)

10.4 3. Model Performance Visualization

10.4.1 Confusion Matrix

from sklearn.metrics import ConfusionMatrixDisplay

# Hard class predictions on the held-out split.
y_pred = rf.predict(X_test)

# Raw counts and row-normalized rates, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, normalize, title in (
    (axes[0], None, 'Confusion Matrix (Counts)'),
    (axes[1], 'true', 'Confusion Matrix (Normalized)'),
):
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax, normalize=normalize)
    ax.set_title(title)

plt.tight_layout()
plt.show()

10.4.2 ROC Curve

from sklearn.metrics import RocCurveDisplay

# Predicted probability of the positive class for each test sample.
y_pred_proba = rf.predict_proba(X_test)[:, 1]

# Draw on an explicit axes: with ax=None, from_predictions creates a brand-new
# default-size figure, which would leave this 10x8 figure empty and ignore
# the requested figsize.
fig, ax = plt.subplots(figsize=(10, 8))
RocCurveDisplay.from_predictions(y_test, y_pred_proba, ax=ax)
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Area under the ROC curve: 1.0 is perfect, 0.5 is chance level.
from sklearn.metrics import roc_auc_score
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score: {auc_score:.3f}")

10.4.3 Precision-Recall Curve

from sklearn.metrics import PrecisionRecallDisplay

# Especially useful for imbalanced datasets.
# Draw on an explicit axes: with ax=None, from_predictions creates a brand-new
# default-size figure, leaving this 10x8 figure empty and the figsize unused.
fig, ax = plt.subplots(figsize=(10, 8))
PrecisionRecallDisplay.from_predictions(y_test, y_pred_proba, ax=ax)
ax.set_title('Precision-Recall Curve')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

10.4.4 Learning Curves

from sklearn.model_selection import learning_curve

# Cross-validated accuracy as a function of training-set size.
train_sizes, train_scores, val_scores = learning_curve(
    RandomForestClassifier(n_estimators=50, random_state=42),
    X, y,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy',
    n_jobs=-1,
)

# Collapse the per-fold scores to mean +/- one standard deviation.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
val_mean, val_std = val_scores.mean(axis=1), val_scores.std(axis=1)

# Lines first, then shaded bands, so the color cycle pairs line/band.
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, marker='o', label='Training score', linewidth=2)
plt.plot(train_sizes, val_mean, marker='o', label='Validation score', linewidth=2)
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1)

plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.title('Learning Curves')
plt.legend(loc='best')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

10.4.5 Validation Curves

from sklearn.model_selection import validation_curve

# Sweep tree depth and cross-validate accuracy at each setting.
param_range = range(1, 21)
train_scores, val_scores = validation_curve(
    RandomForestClassifier(n_estimators=50, random_state=42),
    X, y,
    param_name='max_depth',
    param_range=param_range,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# Collapse the per-fold scores to mean +/- one standard deviation.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
val_mean = val_scores.mean(axis=1)
val_std = val_scores.std(axis=1)

plt.figure(figsize=(10, 6))
plt.plot(param_range, train_mean, marker='o', label='Training score', linewidth=2)
plt.plot(param_range, val_mean, marker='o', label='Validation score', linewidth=2)

plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.1)
plt.fill_between(param_range, val_mean - val_std, val_mean + val_std, alpha=0.1)

plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.title('Validation Curve: max_depth vs Accuracy')
plt.grid(alpha=0.3)

# Mark the depth with the best cross-validated score, then build the legend
# once — calling legend() before axvline and again after just discards the
# first legend.
best_depth = param_range[val_mean.argmax()]
plt.axvline(x=best_depth, color='r', linestyle='--', label=f'Best depth: {best_depth}')
plt.legend(loc='best')

plt.tight_layout()
plt.show()

print(f"Optimal max_depth: {best_depth}")

10.5 4. Regression Visualizations

10.5.1 Actual vs Predicted

# Create regression dataset
X_reg, y_reg = make_regression(n_samples=500, n_features=10, noise=20, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Train model
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train_reg, y_train_reg)

# Predictions
y_pred_reg = rf_reg.predict(X_test_reg)

# Plot actual vs predicted
plt.figure(figsize=(10, 6))
plt.scatter(y_test_reg, y_pred_reg, alpha=0.6)
plt.plot([y_test_reg.min(), y_test_reg.max()],
         [y_test_reg.min(), y_test_reg.max()],
         'r--', linewidth=2, label='Perfect prediction')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Calculate R²
from sklearn.metrics import r2_score
r2 = r2_score(y_test_reg, y_pred_reg)
print(f"R² Score: {r2:.3f}")

10.5.2 Residual Plot

# Calculate residuals
residuals = y_test_reg - y_pred_reg

# Residual plot
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Residuals vs Predicted
axes[0].scatter(y_pred_reg, residuals, alpha=0.6)
axes[0].axhline(y=0, color='r', linestyle='--', linewidth=2)
axes[0].set_xlabel('Predicted Values')
axes[0].set_ylabel('Residuals')
axes[0].set_title('Residual Plot')
axes[0].grid(alpha=0.3)

# Residual distribution
axes[1].hist(residuals, bins=30, edgecolor='black', alpha=0.7)
axes[1].set_xlabel('Residuals')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Distribution of Residuals')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Mean residual: {residuals.mean():.3f} (should be close to 0)")
print(f"Std of residuals: {residuals.std():.3f}")

10.6 5. Decision Boundary Visualization (2D)

# Create 2D dataset for visualization
X_2d, y_2d = make_classification(
    n_samples=300,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    n_clusters_per_class=1,
    random_state=42
)

# Train classifier
rf_2d = RandomForestClassifier(n_estimators=100, random_state=42)
rf_2d.fit(X_2d, y_2d)

# Create mesh
h = 0.02
x_min, x_max = X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1
y_min, y_max = X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predict on mesh
Z = rf_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot
plt.figure(figsize=(12, 8))
plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap='RdYlBu', edgecolors='black', s=50)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Decision Boundary Visualization')
plt.colorbar(label='Class')
plt.tight_layout()
plt.show()

10.7 6. Clustering Visualization

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Four well-separated Gaussian blobs in 2-D.
X_clusters, y_true = make_blobs(n_samples=300, centers=4, n_features=2,
                                 cluster_std=0.60, random_state=42)

# K-Means with the true cluster count; recovered labels may be permuted.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
y_kmeans = kmeans.fit_predict(X_clusters)

plt.figure(figsize=(12, 6))

# Left: ground-truth cluster labels.
ax_true = plt.subplot(1, 2, 1)
ax_true.scatter(X_clusters[:, 0], X_clusters[:, 1], c=y_true, cmap='viridis', alpha=0.6, s=50)
ax_true.set_title('True Clusters')
ax_true.set_xlabel('Feature 1')
ax_true.set_ylabel('Feature 2')

# Right: recovered clusters with their learned centroids.
ax_fit = plt.subplot(1, 2, 2)
ax_fit.scatter(X_clusters[:, 0], X_clusters[:, 1], c=y_kmeans, cmap='viridis', alpha=0.6, s=50)
ax_fit.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
               c='red', marker='X', s=200, edgecolors='black', label='Centroids')
ax_fit.set_title('K-Means Clustering')
ax_fit.set_xlabel('Feature 1')
ax_fit.set_ylabel('Feature 2')
ax_fit.legend()

plt.tight_layout()
plt.show()

10.8 7. Dimensionality Reduction Visualization

10.8.1 PCA Visualization

from sklearn.decomposition import PCA
from sklearn.datasets import load_iris

# Iris: 150 samples, 4 features, 3 species — a 4-D dataset to project down.
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Project onto the two directions of maximum variance.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_iris)

plt.figure(figsize=(12, 6))

# Left: 2-D projection, colored by species.
plt.subplot(1, 2, 1)
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_iris, cmap='viridis', alpha=0.7, s=50)
pc1_var, pc2_var = pca.explained_variance_ratio_[:2]
plt.xlabel(f'PC1 ({pc1_var:.2%} variance)')
plt.ylabel(f'PC2 ({pc2_var:.2%} variance)')
plt.title('PCA Projection (2D)')
plt.colorbar(scatter, label='Species')

# Right: variance captured by each retained component.
plt.subplot(1, 2, 2)
ratios = pca.explained_variance_ratio_
plt.bar(range(1, len(ratios) + 1), ratios)
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Variance Explained by PCA')
plt.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.2%}")

10.8.2 t-SNE Visualization

from sklearn.manifold import TSNE

# Nonlinear 2-D embedding; perplexity ~ effective neighborhood size.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_iris)

fig, (ax_pca, ax_tsne) = plt.subplots(1, 2, figsize=(14, 6))

# Linear PCA projection, for comparison.
pts_pca = ax_pca.scatter(X_pca[:, 0], X_pca[:, 1], c=y_iris, cmap='viridis', alpha=0.7)
ax_pca.set_xlabel('PC1')
ax_pca.set_ylabel('PC2')
ax_pca.set_title('PCA')
plt.colorbar(pts_pca, ax=ax_pca, label='Species')

# t-SNE embedding of the same data.
pts_tsne = ax_tsne.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_iris, cmap='viridis', alpha=0.7)
ax_tsne.set_xlabel('t-SNE 1')
ax_tsne.set_ylabel('t-SNE 2')
ax_tsne.set_title('t-SNE')
plt.colorbar(pts_tsne, ax=ax_tsne, label='Species')

plt.tight_layout()
plt.show()

10.9 Visualization Best Practices

10.9.1 1. Choose the Right Plot

| Goal         | Plot Type                              |
|--------------|----------------------------------------|
| Distribution | Histogram, KDE, Box plot               |
| Relationships| Scatter plot, Pair plot                |
| Comparisons  | Bar chart, Box plot                    |
| Composition  | Stacked bar, Pie chart (use sparingly) |
| Trends       | Line plot, Area plot                   |

10.9.2 2. Make It Clear

  • Label axes with units
  • Add titles that explain the visualization
  • Use color wisely (colorblind-friendly palettes)
  • Include a legend whenever the plot shows multiple categories
  • Annotate important points

10.9.3 3. Avoid Common Mistakes

  • ❌ Too many colors/categories
  • ❌ Misleading axes (truncated, non-zero origin)
  • ❌ 3D plots for 2D data
  • ❌ Pie charts with too many slices
  • ❌ Cluttered plots

10.10 Summary

  • EDA visualizations: Histograms, box plots, correlation heatmaps
  • Feature importance: Bar charts of feature weights
  • Classification metrics: Confusion matrix, ROC curve, precision-recall
  • Regression metrics: Actual vs predicted, residual plots
  • Model tuning: Learning curves, validation curves
  • Dimensionality reduction: PCA, t-SNE for visualization
  • Always label axes, titles, and use appropriate color schemes

Next: Advanced techniques with XGBoost!