6 Supervised Learning - Classification
Classification predicts categorical labels (spam/not spam, disease type, customer segment).
6.1 Algorithms Covered
- Logistic Regression
- Decision Tree Classifier
- Random Forest Classifier
- Support Vector Machine (SVM)
- K-Nearest Neighbors (KNN)
6.2 Dataset: Customer Churn Prediction
For this classification chapter, we’ll work with a customer churn prediction dataset. Churn prediction is a common business problem where companies try to identify customers likely to cancel their service. We’ll create synthetic data that includes features like monthly charges, contract type, and support call frequency—all factors that influence whether a customer stays or leaves.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Create synthetic customer churn dataset
np.random.seed(42)
n_samples = 1000

data = pd.DataFrame({
    'monthly_charges': np.random.uniform(20, 150, n_samples),
    'total_charges': np.random.uniform(100, 8000, n_samples),
    'tenure_months': np.random.randint(1, 72, n_samples),
    'num_support_calls': np.random.poisson(2, n_samples),
    'contract_type': np.random.choice([0, 1, 2], n_samples),  # 0: monthly, 1: yearly, 2: 2-year
})

# Generate churn probabilities from the features, then clip to [0, 1]
# (the base rate minus the 2-year-contract discount could otherwise go negative)
churn_prob = (
    0.1
    + (data['monthly_charges'] > 100) * 0.2
    + (data['num_support_calls'] > 3) * 0.3
    + (data['tenure_months'] < 12) * 0.25
    - (data['contract_type'] == 2) * 0.2
).clip(0, 1)

data['churned'] = (np.random.random(n_samples) < churn_prob).astype(int)

print(data.head(10))
print(f"\nChurn rate: {data['churned'].mean():.1%}")
print(f"Class distribution:\n{data['churned'].value_counts()}")
6.3 Exploratory Data Analysis
Before jumping into modeling, it’s crucial to visualize our data to understand which features correlate with churn. EDA helps us identify patterns (like “customers with high support calls tend to churn”) and informs our modeling decisions. Let’s create visualizations to explore relationships between features and the churn outcome.
# Visualize relationships
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Monthly charges vs churn
axes[0, 0].hist([data[data['churned']==0]['monthly_charges'],
data[data['churned']==1]['monthly_charges']],
label=['Stayed', 'Churned'], bins=20, alpha=0.7)
axes[0, 0].set_xlabel('Monthly Charges')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_title('Monthly Charges by Churn Status')
axes[0, 0].legend()
# Tenure vs churn
axes[0, 1].hist([data[data['churned']==0]['tenure_months'],
data[data['churned']==1]['tenure_months']],
label=['Stayed', 'Churned'], bins=20, alpha=0.7)
axes[0, 1].set_xlabel('Tenure (months)')
axes[0, 1].set_ylabel('Count')
axes[0, 1].set_title('Tenure by Churn Status')
axes[0, 1].legend()
# Support calls vs churn
churn_by_calls = data.groupby('num_support_calls')['churned'].mean()
axes[1, 0].bar(churn_by_calls.index, churn_by_calls.values)
axes[1, 0].set_xlabel('Number of Support Calls')
axes[1, 0].set_ylabel('Churn Rate')
axes[1, 0].set_title('Churn Rate by Support Calls')
# Contract type vs churn
churn_by_contract = data.groupby('contract_type')['churned'].mean()
axes[1, 1].bar(['Monthly', 'Yearly', '2-Year'], churn_by_contract.values)
axes[1, 1].set_ylabel('Churn Rate')
axes[1, 1].set_title('Churn Rate by Contract Type')
plt.tight_layout()
plt.show()
6.4 Prepare Data
Now we’ll prepare our data for modeling by separating features from the target variable and splitting into training and test sets. Additionally, we’ll scale the features using StandardScaler—this is especially important for algorithms like SVM and KNN that are sensitive to feature magnitudes. Note that we use stratify=y to ensure both sets have similar churn rates.
from sklearn.preprocessing import StandardScaler
# Features and target
X = data.drop('churned', axis=1)
y = data['churned']
# Split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Scale features (important for SVM and KNN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Train churn rate: {y_train.mean():.1%}")
print(f"Test churn rate: {y_test.mean():.1%}")6.5 Classification Metrics Helper Function
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
f1_score, roc_auc_score, confusion_matrix,
classification_report)
def evaluate_classifier(name, y_true, y_pred, y_pred_proba=None):
    """Print a comprehensive evaluation and return the metrics as a dict."""
    print(f"\n{'='*50}")
    print(f"{name} Results")
    print('='*50)
    # Compute each metric once, then print and return the same values
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, y_pred_proba) if y_pred_proba is not None else None,
    }
    print(f"Accuracy:  {metrics['accuracy']:.3f}")
    print(f"Precision: {metrics['precision']:.3f}")
    print(f"Recall:    {metrics['recall']:.3f}")
    print(f"F1 Score:  {metrics['f1']:.3f}")
    if metrics['roc_auc'] is not None:
        print(f"ROC AUC:   {metrics['roc_auc']:.3f}")
    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print("            Predicted No  Predicted Yes")
    print(f"Actual No   {cm[0, 0]:12d}  {cm[0, 1]:13d}")
    print(f"Actual Yes  {cm[1, 0]:12d}  {cm[1, 1]:13d}")
    return metrics
6.6 1. Logistic Regression
Concept: Despite its name, logistic regression is a classification algorithm: it models the probability that an observation belongs to the positive class.
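To make the probability link concrete, here is a minimal sketch of the sigmoid (logistic) function that turns a linear score into a probability. The coefficient w and intercept b below are hand-picked for illustration, not taken from any fitted model:
import numpy as np

def sigmoid(z):
    """Map any real-valued score to a probability in (0, 1)."""
    return 1.0 / (1.0 + np.exp(-z))

# Hypothetical single-feature model: score = w*x + b (values are illustrative)
w, b = 0.8, -1.5
x = np.array([0.0, 1.0, 2.0, 3.0])
print(sigmoid(w * x + b))  # probabilities rise smoothly as the score increases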
from sklearn.linear_model import LogisticRegression
# Train
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)
# Predict
y_pred_lr = log_reg.predict(X_test_scaled)
y_pred_proba_lr = log_reg.predict_proba(X_test_scaled)[:, 1]
# Evaluate
results_lr = evaluate_classifier('Logistic Regression', y_test, y_pred_lr, y_pred_proba_lr)
# Feature importance (coefficients)
coef_df = pd.DataFrame({
'Feature': X.columns,
'Coefficient': log_reg.coef_[0]
}).sort_values('Coefficient', key=abs, ascending=False)
print(f"\nFeature Coefficients:")
print(coef_df)
6.7 2. Decision Tree Classifier
Concept: Learns a hierarchy of if/else rules that repeatedly split the data into purer and purer subsets.
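To see what a tree actually learns, here is a minimal sketch on hand-made toy data (the single feature and its values are invented for illustration); export_text prints the learned rules:
from sklearn.tree import DecisionTreeClassifier, export_text
import numpy as np

# Toy data (invented): churn (1) is more likely at low tenure
X_toy = np.array([[5], [15], [25], [35]])
y_toy = np.array([1, 1, 0, 0])

toy_tree = DecisionTreeClassifier(max_depth=2, random_state=42).fit(X_toy, y_toy)
print(export_text(toy_tree, feature_names=['tenure_months']))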
from sklearn.tree import DecisionTreeClassifier
# Train
dt_clf = DecisionTreeClassifier(max_depth=5, min_samples_split=50, random_state=42)
dt_clf.fit(X_train, y_train)
# Predict
y_pred_dt = dt_clf.predict(X_test)
y_pred_proba_dt = dt_clf.predict_proba(X_test)[:, 1]
# Evaluate
results_dt = evaluate_classifier('Decision Tree', y_test, y_pred_dt, y_pred_proba_dt)
# Feature importance
importance_df = pd.DataFrame({
'Feature': X.columns,
'Importance': dt_clf.feature_importances_
}).sort_values('Importance', ascending=False)
print(f"\nFeature Importance:")
print(importance_df)
6.8 3. Random Forest Classifier
Concept: An ensemble of decision trees, each trained on a bootstrap sample with random feature subsets; predictions are combined by majority vote.
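One useful consequence of the bootstrap sampling: the rows a tree never saw provide a free validation estimate. A sketch using scikit-learn's oob_score option (hyperparameters mirror the model below; this is an optional aside, not part of the main workflow):
from sklearn.ensemble import RandomForestClassifier

# Out-of-bag accuracy: each tree is scored on the rows left out of its bootstrap sample
rf_oob = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=20,
                                oob_score=True, random_state=42, n_jobs=-1)
rf_oob.fit(X_train, y_train)
print(f"Out-of-bag accuracy estimate: {rf_oob.oob_score_:.3f}")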
from sklearn.ensemble import RandomForestClassifier
# Train
rf_clf = RandomForestClassifier(
n_estimators=100,
max_depth=10,
min_samples_split=20,
random_state=42,
n_jobs=-1
)
rf_clf.fit(X_train, y_train)
# Predict
y_pred_rf = rf_clf.predict(X_test)
y_pred_proba_rf = rf_clf.predict_proba(X_test)[:, 1]
# Evaluate
results_rf = evaluate_classifier('Random Forest', y_test, y_pred_rf, y_pred_proba_rf)
# Feature importance
importance_df_rf = pd.DataFrame({
'Feature': X.columns,
'Importance': rf_clf.feature_importances_
}).sort_values('Importance', ascending=False)
print(f"\nFeature Importance:")
print(importance_df_rf)
6.9 4. Support Vector Machine (SVM)
Concept: Finds the maximum-margin boundary (hyperplane) that best separates the classes.
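To make the hyperplane idea concrete, a minimal sketch on hand-made, linearly separable toy data; the fitted coef_ and intercept_ define the separating line w·x + b = 0, and support_vectors_ are the points that pin down the margin:
from sklearn.svm import SVC
import numpy as np

# Toy data (invented): two classes separated along the diagonal
X_toy = np.array([[0, 0], [1, 1], [3, 3], [4, 4]])
y_toy = np.array([0, 0, 1, 1])

lin_svm = SVC(kernel='linear', C=1.0).fit(X_toy, y_toy)
print(lin_svm.coef_, lin_svm.intercept_)  # w and b of the hyperplane
print(lin_svm.support_vectors_)           # expected: [1, 1] and [3, 3]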
from sklearn.svm import SVC
# Train
svm_clf = SVC(kernel='rbf', C=1.0, probability=True, random_state=42)
svm_clf.fit(X_train_scaled, y_train)
# Predict
y_pred_svm = svm_clf.predict(X_test_scaled)
y_pred_proba_svm = svm_clf.predict_proba(X_test_scaled)[:, 1]
# Evaluate
results_svm = evaluate_classifier('SVM', y_test, y_pred_svm, y_pred_proba_svm)
Key parameters:
- kernel: 'linear', 'rbf', or 'poly' ('rbf' is the default and usually a good starting point)
- C: inverse regularization strength (higher C = less regularization, tighter fit to the training data)
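To get a feel for C, here is a quick sketch that sweeps a few values on our scaled splits (the grid is illustrative; in practice you would tune C with cross-validation rather than the test set):
# Illustrative sweep over C (grid values are arbitrary)
for C in [0.01, 0.1, 1.0, 10.0]:
    clf = SVC(kernel='rbf', C=C, random_state=42)
    clf.fit(X_train_scaled, y_train)
    print(f"C={C:>5}: test accuracy = {clf.score(X_test_scaled, y_test):.3f}")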
6.10 5. K-Nearest Neighbors (KNN)
Concept: Classifies based on majority vote of K nearest neighbors.
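A minimal sketch of the voting idea on hand-made one-dimensional toy data (values invented for illustration):
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Toy data (invented): two well-separated clusters
X_toy = np.array([[0], [1], [2], [10], [11], [12]])
y_toy = np.array([0, 0, 0, 1, 1, 1])

knn3 = KNeighborsClassifier(n_neighbors=3).fit(X_toy, y_toy)
print(knn3.predict([[3]]))  # nearest neighbors 2, 1, 0 -> all class 0
print(knn3.predict([[9]]))  # nearest neighbors 10, 11, 12 -> all class 1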
from sklearn.neighbors import KNeighborsClassifier
# Train (actually just stores training data)
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train_scaled, y_train)
# Predict
y_pred_knn = knn_clf.predict(X_test_scaled)
y_pred_proba_knn = knn_clf.predict_proba(X_test_scaled)[:, 1]
# Evaluate
results_knn = evaluate_classifier('KNN', y_test, y_pred_knn, y_pred_proba_knn)
Key parameter: n_neighbors, the number of neighbors that vote (typically 3-10; odd values avoid ties in binary problems).
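As with C for the SVM, a quick sketch sweeping a few k values on our scaled splits (the grid is illustrative; tune with cross-validation in practice):
# Illustrative sweep over k (F1 balances precision and recall on our imbalanced target)
for k in [3, 5, 7, 9, 11]:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    y_pred_k = knn.predict(X_test_scaled)
    print(f"k={k:>2}: test F1 = {f1_score(y_test, y_pred_k):.3f}")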
6.11 Model Comparison
# Compile results
comparison_df = pd.DataFrame({
'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM', 'KNN'],
'Accuracy': [results_lr['accuracy'], results_dt['accuracy'], results_rf['accuracy'],
results_svm['accuracy'], results_knn['accuracy']],
'Precision': [results_lr['precision'], results_dt['precision'], results_rf['precision'],
results_svm['precision'], results_knn['precision']],
'Recall': [results_lr['recall'], results_dt['recall'], results_rf['recall'],
results_svm['recall'], results_knn['recall']],
'F1': [results_lr['f1'], results_dt['f1'], results_rf['f1'],
results_svm['f1'], results_knn['f1']],
'ROC AUC': [results_lr['roc_auc'], results_dt['roc_auc'], results_rf['roc_auc'],
results_svm['roc_auc'], results_knn['roc_auc']]
})
print("\nModel Comparison:")
print(comparison_df.to_string(index=False))
# Visualize
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Metrics comparison
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1']
x = np.arange(len(comparison_df))
width = 0.2
for idx, metric in enumerate(metrics_to_plot):
axes[0].bar(x + idx*width, comparison_df[metric], width, label=metric)
axes[0].set_xlabel('Model')
axes[0].set_ylabel('Score')
axes[0].set_title('Classification Metrics Comparison')
axes[0].set_xticks(x + width * 1.5)
axes[0].set_xticklabels(comparison_df['Model'], rotation=45, ha='right')
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)
axes[0].set_ylim([0, 1])
# ROC AUC comparison
axes[1].barh(comparison_df['Model'], comparison_df['ROC AUC'])
axes[1].set_xlabel('ROC AUC Score')
axes[1].set_title('ROC AUC Comparison')
axes[1].grid(axis='x', alpha=0.3)
axes[1].set_xlim([0, 1])
plt.tight_layout()
plt.show()
6.12 Understanding Classification Metrics
6.12.1 Confusion Matrix
| | Predicted No | Predicted Yes |
|---|---|---|
| Actual No | 100 (TN) | 20 (FP) |
| Actual Yes | 10 (FN) | 70 (TP) |
6.12.2 Metrics Explained
- Accuracy = (TP + TN) / Total. Overall correctness; misleading with imbalanced classes!
- Precision = TP / (TP + FP). "Of all predicted positives, how many were correct?" Important when false positives are costly.
- Recall (Sensitivity) = TP / (TP + FN). "Of all actual positives, how many did we find?" Important when false negatives are costly.
- F1 Score = 2 * (Precision * Recall) / (Precision + Recall). The harmonic mean of precision and recall; a good balance when you care about both.
- ROC AUC: the area under the ROC curve. Measures how well the classifier ranks positives above negatives; 0.5 corresponds to random guessing and 1.0 to a perfect ranking.
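A worked example computing these by hand from the confusion matrix above (TN = 100, FP = 20, FN = 10, TP = 70):
# Metrics computed directly from the confusion matrix counts
tn, fp, fn, tp = 100, 20, 10, 70
accuracy  = (tp + tn) / (tp + tn + fp + fn)                # 170/200 = 0.850
precision = tp / (tp + fp)                                 # 70/90  ~= 0.778
recall    = tp / (tp + fn)                                 # 70/80   = 0.875
f1        = 2 * precision * recall / (precision + recall)  # ~= 0.824
print(f"accuracy={accuracy:.3f} precision={precision:.3f} recall={recall:.3f} f1={f1:.3f}")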
6.12.3 Which Metric to Optimize?
| Scenario | Optimize |
|---|---|
| Spam detection | Precision (avoid blocking real emails) |
| Disease diagnosis | Recall (don’t miss sick patients) |
| Fraud detection | F1 or Recall (catch fraudsters, but not too many false alarms) |
| Balanced dataset | Accuracy or F1 |
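In practice, you act on this choice by moving the decision threshold rather than switching metrics after the fact. A sketch using the logistic regression probabilities from 6.6 (scikit-learn's predict implicitly uses 0.5; the other thresholds are illustrative):
# Lowering the threshold trades precision for recall, and vice versa
for threshold in [0.3, 0.5, 0.7]:
    y_pred_t = (y_pred_proba_lr >= threshold).astype(int)
    p = precision_score(y_test, y_pred_t, zero_division=0)
    r = recall_score(y_test, y_pred_t)
    print(f"threshold={threshold}: precision={p:.3f}, recall={r:.3f}")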
6.13 ROC Curve Visualization
from sklearn.metrics import roc_curve, auc
# Calculate ROC curves
models_proba = {
'Logistic Regression': y_pred_proba_lr,
'Random Forest': y_pred_proba_rf,
'SVM': y_pred_proba_svm,
'KNN': y_pred_proba_knn
}
plt.figure(figsize=(10, 8))
for name, y_proba in models_proba.items():
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
6.14 When to Use Each Algorithm
| Algorithm | Use When | Pros | Cons |
|---|---|---|---|
| Logistic Regression | Linear decision boundary, need interpretability | Fast, interpretable, probability estimates | Assumes linearity |
| Decision Tree | Need interpretability, non-linear data | Interpretable, handles non-linearity | Overfits easily |
| Random Forest | Want high accuracy, have enough data | Excellent performance, robust | Slow, black box |
| SVM | High-dimensional data, clear margin | Effective in high dimensions | Slow on large datasets |
| KNN | Small dataset, simple baseline | Simple, no training time | Slow prediction, sensitive to scaling |
6.15 Summary
- Classification predicts categories, not continuous values
- Logistic Regression is the baseline for binary classification
- Random Forest is often the strongest out-of-the-box performer on tabular data
- Metrics matter: Choose based on your business problem
- Imbalanced classes require special attention (Chapter 9)
- Always examine confusion matrix to understand errors
Next: Unsupervised learning algorithms!