4 Model Validation & Overfitting
4.1 The Fundamental Problem
Overfitting is when your model memorizes training data instead of learning generalizable patterns.
Analogy: A student who memorizes answers performs well on practice tests but fails the real exam.
4.2 Demonstration: Overfitting in Action
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Generate simple data: y = x^2 + noise
np.random.seed(42)
X = np.linspace(0, 10, 50).reshape(-1, 1)
y = X.ravel()**2 + np.random.normal(0, 10, 50)
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train models with different complexity
models = {
    'Underfit (depth=1)': DecisionTreeRegressor(max_depth=1, random_state=42),
    'Good Fit (depth=3)': DecisionTreeRegressor(max_depth=3, random_state=42),
    'Overfit (depth=20)': DecisionTreeRegressor(max_depth=20, random_state=42),
}
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    results.append({
        'Model': name,
        'Train R²': train_score,
        'Test R²': test_score,
        'Gap': train_score - test_score
    })
results_df = pd.DataFrame(results)
print(results_df)
Key insight: Large gap between train and test scores = overfitting!
4.3 Visualizing Overfitting
# Generate smooth prediction line
X_plot = np.linspace(0, 10, 300).reshape(-1, 1)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for idx, (name, model) in enumerate(models.items()):
    ax = axes[idx]
    # Predictions on the dense grid
    y_plot = model.predict(X_plot)
    # Plot training data, test data, and the fitted curve
    ax.scatter(X_train, y_train, alpha=0.6, label='Training data')
    ax.scatter(X_test, y_test, alpha=0.6, label='Test data')
    ax.plot(X_plot, y_plot, 'r-', linewidth=2, label='Model prediction')
    ax.set_title(name)
    ax.set_xlabel('X')
    ax.set_ylabel('y')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
4.4 Train/Validation/Test Split
Problem: Can’t tune hyperparameters on test set (that’s cheating!)
Solution: Three-way split
from sklearn.model_selection import train_test_split
# Create sample data
X = np.random.randn(1000, 5)
y = np.random.randint(0, 2, 1000)
# First split: 80% train+val, 20% test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Second split: 75% train, 25% validation (of the 80%)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42  # 0.25 * 0.8 = 0.2 of total
)
print(f"Training set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X):.0%})")
print(f"Validation set: {X_val.shape[0]} samples ({X_val.shape[0]/len(X):.0%})")
print(f"Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(X):.0%})")
Usage:
- Training set: Fit the model
- Validation set: Tune hyperparameters
- Test set: Final evaluation (use only once!)
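To make this usage concrete, here is a minimal sketch (the candidate depths and the use of a DecisionTreeClassifier are illustrative choices, not part of the original example) that tunes max_depth on the validation set and touches the test set exactly once:
from sklearn.tree import DecisionTreeClassifier

# Try a few candidate depths, scoring each on the validation set only
best_depth, best_val_acc = None, -1.0
for depth in [2, 4, 6, 8, 10]:
    candidate = DecisionTreeClassifier(max_depth=depth, random_state=42)
    candidate.fit(X_train, y_train)
    val_acc = candidate.score(X_val, y_val)
    if val_acc > best_val_acc:
        best_depth, best_val_acc = depth, val_acc

# Refit with the chosen depth, then report the test score once
final_model = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
final_model.fit(X_train, y_train)
print(f"Chosen max_depth: {best_depth} (validation accuracy {best_val_acc:.3f})")
print(f"Final test accuracy (reported once): {final_model.score(X_test, y_test):.3f}")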
4.5 Cross-Validation
Problem: Validation set might not be representative
Solution: K-Fold Cross-Validation
4.5.1 How It Works
- Split data into K folds (typically 5 or 10)
- Train on K-1 folds, validate on the remaining fold
- Repeat K times, each fold gets a turn as validation
- Average the K scores (see the manual sketch below)
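For intuition, here is a rough manual version of that loop (a sketch only, using scikit-learn's KFold on illustrative random data; the variable names are not from the original example):
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

X_demo = np.random.randn(500, 10)       # illustrative features
y_demo = np.random.randint(0, 2, 500)   # illustrative binary labels

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for train_idx, val_idx in kf.split(X_demo):
    fold_model = DecisionTreeClassifier(max_depth=5, random_state=42)
    fold_model.fit(X_demo[train_idx], y_demo[train_idx])  # train on K-1 folds
    fold_scores.append(fold_model.score(X_demo[val_idx], y_demo[val_idx]))  # validate on the held-out fold
print(f"Manual 5-fold accuracies: {np.round(fold_scores, 3)}")
print(f"Mean: {np.mean(fold_scores):.3f}")
In practice, cross_val_score wraps exactly this pattern in a single call: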
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
# Sample data
X = np.random.randn(500, 10)
y = np.random.randint(0, 2, 500)
# Create model
model = DecisionTreeClassifier(max_depth=5, random_state=42)
# 5-fold cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")
4.5.2 Comparing Models with Cross-Validation
models_to_compare = {
    'Depth 3': DecisionTreeClassifier(max_depth=3, random_state=42),
    'Depth 5': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Depth 10': DecisionTreeClassifier(max_depth=10, random_state=42),
    'Depth 20': DecisionTreeClassifier(max_depth=20, random_state=42),
}
cv_results = []
for name, model in models_to_compare.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    cv_results.append({
        'Model': name,
        'Mean CV Score': scores.mean(),
        'Std Dev': scores.std()
    })
cv_df = pd.DataFrame(cv_results)
print(cv_df)
4.6 Underfitting vs Overfitting
4.6.1 Underfitting
- Symptom: Poor performance on both training and test data
- Cause: Model too simple
- Solution: Increase model complexity (e.g., increase max_depth)
4.6.2 Overfitting
- Symptom: Great training performance, poor test performance
- Cause: Model too complex or too little data
- Solution:
  - Reduce complexity (decrease max_depth, increase min_samples_split), as sketched below
  - Get more training data
  - Use regularization
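As a quick sketch of the first remedy (synthetic data modelled on the earlier quadratic example; the specific max_depth and min_samples_split values are illustrative), restricting tree complexity shrinks the train/test gap:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# Noisy quadratic data, as in the earlier demonstration
rng = np.random.default_rng(42)
X_syn = np.linspace(0, 10, 200).reshape(-1, 1)
y_syn = X_syn.ravel() ** 2 + rng.normal(0, 10, 200)
X_tr, X_te, y_tr, y_te = train_test_split(X_syn, y_syn, test_size=0.3, random_state=42)

for label, tree in [
    ('Unconstrained tree', DecisionTreeRegressor(random_state=42)),
    ('Constrained tree', DecisionTreeRegressor(max_depth=4, min_samples_split=10, random_state=42)),
]:
    tree.fit(X_tr, y_tr)
    print(f"{label}: train R² = {tree.score(X_tr, y_tr):.3f}, test R² = {tree.score(X_te, y_te):.3f}")
The unconstrained tree fits the training split essentially perfectly, while the constrained one usually trades a little training accuracy for a much smaller gap, which is the pattern described next.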
4.6.3 Good Fit
- Symptom: Good performance on both training and test data
- Goal: Small gap between train and test scores
4.7 Learning Curves
Visualize how model performance changes with training data size.
from sklearn.model_selection import learning_curve
# Generate data
X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, 1000)
# Calculate learning curves
train_sizes, train_scores, val_scores = learning_curve(
    DecisionTreeClassifier(max_depth=5, random_state=42),
    X, y,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy',
    n_jobs=-1
)
# Calculate mean and std
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
val_mean = val_scores.mean(axis=1)
val_std = val_scores.std(axis=1)
# Plot
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, label='Training score', marker='o')
plt.plot(train_sizes, val_mean, label='Validation score', marker='o')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1)
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.title('Learning Curves')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Interpretation:
- Gap between curves: Indicates overfitting
- Both curves plateau: More data won’t help much
- Curves still rising: More data could improve performance
4.8 Validation Curves
Visualize how hyperparameters affect performance.
from sklearn.model_selection import validation_curve
param_range = range(1, 21)
train_scores, val_scores = validation_curve(
    DecisionTreeClassifier(random_state=42),
    X, y,
    param_name='max_depth',
    param_range=param_range,
    cv=5,
    scoring='accuracy'
)
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
val_mean = val_scores.mean(axis=1)
val_std = val_scores.std(axis=1)
plt.figure(figsize=(10, 6))
plt.plot(param_range, train_mean, label='Training score', marker='o')
plt.plot(param_range, val_mean, label='Validation score', marker='o')
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.1)
plt.fill_between(param_range, val_mean - val_std, val_mean + val_std, alpha=0.1)
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.title('Validation Curve')
plt.grid(True, alpha=0.3)
plt.axvline(x=param_range[val_mean.argmax()], color='r', linestyle='--',
            label=f'Best depth: {param_range[val_mean.argmax()]}')
plt.legend()
plt.tight_layout()
plt.show()
print(f"Best max_depth: {param_range[val_mean.argmax()]}")
4.9 Best Practices
- Always use cross-validation for model selection
- Keep test set locked until final evaluation
- Watch for the gap between training and validation scores
- More data helps but has diminishing returns
- Start simple and increase complexity only if needed
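One way to put these practices together (a sketch assuming scikit-learn's GridSearchCV; the data and parameter grid are illustrative) is to cross-validate hyperparameters on the training portion and reserve the test set for a single final score:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

X_all = np.random.randn(1000, 10)
y_all = np.random.randint(0, 2, 1000)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Cross-validated hyperparameter search on the training data only
search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid={'max_depth': [2, 4, 6, 8, 10]},
    cv=5,
    scoring='accuracy'
)
search.fit(X_tr, y_tr)

print(f"Best params: {search.best_params_}, CV accuracy: {search.best_score_:.3f}")
# The test set is touched exactly once, for the final report
print(f"Held-out test accuracy: {search.score(X_te, y_te):.3f}")
By default GridSearchCV refits the best estimator on the full training portion, so search.score(X_te, y_te) evaluates that final model.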
4.10 Summary
- Overfitting: Model memorizes training data (high train score, low test score)
- Underfitting: Model too simple (low train score, low test score)
- Cross-validation: Robust way to estimate model performance
- Learning curves: Show if more data will help
- Validation curves: Find optimal hyperparameters
- Three-way split: Train, validation, test (60/20/20 or 70/15/15)
Next: We’ll explore ensemble methods that reduce overfitting!