9 Real-World Machine Learning
Moving from tutorials to production requires handling messy data and a range of practical considerations.
9.1 Topics Covered
- Handling imbalanced datasets
- Feature engineering
- Pipelines for reproducibility
- Dealing with categorical variables
- Handling missing data strategies
- Model persistence
9.2 1. Handling Imbalanced Data
Problem: When one class dominates (e.g., 95% negative, 5% positive).
9.2.1 Create Imbalanced Dataset
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Create imbalanced dataset (10% positive class)
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    weights=[0.9, 0.1],  # 90% class 0, 10% class 1
    random_state=42
)
print("Class distribution:")
print(pd.Series(y).value_counts())
print(f"Imbalance ratio: {pd.Series(y).value_counts()[0] / pd.Series(y).value_counts()[1]:.1f}:1")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
9.2.2 Naive Approach (Fails!)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Train without addressing imbalance
rf_naive = RandomForestClassifier(random_state=42)
rf_naive.fit(X_train, y_train)
y_pred_naive = rf_naive.predict(X_test)
print("Naive approach (ignoring imbalance):")
print(f"Accuracy: {accuracy_score(y_test, y_pred_naive):.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_naive))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_naive))
Problem: High accuracy but terrible recall on the minority class!
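To see why that accuracy figure is hollow, compare it with a baseline that always predicts the majority class. A quick check (the DummyClassifier below is added here for illustration; it is not part of the workflow above):
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, recall_score
# Baseline that ignores the features and always predicts the majority class (0)
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)
print(f"Majority-class baseline accuracy: {accuracy_score(y_test, y_pred_dummy):.3f}")
print(f"Majority-class baseline recall (class 1): {recall_score(y_test, y_pred_dummy):.3f}")
On a 90/10 split this baseline already sits near 0.9 accuracy with zero recall on class 1, which is why accuracy alone is not a useful yardstick here.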
9.2.3 Solution 1: Class Weights
# Use class_weight='balanced'
rf_balanced = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_balanced.fit(X_train, y_train)
y_pred_balanced = rf_balanced.predict(X_test)
print("\nWith class_weight='balanced':")
print(f"Accuracy: {accuracy_score(y_test, y_pred_balanced):.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_balanced))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_balanced))
9.2.4 Solution 2: Resampling
# Install imbalanced-learn first: pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
# SMOTE: Synthetic Minority Over-sampling Technique
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print(f"\nAfter SMOTE:")
print(pd.Series(y_train_smote).value_counts())
rf_smote = RandomForestClassifier(random_state=42)
rf_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = rf_smote.predict(X_test)
print("\nWith SMOTE:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_smote):.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_smote))
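One caveat: resample only the training data, as above. If you cross-validate, the resampling has to happen inside each fold, which is exactly what the imblearn pipeline imported earlier is for. A minimal sketch (the recall scoring choice is ours, not prescribed above):
from sklearn.model_selection import cross_val_score
# SMOTE runs only on the training portion of each fold; the validation fold stays untouched
smote_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])
cv_recall = cross_val_score(smote_pipeline, X_train, y_train, cv=5, scoring='recall')
print(f"Cross-validated recall with in-fold SMOTE: {cv_recall.mean():.3f} +/- {cv_recall.std():.3f}")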
9.2.5 Solution 3: Adjust Decision Threshold
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
# Get probability predictions
y_pred_proba = rf_naive.predict_proba(X_test)[:, 1]
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
# Find optimal threshold (Youden's J statistic)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f"\nOptimal threshold: {optimal_threshold:.3f} (default is 0.5)")
# Apply custom threshold
y_pred_custom = (y_pred_proba >= optimal_threshold).astype(int)
print("\nWith custom threshold:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_custom):.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_custom))
# Visualize ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, linewidth=2, label='ROC curve')
plt.plot([0, 1], [0, 1], 'k--', label='Random classifier')
plt.scatter(fpr[optimal_idx], tpr[optimal_idx], s=200, c='red',
            label=f'Optimal threshold = {optimal_threshold:.2f}', zorder=3)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve with Optimal Threshold')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
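With only 10% positives, the precision-recall curve is often a more informative view than ROC, and the same threshold-tuning idea carries over. A short sketch that picks the threshold maximizing F1 (an alternative criterion to Youden's J, added here for comparison):
from sklearn.metrics import precision_recall_curve
# Precision and recall for every candidate threshold on the same probabilities
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
# The last precision/recall pair has no matching threshold, hence the [:-1] slices
f1_scores = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1] + 1e-12)
best_idx = np.argmax(f1_scores)
print(f"Threshold maximizing F1: {pr_thresholds[best_idx]:.3f} (F1 = {f1_scores[best_idx]:.3f})")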
9.3 2. Feature Engineering
Creating better features often improves models more than tuning hyperparameters.
9.3.1 Common Techniques
# Sample data
df = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=100),
    'price': np.random.uniform(10, 100, 100),
    'quantity': np.random.randint(1, 50, 100),
    'category': np.random.choice(['A', 'B', 'C'], 100)
})
print("Original features:")
print(df.head())
# 1. Create interaction features
df['revenue'] = df['price'] * df['quantity']
# 2. Extract datetime features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
# 3. Binning continuous features
df['price_category'] = pd.cut(df['price'], bins=[0, 30, 70, 100],
                              labels=['low', 'medium', 'high'])
# 4. Aggregations (rolling statistics)
df['price_rolling_mean'] = df['price'].rolling(window=7, min_periods=1).mean()
print("\nWith engineered features:")
print(df.head(10))
9.3.2 Polynomial Features
from sklearn.preprocessing import PolynomialFeatures
# Create polynomial features
X_simple = np.array([[2, 3], [3, 4], [4, 5]])
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_simple)
print("Original features:")
print(X_simple)
print(f"\nPolynomial features (degree=2):")
print(X_poly)
print(f"\nFeature names: {poly.get_feature_names_out(['x1', 'x2'])}")9.4 3. Pipelines for Reproducibility
Problem: Preprocessing steps scattered in code, hard to reproduce.
Solution: Scikit-learn Pipelines!
9.4.1 Basic Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])
# Fit (automatically scales then trains)
pipeline.fit(X_train, y_train)
# Predict (automatically scales then predicts)
y_pred = pipeline.predict(X_test)
accuracy = pipeline.score(X_test, y_test)
print(f"Pipeline accuracy: {accuracy:.3f}")9.4.2 Advanced Pipeline with Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
# Create sample data with mixed types
df_mixed = pd.DataFrame({
    'age': [25, 30, np.nan, 35, 28],
    'income': [50000, 60000, 55000, np.nan, 52000],
    'city': ['NYC', 'LA', 'NYC', 'SF', 'LA'],
    'purchased': [1, 0, 1, 1, 0]
})
# Separate features and target
X = df_mixed.drop('purchased', axis=1)
y = df_mixed['purchased']
# Define numeric and categorical columns
numeric_features = ['age', 'income']
categorical_features = ['city']
# Create preprocessing pipelines
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False))
])
# Combine with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)
# Full pipeline
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])
# Fit the full pipeline (preprocessing and model in one step)
full_pipeline.fit(X, y)
print("Pipeline steps:")
for name, step in full_pipeline.named_steps.items():
    print(f"  {name}: {step}")
9.4.3 Pipeline with GridSearchCV
from sklearn.model_selection import GridSearchCV
# Define parameter grid (note the naming: step_name__parameter)
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [5, 10, None]
}
# Grid search with pipeline
grid_search = GridSearchCV(
    full_pipeline,
    param_grid,
    cv=3,
    scoring='accuracy'
)
# This would be fit on a real dataset with the same column layout (age, income, city)
# grid_search.fit(X_train, y_train)
print("Pipeline + GridSearch ready!")
print(f"Total combinations: {len(param_grid['preprocessor__num__imputer__strategy']) * len(param_grid['classifier__n_estimators']) * len(param_grid['classifier__max_depth'])}")
9.5 4. Categorical Variable Encoding
9.5.1 One-Hot Encoding
from sklearn.preprocessing import OneHotEncoder
# Sample data
df_cat = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'red', 'blue'],
    'size': ['S', 'M', 'L', 'M', 'S']
})
encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop first to avoid multicollinearity
encoded = encoder.fit_transform(df_cat)
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out()
)
print("Original:")
print(df_cat)
print("\nOne-Hot Encoded:")
print(encoded_df)
9.5.2 Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder
# When categories have order
df_ordinal = pd.DataFrame({
    'education': ['High School', 'Bachelor', 'Master', 'PhD', 'Bachelor']
})
# Define order
categories = [['High School', 'Bachelor', 'Master', 'PhD']]
ordinal_encoder = OrdinalEncoder(categories=categories)
encoded_ordinal = ordinal_encoder.fit_transform(df_ordinal)
print("Original:")
print(df_ordinal)
print("\nOrdinal Encoded:")
print(encoded_ordinal)
9.5.3 Target Encoding (Advanced)
# Encode categorical based on target mean
df_target = pd.DataFrame({
    'city': ['NYC', 'LA', 'NYC', 'SF', 'LA', 'NYC', 'SF', 'LA'],
    'price': [100, 80, 95, 120, 85, 105, 115, 90]
})
# Calculate mean price per city
target_means = df_target.groupby('city')['price'].mean()
df_target['city_encoded'] = df_target['city'].map(target_means)
print("Target Encoding:")
print(df_target)
print(f"\nMean prices: {target_means.to_dict()}")9.6 5. Missing Data Strategies
9.6 5. Missing Data Strategies
from sklearn.impute import SimpleImputer, KNNImputer
# Create data with missing values
df_missing = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5, np.nan, 7],
    'B': [10, np.nan, 30, np.nan, 50, 60, 70],
    'C': [100, 200, 300, 400, 500, 600, 700]
})
print("Original data:")
print(df_missing)
# Strategy 1: Mean/Median imputation
imputer_mean = SimpleImputer(strategy='mean')
df_mean = pd.DataFrame(imputer_mean.fit_transform(df_missing), columns=df_missing.columns)
print("\nMean imputation:")
print(df_mean)
# Strategy 2: KNN imputation (uses nearby samples)
imputer_knn = KNNImputer(n_neighbors=2)
df_knn = pd.DataFrame(imputer_knn.fit_transform(df_missing), columns=df_missing.columns)
print("\nKNN imputation:")
print(df_knn)
# Strategy 3: Forward fill (for time series)
df_ffill = df_missing.ffill()
print("\nForward fill:")
print(df_ffill)
9.7 6. Model Persistence
Save and load models for production.
import joblib
from sklearn.ensemble import RandomForestClassifier
# Train a model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Save model
# joblib.dump(model, 'random_forest_model.pkl')
# print("Model saved!")
# Load model
# loaded_model = joblib.load('random_forest_model.pkl')
# predictions = loaded_model.predict(X_test)
# print(f"Loaded model accuracy: {loaded_model.score(X_test, y_test):.3f}")
print("Use joblib.dump() and joblib.load() for model persistence")9.7.1 Save Entire Pipeline
9.7.1 Save Entire Pipeline
# Save preprocessing + model
# joblib.dump(full_pipeline, 'full_pipeline.pkl')
# Load and use
# loaded_pipeline = joblib.load('full_pipeline.pkl')
# predictions = loaded_pipeline.predict(new_data)
print("Save pipelines to ensure consistent preprocessing in production!")9.8 Best Practices Checklist
9.8.1 Data Preparation
- ✅ Handle missing values appropriately
- ✅ Encode categorical variables
- ✅ Scale/normalize features for distance-based algorithms
- ✅ Check for and handle outliers
- ✅ Split data before any preprocessing (see the sketch after this list)
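The last point is the one most often violated: statistics such as a scaler's mean and standard deviation must come from the training split only. A minimal sketch of the safe order, using placeholder data (X_demo and y_demo are stand-ins, not variables from this chapter):
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Placeholder numeric data; substitute your own feature matrix and labels
X_demo = np.random.rand(100, 5)
y_demo = np.random.randint(0, 2, 100)
# Split first, then fit preprocessing on the training portion only
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.2, random_state=42)
scaler = StandardScaler().fit(X_tr)   # statistics come from training data only
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)  # test data reuses the training statistics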
9.8.2 Model Development
- ✅ Start with a simple baseline (see the sketch after this list)
- ✅ Use cross-validation for evaluation
- ✅ Address class imbalance if present
- ✅ Feature engineering before hyperparameter tuning
- ✅ Use pipelines for reproducibility
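For the first two items, a plain logistic regression evaluated with cross-validation gives a number every more complex model has to beat. A minimal sketch, reusing the placeholder data from the previous sketch:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Simple, well-understood baseline scored with 5-fold cross-validation
baseline = LogisticRegression(max_iter=1000)
cv_scores = cross_val_score(baseline, X_demo, y_demo, cv=5, scoring='accuracy')
print(f"Baseline CV accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")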
9.8.3 Evaluation
- ✅ Choose appropriate metrics for your problem
- ✅ Examine confusion matrix
- ✅ Test on held-out data
- ✅ Monitor for overfitting
- ✅ Consider business impact of errors
9.8.4 Production
- ✅ Save models and preprocessing pipelines
- ✅ Version your models
- ✅ Monitor performance over time
- ✅ Plan for model retraining
- ✅ Document assumptions and limitations
9.9 Summary
- Imbalanced data: Use class weights, resampling, or adjust thresholds
- Feature engineering: Often more impactful than model tuning
- Pipelines: Essential for reproducible, production-ready code
- Categorical encoding: One-hot for nominal, ordinal for ordered
- Missing data: Choose a strategy based on the missingness mechanism (MCAR: missing completely at random, MAR: missing at random, MNAR: missing not at random)
- Save pipelines: Not just models, to ensure consistent preprocessing
Next: Visualization techniques for ML!