5 Supervised Learning - Regression
Regression predicts continuous values (prices, temperatures, quantities).
5.1 Algorithms Covered
- Linear Regression
- Ridge Regression (L2 regularization)
- Lasso Regression (L1 regularization)
- Decision Tree Regressor
- Random Forest Regressor
5.2 Dataset: Boston Housing (Simulated)
To demonstrate regression algorithms, we’ll create a synthetic housing dataset that mimics real-world pricing patterns. This simulated data includes features like crime rate, average number of rooms, property age, and tax rates—all factors that typically influence house prices. Using synthetic data allows us to control the relationships between features and understand exactly how each algorithm learns patterns.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Create synthetic housing dataset
np.random.seed(42)
n_samples = 500
data = pd.DataFrame({
    'crime_rate': np.random.exponential(2, n_samples),
    'avg_rooms': np.random.normal(6, 1, n_samples),
    'age': np.random.uniform(0, 100, n_samples),
    'distance_to_city': np.random.exponential(5, n_samples),
    'tax_rate': np.random.uniform(200, 700, n_samples),
})
# Price generation with realistic relationships
data['price'] = (
    500000
    - data['crime_rate'] * 15000
    + data['avg_rooms'] * 50000
    - data['age'] * 500
    - data['distance_to_city'] * 8000
    - data['tax_rate'] * 100
    + np.random.normal(0, 30000, n_samples)
)
print(data.head())
print(f"\nDataset shape: {data.shape}")
print(f"\nPrice range: ${data['price'].min():,.0f} - ${data['price'].max():,.0f}")
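As an optional sanity check, summary statistics and each feature’s correlation with price confirm the relationships we just built in. A minimal sketch, assuming only the data DataFrame defined above:
# Optional sanity check: summary stats and feature-price correlations
print(data.describe().round(1))
print("\nCorrelation with price:")
print(data.corr()['price'].drop('price').sort_values())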
5.3 Prepare Data
Before training any models, we need to separate our features (the input variables we’ll use for predictions) from our target (the variable we want to predict—in this case, house price). We’ll also split the data into training and test sets to ensure we can properly evaluate our models on unseen data.
# Features and target
X = data.drop('price', axis=1)
y = data['price']
# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")5.4 1. Linear Regression
Concept: Fits a straight line (or hyperplane) through the data.
Formula: y = b₀ + b₁x₁ + b₂x₂ + ... + bₙxₙ
from sklearn.linear_model import LinearRegression
# Train
lr = LinearRegression()
lr.fit(X_train, y_train)
# Predict
y_pred_lr = lr.predict(X_test)
# Evaluate
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)
print(f"Linear Regression Results:")
print(f"  MAE: ${mae_lr:,.2f}")
print(f"  RMSE: ${rmse_lr:,.2f}")
print(f"  R²: {r2_lr:.3f}")
# Coefficients
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lr.coef_
}).sort_values('Coefficient', key=abs, ascending=False)
print(f"\nFeature Coefficients:")
print(coef_df)
Interpretation: Each coefficient shows how price changes per unit increase in that feature.
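As a sanity check on the formula above, a prediction can be reproduced by hand from the intercept and coefficients. A minimal sketch using the first test row:
# Reproduce one prediction manually: y = b0 + b1*x1 + ... + bn*xn
first_row = X_test.iloc[0]
manual_pred = lr.intercept_ + np.dot(lr.coef_, first_row)
sklearn_pred = lr.predict(X_test.iloc[[0]])[0]
print(f"Manual prediction:  ${manual_pred:,.0f}")
print(f"sklearn prediction: ${sklearn_pred:,.0f}")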
5.5 2. Ridge Regression (L2 Regularization)
Concept: Linear regression with penalty on large coefficients (prevents overfitting).
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
# Scale features (important for regularization!)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train Ridge
ridge = Ridge(alpha=100)  # alpha controls regularization strength
ridge.fit(X_train_scaled, y_train)
# Predict
y_pred_ridge = ridge.predict(X_test_scaled)
# Evaluate
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)
print(f"Ridge Regression Results:")
print(f"  MAE: ${mae_ridge:,.2f}")
print(f"  RMSE: ${rmse_ridge:,.2f}")
print(f"  R²: {r2_ridge:.3f}")
# Compare coefficients
coef_comparison = pd.DataFrame({
    'Feature': X.columns,
    'Linear Reg': lr.coef_,
    'Ridge': ridge.coef_
})
print(f"\nCoefficient Comparison:")
print(coef_comparison)
Key parameter: alpha (higher = more regularization = simpler model). Note that the Linear Regression coefficients above are on the original feature scale while the Ridge coefficients are on standardized features, so their magnitudes are not directly comparable.
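To see how alpha trades off fit against coefficient size, a small sweep can be run. A sketch with illustrative (untuned) alpha values:
# Larger alpha -> stronger shrinkage of the coefficients
for alpha in [0.1, 1, 10, 100, 1000]:
    r = Ridge(alpha=alpha).fit(X_train_scaled, y_train)
    coef_size = np.abs(r.coef_).sum()
    r2 = r2_score(y_test, r.predict(X_test_scaled))
    print(f"alpha={alpha:>6}: sum|coef|={coef_size:,.0f}, R²={r2:.3f}")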
5.6 3. Lasso Regression (L1 Regularization)
Concept: Can shrink coefficients to exactly zero (feature selection!).
from sklearn.linear_model import Lasso
# Train Lasso
lasso = Lasso(alpha=1000, max_iter=10000)
lasso.fit(X_train_scaled, y_train)
# Predict
y_pred_lasso = lasso.predict(X_test_scaled)
# Evaluate
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = r2_score(y_test, y_pred_lasso)
print(f"Lasso Regression Results:")
print(f"  MAE: ${mae_lasso:,.2f}")
print(f"  RMSE: ${rmse_lasso:,.2f}")
print(f"  R²: {r2_lasso:.3f}")
# Feature selection
coef_lasso = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso.coef_
})
print(f"\nLasso Coefficients (some may be 0):")
print(coef_lasso)
print(f"\nFeatures kept: {(lasso.coef_ != 0).sum()}/{len(X.columns)}")5.7 4. Decision Tree Regressor
5.7 4. Decision Tree Regressor
Concept: Splits the feature space into regions by thresholding one feature at a time; each leaf predicts the mean price of its region.
from sklearn.tree import DecisionTreeRegressor
# Train
dt = DecisionTreeRegressor(max_depth=10, min_samples_split=20, random_state=42)
dt.fit(X_train, y_train)
# Predict
y_pred_dt = dt.predict(X_test)
# Evaluate
mae_dt = mean_absolute_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))
r2_dt = r2_score(y_test, y_pred_dt)
print(f"Decision Tree Results:")
print(f"  MAE: ${mae_dt:,.2f}")
print(f"  RMSE: ${rmse_dt:,.2f}")
print(f"  R²: {r2_dt:.3f}")5.8 5. Random Forest Regressor
5.8 5. Random Forest Regressor
Concept: Ensemble of decision trees (averages predictions).
from sklearn.ensemble import RandomForestRegressor
# Train
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=20,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
# Predict
y_pred_rf = rf.predict(X_test)
# Evaluate
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest Results:")
print(f"  MAE: ${mae_rf:,.2f}")
print(f"  RMSE: ${rmse_rf:,.2f}")
print(f"  R²: {r2_rf:.3f}")
# Feature importance
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)
print(f"\nFeature Importance:")
print(importance_df)
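Impurity-based importances can be misleading (they favor high-cardinality features and are computed on training data); permutation importance on the test set is a useful cross-check. A sketch using sklearn's permutation_importance:
from sklearn.inspection import permutation_importance
# Permutation importance: how much test-set R² drops when a feature is shuffled
perm = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42)
perm_df = pd.DataFrame({
    'Feature': X.columns,
    'Permutation Importance': perm.importances_mean
}).sort_values('Permutation Importance', ascending=False)
print(perm_df)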
5.9 Model Comparison
# Create comparison table
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Ridge', 'Lasso', 'Decision Tree', 'Random Forest'],
    'MAE': [mae_lr, mae_ridge, mae_lasso, mae_dt, mae_rf],
    'RMSE': [rmse_lr, rmse_ridge, rmse_lasso, rmse_dt, rmse_rf],
    'R²': [r2_lr, r2_ridge, r2_lasso, r2_dt, r2_rf]
})
results = results.sort_values('R²', ascending=False)
print("\nModel Comparison:")
print(results.to_string(index=False))
# Visualize
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
metrics = ['MAE', 'RMSE', 'R²']
for idx, metric in enumerate(metrics):
    ax = axes[idx]
    bars = ax.barh(results['Model'], results[metric])
    ax.set_xlabel(metric)
    ax.set_title(f'{metric} Comparison')
    ax.grid(axis='x', alpha=0.3)
    # Color best performer
    best_idx = results[metric].argmax() if metric == 'R²' else results[metric].argmin()
    bars[best_idx].set_color('green')
plt.tight_layout()
plt.show()
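A single train/test split can be noisy, so cross-validation gives a more stable comparison. A sketch using 5-fold cross-validated R² on the full dataset (fold count chosen arbitrarily; scaling is wrapped in a pipeline for the regularized models so it is refit inside each fold):
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated R² for each model
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': make_pipeline(StandardScaler(), Ridge(alpha=100)),
    'Lasso': make_pipeline(StandardScaler(), Lasso(alpha=1000, max_iter=10000)),
    'Decision Tree': DecisionTreeRegressor(max_depth=10, min_samples_split=20, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10,
                                           min_samples_split=20, random_state=42, n_jobs=-1),
}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f"{name:<20} R²: {scores.mean():.3f} ± {scores.std():.3f}")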
5.10 Regression Metrics Explained
5.10.1 Mean Absolute Error (MAE)
- Average absolute difference between predictions and actual values
- Interpretation: “On average, predictions are off by $X”
- Lower is better
5.10.2 Root Mean Squared Error (RMSE)
- Penalizes large errors more than MAE
- Same unit as target variable
- Lower is better
5.10.3 R² Score (Coefficient of Determination)
- Proportion of variance explained (typically 0 to 1; can be negative if the model does worse than always predicting the mean)
- Interpretation: “Model explains X% of price variation”
- Higher is better (1.0 = perfect, 0.0 = no better than predicting the mean); all three metrics are computed by hand in the sketch below
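These metrics are simple to compute directly, which makes their definitions concrete. A sketch reproducing sklearn's values for the Random Forest predictions above:
# Manual metric computation (should match the sklearn results above)
errors = y_test - y_pred_rf
mae_manual = np.abs(errors).mean()
rmse_manual = np.sqrt((errors ** 2).mean())
r2_manual = 1 - (errors ** 2).sum() / ((y_test - y_test.mean()) ** 2).sum()
print(f"MAE:  ${mae_manual:,.2f}")
print(f"RMSE: ${rmse_manual:,.2f}")
print(f"R²:   {r2_manual:.3f}")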
5.11 When to Use Each Algorithm
| Algorithm | Use When | Pros | Cons | 
|---|---|---|---|
| Linear Regression | Linear relationships, interpretability needed | Fast, interpretable | Assumes linearity | 
| Ridge | Many correlated features | Reduces overfitting | Less interpretable than linear | 
| Lasso | Need feature selection | Automatic feature selection | Can be unstable | 
| Decision Tree | Non-linear data, need interpretability | Handles non-linearity | Prone to overfitting | 
| Random Forest | Best accuracy needed | Usually best performance | Slow, black box | 
5.12 Summary
- Linear models work well for linear relationships
- Regularization (Ridge/Lasso) prevents overfitting
- Tree-based models handle non-linear relationships
- Random Forest often gives best results (ensemble power!)
- Always compare multiple algorithms on your specific data
Next: Classification algorithms!