# Multi-Layer Perceptron & Backpropagation

This notebook demonstrates:
- MLP architecture and forward propagation
- Activation functions and their derivatives
- Backpropagation algorithm step-by-step
- Training a simple MLP on a synthetic two-moons dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons, make_circles
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global RNG so every np.random draw made after this point is
# repeatable from run to run.
np.random.seed(42)

# Use Matplotlib's bundled seaborn-like dark-grid style for all figures.
plt.style.use('seaborn-v0_8-darkgrid')

## 1. Activation Functions and Their Derivatives

Understanding activation functions is crucial for neural networks:
- **Sigmoid**: $\sigma(z) = \frac{1}{1 + e^{-z}}$
- **Tanh**: $\tanh(z) = \frac{e^z - e^{-z}}{e^z + e^{-z}}$
- **ReLU**: $\text{ReLU}(z) = \max(0, z)$

In [None]:
# Activation functions and derivatives
def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, applied element-wise.

    The input is clipped to [-500, 500] before exponentiation, which keeps
    ``np.exp`` inside the float64 range for arbitrarily large inputs.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def sigmoid_derivative(z):
    """Derivative of the sigmoid at ``z``: ``sigmoid(z) * (1 - sigmoid(z))``."""
    activation = sigmoid(z)
    complement = 1 - activation
    return activation * complement

def tanh(z):
    """Element-wise hyperbolic tangent; a thin wrapper around ``np.tanh``."""
    activation = np.tanh(z)
    return activation

def tanh_derivative(z):
    """Derivative of tanh at ``z``: ``1 - tanh(z)**2``, element-wise."""
    tanh_z = np.tanh(z)
    return 1 - tanh_z ** 2

def relu(z):
    """Rectified linear unit: element-wise ``max(0, z)``."""
    floor = 0
    return np.maximum(floor, z)

def relu_derivative(z):
    """Derivative of ReLU: 1.0 where ``z > 0`` and 0.0 elsewhere.

    The value at exactly ``z == 0`` is taken to be 0.0.

    Args:
        z: A NumPy array, a Python scalar, or a nested sequence of numbers.
            The input is coerced with ``np.asarray`` so that plain scalars
            and lists work, as they do for the other activation helpers.

    Returns:
        A float array (or NumPy float scalar for scalar input) with the
        same shape as ``z``.
    """
    # A bare Python comparison such as ``3.0 > 0`` yields a ``bool`` with no
    # ``.astype``; converting first guarantees a NumPy result.
    positive = np.asarray(z) > 0
    return positive.astype(float)

# Visualize activation functions: each column shows one activation (top row)
# above its derivative (bottom row).
z = np.linspace(-5, 5, 100)

fig, axes = plt.subplots(2, 3, figsize=(15, 8))

# One entry per column, holding everything that differs between panels.
# 'guides' lists the y-values of the dashed horizontal reference lines drawn
# on the function panel.
activation_panels = [
    {
        'func': sigmoid,
        'func_fmt': 'b-',
        'func_label': 'sigmoid(z)',
        'func_title': 'Sigmoid Function',
        'func_ylabel': 'σ(z)',
        'guides': (0, 1),
        'deriv': sigmoid_derivative,
        'deriv_fmt': 'r-',
        'deriv_label': "σ'(z)",
        'deriv_title': 'Sigmoid Derivative',
    },
    {
        'func': tanh,
        'func_fmt': 'g-',
        'func_label': 'tanh(z)',
        'func_title': 'Tanh Function',
        'func_ylabel': 'tanh(z)',
        'guides': (0, 1, -1),
        'deriv': tanh_derivative,
        'deriv_fmt': 'orange',
        'deriv_label': "tanh'(z)",
        'deriv_title': 'Tanh Derivative',
    },
    {
        'func': relu,
        'func_fmt': 'm-',
        'func_label': 'ReLU(z)',
        'func_title': 'ReLU Function',
        'func_ylabel': 'ReLU(z)',
        'guides': (0,),
        'deriv': relu_derivative,
        'deriv_fmt': 'c-',
        'deriv_label': "ReLU'(z)",
        'deriv_title': 'ReLU Derivative',
    },
]

for col, panel in enumerate(activation_panels):
    func_ax = axes[0, col]
    deriv_ax = axes[1, col]

    # Top row: the activation itself plus its reference lines.
    func_ax.plot(z, panel['func'](z), panel['func_fmt'], linewidth=2,
                 label=panel['func_label'])
    for level in panel['guides']:
        func_ax.axhline(y=level, color='k', linestyle='--', alpha=0.3)
    func_ax.set_title(panel['func_title'], fontsize=12, fontweight='bold')
    func_ax.set_xlabel('z')
    func_ax.set_ylabel(panel['func_ylabel'])
    func_ax.grid(True, alpha=0.3)
    func_ax.legend()

    # Bottom row: the derivative; its legend label doubles as the y-label.
    deriv_ax.plot(z, panel['deriv'](z), panel['deriv_fmt'], linewidth=2,
                  label=panel['deriv_label'])
    deriv_ax.set_title(panel['deriv_title'], fontsize=12, fontweight='bold')
    deriv_ax.set_xlabel('z')
    deriv_ax.set_ylabel(panel['deriv_label'])
    deriv_ax.grid(True, alpha=0.3)
    deriv_ax.legend()

plt.tight_layout()
plt.show()

# Summarise what the plots above show, one line per activation.
for observation in (
    "Key Observations:",
    "- Sigmoid: Saturates at 0 and 1, derivative max at 0.25",
    "- Tanh: Centered at 0, saturates at -1 and 1, stronger gradients",
    "- ReLU: No saturation for positive values, dead neurons for negative",
):
    print(observation)

## 2. Simple MLP Implementation

A 2-layer MLP with:
- Input layer: 2 features
- Hidden layer: 4 neurons with tanh activation
- Output layer: 1 neuron with sigmoid activation

In [None]:
class SimpleMLP:
    """Minimal two-layer perceptron for binary classification.

    Architecture: input -> dense + tanh -> dense + sigmoid. Samples are
    stored one per row, so a batch ``X`` has shape (n_samples, input_size)
    and the network output has shape (n_samples, output_size).

    Relies on the module-level ``tanh``, ``tanh_derivative`` and ``sigmoid``
    helpers.
    """

    def __init__(self, input_size=2, hidden_size=4, output_size=1):
        """Draw random weights and zero biases for both layers.

        Weights are drawn from NumPy's global RNG, so call
        ``np.random.seed`` beforehand for reproducible results.
        """
        # He-style scaling: std = sqrt(2 / fan_in). (Xavier/Glorot scaling
        # would use sqrt(1 / fan_in) or sqrt(2 / (fan_in + fan_out)).)
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))

        # Pre-activations (Z1, Z2) and activations (A1, A2) of the most
        # recent forward pass; backward() reads them.
        self.cache = {}

    @staticmethod
    def _as_column(y):
        """Return ``y`` as an array, turning a flat (n,) vector into (n, 1).

        Without this, subtracting or comparing flat labels against the
        (n, 1) network output would broadcast to an (n, n) matrix and give
        a wrong loss and accuracy instead of failing.
        """
        y = np.asarray(y)
        return y.reshape(-1, 1) if y.ndim == 1 else y

    def forward(self, X):
        """Run a forward pass and return the sigmoid output ``A2``.

        Side effect: overwrites ``self.cache`` with this batch's Z1, A1,
        Z2 and A2.
        """
        # Layer 1: Input -> Hidden
        self.cache['Z1'] = np.dot(X, self.W1) + self.b1
        self.cache['A1'] = tanh(self.cache['Z1'])

        # Layer 2: Hidden -> Output
        self.cache['Z2'] = np.dot(self.cache['A1'], self.W2) + self.b2
        self.cache['A2'] = sigmoid(self.cache['Z2'])

        return self.cache['A2']

    def backward(self, X, y, learning_rate=0.01):
        """Backpropagate and apply one gradient-descent step in place.

        ``forward(X)`` must have been called on the same batch first,
        because the gradients are built from the cached activations.

        Args:
            X: Input batch, one sample per row.
            y: Targets, shaped (n, 1) or flat (n,).
            learning_rate: Step size of the parameter update.

        Returns:
            The tuple ``(dW2, db2, dW1, db1)`` of gradients that were
            applied.
        """
        y = self._as_column(y)
        m = X.shape[0]

        # Output layer: for a sigmoid output with binary cross-entropy the
        # gradient w.r.t. Z2 simplifies to (prediction - target).
        dZ2 = self.cache['A2'] - y
        dW2 = (1/m) * np.dot(self.cache['A1'].T, dZ2)
        db2 = (1/m) * np.sum(dZ2, axis=0, keepdims=True)

        # Hidden layer: push the error back through W2, then through tanh.
        dA1 = np.dot(dZ2, self.W2.T)
        dZ1 = dA1 * tanh_derivative(self.cache['Z1'])
        dW1 = (1/m) * np.dot(X.T, dZ1)
        db1 = (1/m) * np.sum(dZ1, axis=0, keepdims=True)

        # Update parameters (all gradients were computed from the old weights)
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1

        return dW2, db2, dW1, db1

    def compute_loss(self, y_pred, y_true):
        """Return the mean binary cross-entropy of ``y_pred`` vs ``y_true``.

        ``y_true`` may be shaped (n, 1) or flat (n,). A small constant is
        added inside each logarithm so predictions of exactly 0 or 1 do not
        produce ``log(0)``.
        """
        y_true = self._as_column(y_true)
        m = y_true.shape[0]
        loss = -(1/m) * np.sum(y_true * np.log(y_pred + 1e-8) +
                               (1 - y_true) * np.log(1 - y_pred + 1e-8))
        return loss

    def train(self, X, y, epochs=1000, learning_rate=0.1, verbose=True):
        """Train with full-batch gradient descent.

        Args:
            X: Training inputs, one sample per row.
            y: Training targets, shaped (n, 1) or flat (n,).
            epochs: Number of passes over the whole batch.
            learning_rate: Step size passed to ``backward``.
            verbose: When true, print loss and accuracy every 100 epochs
                and on the last epoch.

        Returns:
            A list with the loss of every epoch, each measured before that
            epoch's parameter update.
        """
        y = self._as_column(y)
        losses = []

        for epoch in range(epochs):
            # Forward pass
            y_pred = self.forward(X)

            # Compute loss
            loss = self.compute_loss(y_pred, y)
            losses.append(loss)

            # Backward pass
            self.backward(X, y, learning_rate)

            if verbose and (epoch % 100 == 0 or epoch == epochs - 1):
                accuracy = np.mean((y_pred > 0.5) == y)
                print(f"Epoch {epoch:4d}: Loss = {loss:.4f}, Accuracy = {accuracy:.4f}")

        return losses


print("SimpleMLP class defined successfully!")

## 3. Training on Moons Dataset

Let's train our MLP on a non-linearly separable dataset

In [None]:
# Generate moons dataset
X_raw, y = make_moons(n_samples=300, noise=0.2, random_state=42)
y = y.reshape(-1, 1)  # column vector, matching the (n, 1) network output

# Split BEFORE scaling so the scaler never sees the test samples; fitting it
# on the full dataset would leak test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# Standardize features using training-set statistics only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full dataset in the same scaled space (later cells read X for plot extents)
X = scaler.transform(X_raw)

# Visualize dataset: training split on the left, test split on the right
plt.figure(figsize=(10, 4))

for position, (title, features, labels) in enumerate(
        [('Training Data', X_train, y_train), ('Test Data', X_test, y_test)],
        start=1):
    flat_labels = labels.ravel()
    is_class0 = flat_labels == 0
    is_class1 = flat_labels == 1

    plt.subplot(1, 2, position)
    plt.scatter(features[is_class0, 0], features[is_class0, 1],
                c='blue', label='Class 0', alpha=0.6, edgecolors='k')
    plt.scatter(features[is_class1, 0], features[is_class1, 1],
                c='red', label='Class 1', alpha=0.6, edgecolors='k')
    plt.title(title, fontsize=14, fontweight='bold')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

In [None]:
# Fit an 8-unit network on the training split, printing progress as it goes.
mlp = SimpleMLP(input_size=2, hidden_size=8, output_size=1)
losses = mlp.train(
    X_train,
    y_train,
    epochs=2000,
    learning_rate=0.5,
    verbose=True,
)

# Score on the held-out split: threshold the predicted probabilities at 0.5
# and take the fraction that match the true labels.
y_test_pred = mlp.forward(X_test)
test_accuracy = ((y_test_pred > 0.5) == y_test).mean()
print(f"\nTest Accuracy: {test_accuracy:.4f}")

## 4. Visualizing Training Progress and Decision Boundary

In [None]:
# Three panels: full loss curve, loss with the first epochs cut off, and the
# learned decision boundary.
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.plot(losses, linewidth=2)
plt.title('Training Loss Over Time', fontsize=14, fontweight='bold')
plt.xlabel('Epoch')
plt.ylabel('Binary Cross-Entropy Loss')
plt.grid(True, alpha=0.3)

plt.subplot(1, 3, 2)
# Pass the true epoch numbers as x-values; plotting the slice on its own
# would number the points from 0 and mislabel the "Epoch" axis by 100.
skipped_epochs = 100
plt.plot(range(skipped_epochs, len(losses)), losses[skipped_epochs:],
         linewidth=2, color='orange')
plt.title('Training Loss (After Epoch 100)', fontsize=14, fontweight='bold')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)

# Decision boundary: evaluate the model on a dense grid that covers the data
# with a 0.5 margin on every side.
plt.subplot(1, 3, 3)
h = 0.02  # grid step
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# NOTE: forward() overwrites mlp.cache with the activations of the grid points.
Z = mlp.forward(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, levels=20, cmap='RdBu', alpha=0.6)
plt.colorbar(label='Prediction Probability')
plt.scatter(X_test[y_test.ravel()==0, 0], X_test[y_test.ravel()==0, 1],
            c='blue', label='Class 0', edgecolors='k', s=60)
plt.scatter(X_test[y_test.ravel()==1, 0], X_test[y_test.ravel()==1, 1],
            c='red', label='Class 1', edgecolors='k', s=60)
plt.title('Decision Boundary', fontsize=14, fontweight='bold')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()

plt.tight_layout()
plt.show()

## 5. Gradient Flow Visualization

Understanding how gradients flow through the network

In [None]:
# Train a fresh model and track gradient magnitudes
mlp_grad = SimpleMLP(input_size=2, hidden_size=8, output_size=1)

grad_magnitudes_W1 = []
grad_magnitudes_W2 = []

for epoch in range(500):
    # The forward pass refreshes the cached activations that backward() needs.
    y_pred = mlp_grad.forward(X_train)
    dW2, db2, dW1, db1 = mlp_grad.backward(X_train, y_train, learning_rate=0.5)

    # Track gradient magnitudes (Frobenius norm of each weight gradient)
    grad_magnitudes_W1.append(np.linalg.norm(dW1))
    grad_magnitudes_W2.append(np.linalg.norm(dW2))

# Plot gradient magnitudes
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(grad_magnitudes_W1, label='Layer 1 (Input → Hidden)', linewidth=2)
plt.plot(grad_magnitudes_W2, label='Layer 2 (Hidden → Output)', linewidth=2)
plt.title('Gradient Magnitude Over Training', fontsize=14, fontweight='bold')
plt.xlabel('Epoch')
plt.ylabel('Gradient Norm')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
# Use the true epoch numbers as x-values; plotting the slices alone would
# number the points from 0 and mislabel the "Epoch" axis by 50.
skipped_epochs = 50
tail_epochs = range(skipped_epochs, len(grad_magnitudes_W1))
plt.plot(tail_epochs, grad_magnitudes_W1[skipped_epochs:], label='Layer 1', linewidth=2)
plt.plot(tail_epochs, grad_magnitudes_W2[skipped_epochs:], label='Layer 2', linewidth=2)
plt.title('Gradient Magnitude (After Epoch 50)', fontsize=14, fontweight='bold')
plt.xlabel('Epoch')
plt.ylabel('Gradient Norm')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Gradient flow analysis:")
print(f"Final W1 gradient magnitude: {grad_magnitudes_W1[-1]:.6f}")
print(f"Final W2 gradient magnitude: {grad_magnitudes_W2[-1]:.6f}")

## 6. Comparing Different Hidden Layer Sizes

Effect of network capacity on learning

In [None]:
# Compare different hidden layer sizes
hidden_sizes = [2, 4, 8, 16, 32]
results = []

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

# The evaluation grid depends only on the data extent, not on the model, so
# build it once instead of once per hidden size.
h = 0.02  # grid step
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
grid_points = np.c_[xx.ravel(), yy.ravel()]

class0_mask = y_test.ravel() == 0
class1_mask = y_test.ravel() == 1

for idx, h_size in enumerate(hidden_sizes):
    mlp_temp = SimpleMLP(input_size=2, hidden_size=h_size, output_size=1)
    losses_temp = mlp_temp.train(X_train, y_train, epochs=1000, learning_rate=0.5, verbose=False)

    y_test_pred = mlp_temp.forward(X_test)
    test_acc = np.mean((y_test_pred > 0.5) == y_test)
    results.append({'hidden_size': h_size, 'test_accuracy': test_acc, 'final_loss': losses_temp[-1]})

    # Plot decision boundary
    Z = mlp_temp.forward(grid_points)
    Z = Z.reshape(xx.shape)

    axes[idx].contourf(xx, yy, Z, levels=20, cmap='RdBu', alpha=0.6)
    axes[idx].scatter(X_test[class0_mask, 0], X_test[class0_mask, 1],
                      c='blue', edgecolors='k', s=40, alpha=0.8)
    axes[idx].scatter(X_test[class1_mask, 0], X_test[class1_mask, 1],
                      c='red', edgecolors='k', s=40, alpha=0.8)
    axes[idx].set_title(f'Hidden Size = {h_size}\nAccuracy = {test_acc:.3f}',
                        fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('Feature 1')
    axes[idx].set_ylabel('Feature 2')

# Hide every subplot that did not receive a model, rather than hard-coding
# the index of the single spare panel.
for unused_ax in axes[len(hidden_sizes):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

# Print results
print("\nComparison of Hidden Layer Sizes:")
print("="*50)
for r in results:
    print(f"Hidden Size: {r['hidden_size']:2d} | Test Acc: {r['test_accuracy']:.4f} | Loss: {r['final_loss']:.4f}")

## 7. Backpropagation Step-by-Step Example

Manual calculation for a single training example

In [None]:
# Walk through one forward and one backward pass by hand on a tiny 2-2-1
# network, printing every intermediate quantity.
# NOTE(review): this cell rebinds the notebook-level name `y` (the dataset
# labels assigned in section 3) to a single target value.
banner = "=" * 60
print("Manual Backpropagation Calculation")
print(banner)

# Hand-picked small parameters so the printed numbers are easy to follow
W1 = np.array([[0.5, 0.2], [0.1, 0.3]])
W2 = np.array([[0.4], [0.6]])
b1 = np.array([[0.1, 0.2]])
b2 = np.array([[0.1]])

# One training example and its target
x = np.array([[1.0, 0.5]])
y = np.array([[1.0]])

print("\nInput: x =", x[0])
print("Target: y =", y[0, 0])

# ---- Forward pass: input -> tanh hidden layer -> sigmoid output ----
print("\n--- Forward Pass ---")
Z1 = np.dot(x, W1) + b1
print(f"Z1 = x @ W1 + b1 = {Z1[0]}")

A1 = tanh(Z1)
print(f"A1 = tanh(Z1) = {A1[0]}")

Z2 = np.dot(A1, W2) + b2
print(f"Z2 = A1 @ W2 + b2 = {Z2[0, 0]:.4f}")

A2 = sigmoid(Z2)
print(f"A2 = sigmoid(Z2) = {A2[0, 0]:.4f}")

# Binary cross-entropy, written as its two terms
positive_term = y * np.log(A2 + 1e-8)
negative_term = (1 - y) * np.log(1 - A2 + 1e-8)
loss = -(positive_term + negative_term)
print(f"\nLoss = {loss[0, 0]:.4f}")

# ---- Backward pass: chain rule from the output back to the input ----
print("\n--- Backward Pass ---")
dZ2 = A2 - y
print(f"dZ2 = A2 - y = {dZ2[0, 0]:.4f}")

dW2 = np.dot(A1.T, dZ2)
print(f"dW2 = A1.T @ dZ2 = {dW2.ravel()}")

# With a single example the bias gradient equals dZ2 itself
db2 = dZ2
print(f"db2 = {db2[0, 0]:.4f}")

dA1 = np.dot(dZ2, W2.T)
print(f"dA1 = dZ2 @ W2.T = {dA1[0]}")

dZ1 = dA1 * tanh_derivative(Z1)
print(f"dZ1 = dA1 * tanh'(Z1) = {dZ1[0]}")

dW1 = np.dot(x.T, dZ1)
print(f"dW1 = x.T @ dZ1 = \n{dW1}")

db1 = dZ1
print(f"db1 = {db1[0]}")

print("\n" + banner)
print("Gradients computed successfully!")

## 8. Key Takeaways

1. **Forward Pass**: Compute activations layer by layer
 - $Z^{(l)} = A^{(l-1)} W^{(l)} + b^{(l)}$ (samples are stored as rows, matching `np.dot(A, W) + b` in the code)
 - $A^{(l)} = \sigma(Z^{(l)})$

2. **Backward Pass**: Compute gradients using chain rule
 - Start from output: $dZ^{(L)} = A^{(L)} - y$ (this simple form holds for a sigmoid output trained with binary cross-entropy loss)
 - Propagate back: $dZ^{(l)} = dA^{(l)} \odot \sigma'(Z^{(l)})$

3. **Activation Functions**:
 - Sigmoid/Tanh: Can saturate, causing vanishing gradients
 - ReLU: Better gradient flow, but can have dead neurons

4. **Network Capacity**: More hidden neurons → more complex decision boundaries

5. **Gradient Magnitude**: Monitor to detect vanishing/exploding gradients