역전파 (Backpropagation)¶

신경망의 기울기를 효율적으로 계산하는 알고리즘. 연쇄 법칙을 사용하여 출력에서 입력 방향으로 기울기를 전파함.

핵심 아이디어¶

연쇄 법칙 (Chain Rule)¶

합성 함수의 미분: $$\frac{dy}{dx} = \frac{dy}{du} \cdot \frac{du}{dx}$$

신경망에서: $$\frac{\partial L}{\partial w} = \frac{\partial L}{\partial y} \cdot \frac{\partial y}{\partial z} \cdot \frac{\partial z}{\partial w}$$

backpropagation diagram 1

계산 그래프¶

backpropagation diagram 2

단층 신경망에서의 역전파¶

수식 유도¶

Forward:
z = Wx + b
a = σ(z)
L = Loss(a, y)

Backward:
∂L/∂a = Loss'(a, y)
∂L/∂z = ∂L/∂a * σ'(z)          (활성화 기울기)
∂L/∂W = ∂L/∂z * x^T             (가중치 기울기)
∂L/∂b = ∂L/∂z                   (편향 기울기)
∂L/∂x = W^T * ∂L/∂z             (입력 기울기, 이전 층으로 전파)

코드 구현¶

import numpy as np

class Layer:
    """Fully connected (affine) layer computing ``z = x @ W + b``.

    Attributes:
        W: Weight matrix of shape (input_dim, output_dim).
        b: Bias vector of shape (output_dim,).
        x, z: Input and output of the last ``forward`` call, cached for
            ``backward``.
        dW, db: Parameter gradients filled in by ``backward``.
    """

    def __init__(self, input_dim, output_dim):
        # Small random weights, zero bias.
        self.W = np.random.randn(input_dim, output_dim) * 0.01
        self.b = np.zeros(output_dim)

        # Cache (used by backward)
        self.x = None
        self.z = None

        # Gradients
        self.dW = None
        self.db = None

    def forward(self, x):
        """Apply the affine map to a batch ``x`` and cache the input/output."""
        self.x = x
        self.z = x @ self.W + self.b
        return self.z

    def backward(self, dout):
        """Back-propagate through the affine map.

        Args:
            dout: Gradient of the loss with respect to this layer's output
                ``z``, one row per sample.

        Returns:
            Gradient of the loss with respect to the layer input ``x``.

        The incoming gradient is used as-is. Any averaging over the batch
        belongs to the loss (``CrossEntropyLoss.backward`` already divides by
        the batch size), so dividing again here would shrink the parameter
        gradients by an extra factor of 1/batch_size.
        """
        # Parameter gradients: sum of per-sample contributions
        self.dW = self.x.T @ dout
        self.db = np.sum(dout, axis=0)

        # Input gradient (propagated to the previous layer)
        dx = dout @ self.W.T

        return dx

class ReLU:
    """Element-wise rectified linear unit: ``max(0, z)``."""

    def forward(self, z):
        """Cache the pre-activation ``z`` and return it with negatives zeroed."""
        self.z = z
        rectified = np.maximum(0, z)
        return rectified

    def backward(self, dout):
        """Pass ``dout`` through where the cached input was strictly positive."""
        # Derivative of ReLU: 1 for z > 0, otherwise 0
        active = self.z > 0
        return dout * active

class Softmax:
    """Row-wise softmax over a 2-D batch of logits."""

    def forward(self, z):
        """Return softmax probabilities along axis 1 and cache them in ``self.out``."""
        # Subtract the row maximum first so np.exp cannot overflow.
        row_max = np.max(z, axis=1, keepdims=True)
        numerators = np.exp(z - row_max)
        normalizers = np.sum(numerators, axis=1, keepdims=True)
        self.out = numerators / normalizers
        return self.out

class CrossEntropyLoss:
    """Mean cross-entropy over a batch of class probabilities and integer labels."""

    def forward(self, probs, labels):
        """Return the batch-mean negative log probability of the true classes.

        ``probs`` and ``labels`` are cached for ``backward``.
        """
        self.probs, self.labels = probs, labels
        rows = np.arange(len(labels))

        # Probability assigned to the correct class of each sample;
        # 1e-7 keeps log() finite when that probability is 0.
        true_class_probs = probs[rows, labels]
        return np.mean(-np.log(true_class_probs + 1e-7))

    def backward(self):
        """Return ``(probs - one_hot(labels)) / batch_size`` for the cached batch."""
        n_samples = len(self.labels)
        grad = self.probs.copy()  # copy so the cached probabilities stay intact
        grad[np.arange(n_samples), self.labels] -= 1
        return grad / n_samples

다층 신경망에서의 역전파¶

class MLP:
    """Multi-layer perceptron: ``Layer`` -> ``ReLU`` pairs followed by a final ``Layer``.

    Args:
        layer_dims: Sizes of each layer, input first and output last,
            e.g. ``[784, 256, 128, 10]``.
    """

    def __init__(self, layer_dims):
        self.layers = []
        self.activations = []

        n_layers = len(layer_dims) - 1
        for i in range(n_layers):
            self.layers.append(Layer(layer_dims[i], layer_dims[i + 1]))
            # Every layer except the last is followed by a ReLU, so the
            # network outputs raw scores.
            if i < n_layers - 1:
                self.activations.append(ReLU())

    def forward(self, x):
        """Run ``x`` through all layers and return the final layer's output."""
        for layer, activation in zip(self.layers[:-1], self.activations):
            x = layer.forward(x)
            x = activation.forward(x)
        x = self.layers[-1].forward(x)
        return x

    def backward(self, dout):
        """Back-propagate ``dout`` (gradient w.r.t. the network output).

        Fills ``dW``/``db`` on every layer and returns the gradient with
        respect to the network input, matching what ``Layer.backward``
        returns for a single layer.
        """
        # Propagate in reverse order, starting with the activation-free last layer
        dout = self.layers[-1].backward(dout)

        for layer, activation in zip(
            reversed(self.layers[:-1]),
            reversed(self.activations)
        ):
            dout = activation.backward(dout)
            dout = layer.backward(dout)

        return dout

    def update(self, learning_rate):
        """Apply one plain gradient-descent step to every layer in place."""
        for layer in self.layers:
            layer.W -= learning_rate * layer.dW
            layer.b -= learning_rate * layer.db

# Training
# NOTE: `epochs` and `dataloader` are not defined in this snippet; they must
# be provided by the surrounding code.
model = MLP([784, 256, 128, 10])
softmax = Softmax()
criterion = CrossEntropyLoss()

for epoch in range(epochs):
    for x_batch, y_batch in dataloader:
        # Forward
        logits = model.forward(x_batch)
        probs = softmax.forward(logits)
        loss = criterion.forward(probs, y_batch)

        # Backward
        # criterion.backward() returns (probs - one_hot) / batch_size, which is
        # fed straight into the model; Softmax has no backward of its own.
        dout = criterion.backward()
        model.backward(dout)

        # Update
        model.update(learning_rate=0.01)

PyTorch Autograd¶

자동 미분 시스템.

기본 사용¶

import torch

# Enable gradient tracking
x = torch.tensor([2.0], requires_grad=True)
w = torch.tensor([3.0], requires_grad=True)
b = torch.tensor([1.0], requires_grad=True)

# Forward: z = 3 * 2 + 1 = 7, y = relu(7) = 7, loss = (7 - 5)^2 = 4
z = w * x + b
y = torch.relu(z)
loss = (y - 5) ** 2

# Backward (automatic differentiation); dL/dy = 2 * (y - 5) = 4
loss.backward()

print(f"dL/dw: {w.grad}")  # 8.0  (= dL/dy * x)
print(f"dL/db: {b.grad}")  # 4.0  (= dL/dy)
print(f"dL/dx: {x.grad}")  # 12.0 (= dL/dy * w)

계산 그래프 시각화¶

# Visualize with torchviz
# NOTE(review): `nn` is used below but not imported in this snippet;
# presumably `import torch.nn as nn` is expected - confirm.
from torchviz import make_dot

x = torch.randn(1, 10, requires_grad=True)
model = nn.Linear(10, 5)
y = model(x)
loss = y.sum()

# Label the input and the model parameters in the graph, then write graph.png
make_dot(loss, params={'x': x, **dict(model.named_parameters())}).render('graph', format='png')

기울기 제어¶

# Disable gradient tracking
with torch.no_grad():
    output = model(input)  # no gradient tracking

# Freeze specific parameters
for param in model.encoder.parameters():
    param.requires_grad = False

# Reset gradients
optimizer.zero_grad()
# or
model.zero_grad()

# Gradient accumulation
# NOTE(review): batches left over when the number of batches is not a multiple
# of accumulation_steps never reach optimizer.step() in this loop.
for i, batch in enumerate(dataloader):
    loss = model(batch)
    # Scale each loss so the accumulated gradient is the average over the group
    loss = loss / accumulation_steps
    loss.backward()

    # Step and reset once every accumulation_steps batches
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

기울기 문제¶

기울기 소실 (Vanishing Gradient)¶

깊은 네트워크에서 기울기가 0에 수렴.

원인:
- Sigmoid/Tanh의 포화 영역
- 0 < 기울기 < 1의 반복 곱

해결:
- ReLU 계열 활성화 함수
- 잔차 연결 (Residual Connection)
- 배치 정규화
- 적절한 초기화

기울기 폭발 (Exploding Gradient)¶

기울기가 무한대로 발산.

# Fix: gradient clipping
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
# or
torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)

# In the training loop: clip after backward() and before optimizer.step()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()

죽은 ReLU (Dead ReLU)¶

음수 입력에서 기울기가 0이 되어 학습 중단.

# Fix: Leaky ReLU, PReLU, ELU
nn.LeakyReLU(negative_slope=0.01)
nn.PReLU()
nn.ELU()

잔차 연결 (Residual Connection)¶

기울기가 직접 전파되는 경로 제공.

class ResidualBlock(nn.Module):
    """Two linear layers wrapped by an identity skip connection.

    Computes ``relu(linear2(relu(linear1(x))) + x)``; both linear layers map
    ``dim`` features to ``dim`` features so the sum is well-defined.
    """

    def __init__(self, dim):
        super().__init__()
        self.linear1 = nn.Linear(dim, dim)
        self.linear2 = nn.Linear(dim, dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the block output for input ``x``."""
        hidden = self.linear1(x)
        hidden = self.relu(hidden)
        branch = self.linear2(hidden)
        # Skip connection: add the untouched input back before the final ReLU
        return self.relu(branch + x)

# 기울기 흐름 (y = F(x) + x, 마지막 ReLU 생략)
# ∂L/∂x = ∂L/∂y * (∂F/∂x + 1)
# 1이 추가되어 기울기 소실 방지

이중 역전파 (Double Backprop)¶

2차 미분 계산.

x = torch.randn(10, requires_grad=True)
y = x ** 3

# First derivative; create_graph=True keeps the graph of the gradient itself
# so it can be differentiated again
grad = torch.autograd.grad(y.sum(), x, create_graph=True)[0]

# Second derivative
grad2 = torch.autograd.grad(grad.sum(), x)[0]

# Compare the analytic values with what autograd computed
print(f"dy/dx = 3x² = {3 * x ** 2}")
print(f"Computed: {grad}")
print(f"d²y/dx² = 6x = {6 * x}")
print(f"Computed: {grad2}")

커스텀 Autograd 함수¶

class CustomReLU(torch.autograd.Function):
    """ReLU written as a custom autograd Function.

    Call it through ``CustomReLU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``max(input, 0)`` and save ``input`` for the backward pass."""
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Pass ``grad_output`` through only where the saved input was > 0.

        The gradient at exactly 0 is 0, consistent with the NumPy ``ReLU``
        on this page (``z > 0``).
        """
        input, = ctx.saved_tensors
        # torch.where builds the result directly, with no clone + in-place masking
        return torch.where(input > 0, grad_output, torch.zeros_like(grad_output))

# Usage: a custom autograd Function is invoked through its .apply attribute
custom_relu = CustomReLU.apply
y = custom_relu(x)

수치적 기울기 검증¶

def numerical_gradient(f, x, eps=1e-5):
    """Estimate the gradient of ``f`` at ``x`` by central differences (for verification).

    Args:
        f: Callable that takes an array shaped like ``x`` and returns a scalar.
        x: Point at which to differentiate. Any array-like is accepted;
            integer and boolean input is converted to float64 so that the
            ``eps`` perturbation is not lost or rejected.
        eps: Half-width of the finite-difference step.

    Returns:
        Array with the shape of ``x`` whose entry ``idx`` is
        ``(f(x + eps * e_idx) - f(x - eps * e_idx)) / (2 * eps)``.

    ``x`` itself is never modified; ``f`` is called 2 * x.size times.
    """
    x = np.asarray(x)
    work_dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64

    # One private working copy, perturbed in place and restored after each
    # element, instead of two fresh copies of x per element.
    probe = np.array(x, dtype=work_dtype)
    grad = np.zeros_like(probe)

    for idx in np.ndindex(*probe.shape):
        center = probe[idx]

        probe[idx] = center + eps
        f_plus = f(probe)

        probe[idx] = center - eps
        f_minus = f(probe)

        probe[idx] = center  # restore before moving to the next element
        grad[idx] = (f_plus - f_minus) / (2 * eps)

    return grad

def gradient_check(analytic_grad, numerical_grad):
    """Compare an analytic gradient with a numerical one.

    Prints whether the largest element-wise relative error is below 1e-5 and
    returns that error.
    """
    abs_diff = np.abs(analytic_grad - numerical_grad)
    # The 1e-7 term keeps the ratio defined where both gradients are zero.
    scale = np.abs(analytic_grad) + np.abs(numerical_grad) + 1e-7
    relative_error = np.max(abs_diff / scale)

    within_tolerance = relative_error < 1e-5
    if not within_tolerance:
        print(f"Gradient check failed! Relative error: {relative_error}")
    else:
        print(f"Gradient check passed! Relative error: {relative_error}")

    return relative_error

실무 가이드¶

역전파가 중요한 이유¶

모델 학습 = 파라미터 최적화
최적화 = 기울기 필요
기울기 = 역전파로 계산

역전파 없이는:
- 수치적 미분: O(파라미터 수)번의 순전파
- GPT-3 (175B 파라미터): 실질적으로 불가능

역전파 사용:
- 1번의 순전파 + 1번의 역전파로 모든 기울기 계산

메모리 최적화 기법¶

# 1. Gradient Checkpointing
# 메모리 절약을 위해 중간 활성화를 저장하지 않고 역전파 시 재계산
from torch.utils.checkpoint import checkpoint

class EfficientTransformerBlock(nn.Module):
    """Attention + feed-forward block with optional gradient checkpointing.

    Illustrative pseudo-code: the ``...`` placeholders stand for constructor
    arguments that this snippet does not spell out.
    """

    def __init__(self, ...):
        super().__init__()
        self.attn = MultiHeadAttention(...)
        self.ffn = FeedForward(...)

    def forward(self, x, use_checkpoint=True):
        """Apply both residual sub-layers; checkpoint them only while training."""
        if use_checkpoint and self.training:
            # Recomputed during the backward pass (less memory, more compute)
            x = x + checkpoint(self.attn, x)
            x = x + checkpoint(self.ffn, x)
        else:
            # Plain residual path, used when checkpointing is off or in eval mode
            x = x + self.attn(x)
            x = x + self.ffn(x)
        return x

# 2. Mixed Precision Training
# Activations are kept in FP16, roughly halving activation memory
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()
with autocast():
    # The forward pass and the loss run under autocast
    output = model(input)
    loss = criterion(output, target)
# Scale the loss so small FP16 gradients do not underflow to zero
scaler.scale(loss).backward()
# step() unscales the gradients and skips the update if they contain inf/NaN;
# update() then adjusts the scale factor for the next iteration.
scaler.step(optimizer)
scaler.update()

기울기 흐름 모니터링¶

class GradientMonitor:
    """Record and inspect per-parameter gradient statistics during training.

    Each ``log_gradients`` call appends one snapshot to ``grad_history``;
    ``plot_gradient_flow`` and ``check_health`` look at the latest snapshot.
    ``plot_gradient_flow`` expects ``matplotlib.pyplot`` to be available as
    ``plt`` in the enclosing module.
    """

    def __init__(self, model):
        self.model = model
        self.grad_history = []

    def log_gradients(self):
        """Snapshot the current gradients.

        Returns:
            Dict mapping parameter name to ``mean``/``std``/``max``/``norm``
            for every parameter whose ``.grad`` is not None. The same dict is
            appended to ``grad_history``.
        """
        grad_info = {}
        for name, param in self.model.named_parameters():
            if param.grad is not None:
                grad = param.grad
                grad_info[name] = {
                    'mean': grad.mean().item(),
                    # std of a single element is NaN; report 0.0 instead
                    'std': grad.std().item() if grad.numel() > 1 else 0.0,
                    'max': grad.abs().max().item(),
                    'norm': grad.norm().item()
                }
        self.grad_history.append(grad_info)
        return grad_info

    def plot_gradient_flow(self, layer_filter=None):
        """Bar-plot the gradient norms of the latest snapshot.

        Args:
            layer_filter: If given, only parameters whose name contains this
                substring are plotted.
        """
        if not self.grad_history:
            print("No gradient history")
            return

        latest = self.grad_history[-1]
        names = []
        norms = []

        for name, info in latest.items():
            if layer_filter and layer_filter not in name:
                continue
            names.append(name.replace('.weight', '').replace('.bias', ''))
            norms.append(info['norm'])

        plt.figure(figsize=(12, 6))
        plt.bar(range(len(names)), norms)
        plt.xticks(range(len(names)), names, rotation=45, ha='right')
        plt.xlabel('Layer')
        plt.ylabel('Gradient Norm')
        plt.title('Gradient Flow')
        plt.tight_layout()
        plt.show()

    def check_health(self):
        """Flag suspicious gradients in the latest snapshot.

        Returns:
            A list of message strings in every case, so callers can iterate
            over the result: ``["No history"]`` before any snapshot exists,
            ``["All gradients healthy"]`` when nothing is flagged, otherwise
            one entry per issue found.
        """
        if not self.grad_history:
            return ["No history"]

        latest = self.grad_history[-1]
        issues = []

        for name, info in latest.items():
            if info['norm'] < 1e-7:
                issues.append(f"[VANISHING] {name}: norm={info['norm']:.2e}")
            if info['norm'] > 100:
                issues.append(f"[EXPLODING] {name}: norm={info['norm']:.2e}")
            if info['max'] > 10:
                issues.append(f"[LARGE] {name}: max={info['max']:.2e}")

        return issues if issues else ["All gradients healthy"]

# Usage
monitor = GradientMonitor(model)

# Global batch counter across epochs; drives the periodic health check
step = 0

for epoch in range(num_epochs):
    for batch in train_loader:
        loss = train_step(batch)

        # Record gradients
        grad_info = monitor.log_gradients()

        # Check periodically
        if step % 100 == 0:
            issues = monitor.check_health()
            for issue in issues:
                print(issue)

        step += 1

디버깅 가이드¶

기울기가 0인 경우¶

def debug_zero_gradients(model, loss):
    """Find parameters whose gradient is missing or effectively zero.

    Calls ``loss.backward()``, then inspects every named parameter of
    ``model``. Prints a report and returns a list of ``(name, reason)``
    tuples; the list is empty when every parameter has a non-zero gradient.
    """
    loss.backward()

    zero_grad_params = []
    for name, param in model.named_parameters():
        grad = param.grad
        if grad is None:
            reason = "grad is None - not in computation graph"
        elif grad.abs().max() < 1e-10:
            reason = "grad is effectively zero"
        else:
            continue
        zero_grad_params.append((name, reason))

    # Nothing suspicious: report and return early
    if not zero_grad_params:
        print("모든 파라미터에 기울기 존재")
        return zero_grad_params

    print("=== 기울기가 0인 파라미터 ===")
    for name, reason in zero_grad_params:
        print(f"  {name}: {reason}")
    print("\n가능한 원인:")
    print("  1. requires_grad=False로 설정됨")
    print("  2. 계산 그래프에서 분리됨 (.detach() 또는 .data 사용)")
    print("  3. 죽은 ReLU (모든 입력이 음수)")
    print("  4. Softmax 후 원-핫 타겟과 정확히 일치")

    return zero_grad_params

# Remedies
# 1. Check requires_grad
for name, param in model.named_parameters():
    print(f"{name}: requires_grad={param.requires_grad}")

# 2. Remove .detach() (where the gradient is needed)
# Wrong: output = model(input).detach()
# Right: output = model(input)

# 3. Be careful with in-place operations
# Wrong: x.relu_()  # inplace
# Right: x = torch.relu(x)

기울기 폭발 디버깅¶

def debug_exploding_gradients(model, max_norm=1.0):
    """Print the parameters with the largest gradient norms and suggest fixes.

    Args:
        model: Module whose parameters already hold gradients (i.e. after
            ``backward()``).
        max_norm: Reference norm. A gradient is labelled HIGH above
            ``max_norm`` and EXPLODING above ``10 * max_norm``.

    Prints the ten largest norms, plus recommendations when the largest one
    is EXPLODING. Returns None.
    """
    grad_norms = {}
    for name, param in model.named_parameters():
        if param.grad is not None:
            norm = param.grad.norm().item()
            grad_norms[name] = norm

    # Without this guard, sorted_grads[0] below raises IndexError when no
    # parameter has a gradient yet.
    if not grad_norms:
        print("기울기가 계산된 파라미터가 없음 (loss.backward() 호출 여부 확인)")
        return

    # Sort to find the largest gradients
    sorted_grads = sorted(grad_norms.items(), key=lambda x: x[1], reverse=True)

    print("=== 기울기 크기 순위 ===")
    for i, (name, norm) in enumerate(sorted_grads[:10]):
        status = "EXPLODING" if norm > max_norm * 10 else "HIGH" if norm > max_norm else "OK"
        print(f"{i+1}. {name}: {norm:.4f} [{status}]")

    # Recommendations, shown only when the worst gradient is exploding
    if sorted_grads[0][1] > max_norm * 10:
        print("\n=== 권장 사항 ===")
        print("1. 학습률 낮추기 (현재의 1/10)")
        print("2. Gradient Clipping 적용:")
        print("   torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)")
        print("3. 초기화 확인 (He 또는 Xavier)")
        print("4. BatchNorm/LayerNorm 추가")

# 자동 복구
class SafeOptimizer:
    """Optimizer wrapper that clips gradients and reacts to repeated explosions.

    Args:
        optimizer: The wrapped optimizer; its ``param_groups`` learning rates
            are halved after ``patience`` consecutive explosions.
        max_grad_norm: Clipping threshold. A pre-clip total norm above
            ``10 * max_grad_norm`` counts as an explosion.
        patience: Number of consecutive explosions tolerated before the
            learning rate is halved.
    """

    def __init__(self, optimizer, max_grad_norm=1.0, patience=5):
        self.optimizer = optimizer
        self.max_grad_norm = max_grad_norm
        self.patience = patience
        self.explosion_count = 0

    def step(self, model):
        """Clip the gradients of ``model``, update the explosion counter, then step.

        Returns the total gradient norm reported by ``clip_grad_norm_``.
        """
        # Gradient clipping
        total_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(), self.max_grad_norm
        )

        # Explosion detection
        exploded = total_norm > self.max_grad_norm * 10
        if not exploded:
            # A healthy step breaks the streak
            self.explosion_count = 0
        else:
            self.explosion_count += 1
            print(f"[WARNING] Gradient explosion detected: {total_norm:.4f}")

            if self.explosion_count >= self.patience:
                # Reduce the learning rate of every parameter group
                for group in self.optimizer.param_groups:
                    group['lr'] *= 0.5
                print("[ACTION] Learning rate halved")
                self.explosion_count = 0

        self.optimizer.step()
        return total_norm

계산 그래프 디버깅¶

# 계산 그래프 시각화
def visualize_computation_graph(output, model_params):
    """Render the autograd graph of ``output`` to computation_graph.png.

    ``model_params`` is an iterable of ``(name, parameter)`` pairs used to
    label graph nodes. Requires torchviz; when it cannot be imported, an
    installation hint is printed instead.
    """
    try:
        from torchviz import make_dot
        named_params = dict(model_params)
        graph = make_dot(output, params=named_params)
        # cleanup=True is passed through to render() as in the original call
        graph.render('computation_graph', format='png', cleanup=True)
        print("계산 그래프가 computation_graph.png로 저장됨")
    except ImportError:
        print("torchviz 설치 필요: pip install torchviz")

# 역전파 단계별 추적
def trace_backward(model, input, target, criterion):
    """Trace gradient norms through each leaf module during one backward pass.

    Registers a full backward hook on every leaf module of ``model``, runs
    forward + ``criterion`` + backward once, removes the hooks, and prints
    the recorded norms.

    Args:
        model: Module to trace.
        input: Batch passed to ``model``.
        target: Target passed to ``criterion`` together with the model output.
        criterion: Callable ``criterion(output, target)`` returning the loss.

    Returns:
        List of dicts with keys ``module``, ``grad_out`` and ``grad_in``, in
        the order the hooks fired.
    """
    # Track the gradient flow with hooks
    grad_flow = []

    def _first_norm(grads):
        # Norm of the first gradient in the tuple; 0 when it is missing
        # (e.g. an input that does not require grad).
        if not grads or grads[0] is None:
            return 0
        return grads[0].norm().item()

    def backward_hook(module, grad_input, grad_output):
        grad_flow.append({
            'module': module.__class__.__name__,
            'grad_out': _first_norm(grad_output),
            'grad_in': _first_norm(grad_input)
        })

    hooks = [
        module.register_full_backward_hook(backward_hook)
        for module in model.modules()
        if len(list(module.children())) == 0  # leaf modules only
    ]

    try:
        output = model(input)
        loss = criterion(output, target)
        loss.backward()
    finally:
        # Always detach the hooks, even if forward/backward raises, so they
        # do not stay attached to the model.
        for h in hooks:
            h.remove()

    # Hooks fire as autograd walks back from the loss, so grad_flow is already
    # in output -> input order; reversing it would contradict the header.
    print("=== 역전파 흐름 (출력 → 입력) ===")
    for i, info in enumerate(grad_flow):
        print(f"{i}. {info['module']}: grad_out={info['grad_out']:.6f} → grad_in={info['grad_in']:.6f}")

    return grad_flow