Self-Supervised Reinforcement Learning¶

개요¶

NeurIPS 2025 Best Paper로 선정된 Self-Supervised RL 연구. 외부 보상 없이 내재적 동기(intrinsic motivation)만으로 복잡한 행동을 학습하는 방법론을 제시한다.

핵심 개념¶

기존 RL의 한계¶

┌─────────────────────────────────────────────────────────────┐
│                    기존 RL                                   │
│                                                             │
│  Agent ──▶ Action ──▶ Environment ──▶ Reward ──▶ Agent     │
│                            │                                │
│                            ▼                                │
│                    [사람이 설계한 보상]                       │
│                    - 보상 해킹                              │
│                    - 희소 보상 문제                          │
│                    - 일반화 불가                            │
└─────────────────────────────────────────────────────────────┘

┌─────────────────────────────────────────────────────────────┐
│                    Self-Supervised RL                        │
│                                                             │
│  Agent ──▶ Action ──▶ Environment                           │
│    │                      │                                 │
│    │                      ▼                                 │
│    │              [자기 생성 보상]                            │
│    │              - 호기심 (Curiosity)                       │
│    │              - 엔트로피 (Diversity)                     │
│    │              - 예측 오류                               │
│    │                      │                                 │
│    └──────────◀───────────┘                                 │
└─────────────────────────────────────────────────────────────┘

내재적 보상 유형¶

유형	아이디어	수식
Curiosity	예측 불확실성 탐색	$r = \lVert f(s') - \hat{f}(s') \rVert$
Empowerment	상태 제어 능력 최대화	$r = I(a; s' \mid s)$
Entropy	상태 방문 다양성	$r = H(s)$
Competence	스킬 습득	$r = D_{\mathrm{KL}}\big(p(z \mid s) \,\Vert\, p(z)\big)$

프레임워크¶

1. Random Network Distillation (RND)¶

import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNDNetwork(nn.Module):
    """Curiosity signal via Random Network Distillation.

    A frozen, randomly initialised ``target`` network embeds each observation
    and a trainable ``predictor`` network tries to reproduce that embedding.
    The per-observation prediction error is exposed as the intrinsic reward.
    """

    def __init__(self, obs_dim, hidden_dim=256, output_dim=128):
        super().__init__()

        # Fixed random target: built first, then frozen so it never trains.
        self.target = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )
        self.target.requires_grad_(False)

        # Trainable predictor (one hidden layer deeper than the target).
        predictor_layers = [
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.predictor = nn.Sequential(*predictor_layers)

    def forward(self, obs):
        """Return ``(target_features, predicted_features)`` for ``obs``."""
        return self.target(obs), self.predictor(obs)

    def intrinsic_reward(self, obs):
        """Return the squared prediction error averaged over the feature axis.

        The result has one value per leading index of ``obs`` (the last,
        feature dimension is reduced away).
        """
        frozen_out, learned_out = self.forward(obs)

        # Element-wise squared error; larger error is treated as more novel.
        squared_error = F.mse_loss(learned_out, frozen_out, reduction='none')
        return squared_error.mean(dim=-1)

2. ICM (Intrinsic Curiosity Module)¶

class ICM(nn.Module):
    """Intrinsic Curiosity Module.

    Encodes observations into a feature space, then trains two heads on it:
    an inverse model predicting the action from ``(phi(s), phi(s'))`` and a
    forward model predicting ``phi(s')`` from ``(phi(s), a)``. The forward
    model's prediction error is used as the intrinsic reward.

    Args:
        obs_dim: Size of the observation vector.
        action_dim: Number of discrete actions (width of the one-hot action
            encoding fed to the forward model).
        hidden_dim: Width of the hidden layers and of the feature space.
    """

    def __init__(self, obs_dim, action_dim, hidden_dim=256):
        super().__init__()
        # Needed by forward() to one-hot encode actions; the original never
        # stored it, so forward() raised AttributeError.
        self.action_dim = action_dim

        # State encoder
        self.encoder = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

        # Inverse model: (s, s') -> predicted action logits
        self.inverse_model = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim)
        )

        # Forward model: (s, a) -> predicted features of s'
        self.forward_model = nn.Sequential(
            nn.Linear(hidden_dim + action_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

    def forward(self, obs, action, next_obs):
        """Run the encoder and both heads.

        Args:
            obs: Current observations.
            action: Integer action indices (they are one-hot encoded here).
            next_obs: Observations that followed ``action``.

        Returns:
            Tuple ``(pred_action, pred_phi_s_next, phi_s_next)``: inverse-model
            output, forward-model prediction, and encoded next observation.
        """
        # Encode both states with the shared encoder
        phi_s = self.encoder(obs)
        phi_s_next = self.encoder(next_obs)

        # Inverse model (action prediction)
        concat = torch.cat([phi_s, phi_s_next], dim=-1)
        pred_action = self.inverse_model(concat)

        # Forward model (next-state feature prediction)
        action_embed = F.one_hot(action, self.action_dim).float()
        forward_input = torch.cat([phi_s, action_embed], dim=-1)
        pred_phi_s_next = self.forward_model(forward_input)

        return pred_action, pred_phi_s_next, phi_s_next

    def intrinsic_reward(self, obs, action, next_obs):
        """Return the forward-model prediction error as the intrinsic reward.

        The encoded next state is detached so it acts as a fixed regression
        target; the error is halved and averaged over the feature axis.
        """
        _, pred_phi_s_next, phi_s_next = self.forward(obs, action, next_obs)

        reward = 0.5 * F.mse_loss(pred_phi_s_next, phi_s_next.detach(), reduction='none')
        return reward.mean(dim=-1)

3. DIAYN (Diversity is All You Need)¶

class DIAYN(nn.Module):
    """Skill-based self-supervised RL (Diversity Is All You Need).

    Holds a skill-conditioned policy network and a discriminator that tries
    to recover the active skill from the observation. The intrinsic reward is
    high when the discriminator can tell which skill produced a state.

    Args:
        obs_dim: Size of the observation vector.
        action_dim: Size of the policy output.
        n_skills: Number of discrete skills.
        hidden_dim: Width of the hidden layers.
    """

    def __init__(self, obs_dim, action_dim, n_skills, hidden_dim=256):
        super().__init__()
        self.n_skills = n_skills

        # Policy: (s, z) -> a
        self.policy = nn.Sequential(
            nn.Linear(obs_dim + n_skills, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim)
        )

        # Skill discriminator: s -> logits over z
        self.discriminator = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_skills)
        )

    def select_skill(self):
        """Sample one skill index uniformly; returns a tensor of shape (1,)."""
        return torch.randint(0, self.n_skills, (1,))

    def get_action(self, obs, skill):
        """Return the policy output for ``obs`` conditioned on ``skill``.

        ``skill`` holds integer skill indices; its one-hot encoding is
        concatenated to ``obs`` along the last axis, so the leading dimensions
        of ``obs`` and ``skill`` must match.
        """
        skill_onehot = F.one_hot(skill, self.n_skills).float()
        policy_input = torch.cat([obs, skill_onehot], dim=-1)
        return self.policy(policy_input)

    def intrinsic_reward(self, obs, skill):
        """Return ``log q(z|s) - log p(z)`` for the given skill indices.

        ``q`` is the discriminator's softmax distribution and ``p`` is the
        uniform prior over ``n_skills`` skills.
        """
        logits = self.discriminator(obs)
        log_q_z_s = F.log_softmax(logits, dim=-1)
        log_q_z_s_given = log_q_z_s.gather(-1, skill.unsqueeze(-1)).squeeze(-1)

        # Uniform prior. math.log replaces np.log, which was used without a
        # NumPy import, and keeps the constant a plain Python float.
        log_p_z = -math.log(self.n_skills)

        reward = log_q_z_s_given - log_p_z
        return reward

학습 알고리즘¶

Combined Reward 학습¶

class SelfSupervisedAgent:
    """Agent trained on a weighted sum of extrinsic and RND intrinsic reward.

    Args:
        obs_dim: Size of the observation vector.
        action_dim: Size of the action space passed to ``ActorCritic``.
        intrinsic_coef: Weight of the (normalized) intrinsic reward.
        extrinsic_coef: Weight of the environment reward.

    NOTE(review): ``ActorCritic``, ``compute_ppo_loss`` and
    ``compute_rnd_loss`` are not defined in this snippet; presumably they are
    provided elsewhere. Confirm before running.
    """

    def __init__(
        self,
        obs_dim,
        action_dim,
        intrinsic_coef: float = 0.01,
        extrinsic_coef: float = 1.0
    ):
        self.policy = ActorCritic(obs_dim, action_dim)
        self.intrinsic_module = RNDNetwork(obs_dim)
        self.intrinsic_coef = intrinsic_coef
        self.extrinsic_coef = extrinsic_coef

        # Running statistics backing normalize_intrinsic().
        self.intrinsic_rms = RunningMeanStd()

        # One optimizer drives both the policy and the RND predictor; the
        # frozen RND target is deliberately left out.
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.parameters()},
            {'params': self.intrinsic_module.predictor.parameters()}
        ])

    def normalize_intrinsic(self, intrinsic_reward):
        """Update the running statistics and return the normalized reward.

        This method was called by ``compute_reward`` but never defined.
        """
        # Flatten so the statistics see a 1-D batch even for a single reward.
        flat = intrinsic_reward.detach().reshape(-1)
        self.intrinsic_rms.update(flat)
        return self.intrinsic_rms.normalize(intrinsic_reward)

    def compute_reward(self, obs, extrinsic_reward):
        """Combine extrinsic and intrinsic reward.

        Returns:
            Tuple ``(total_reward, intrinsic_reward)`` where the intrinsic
            part has been normalized with running mean/std.
        """
        # Rewards are training targets, not something to differentiate
        # through: without no_grad every stored reward would keep the
        # predictor's autograd graph alive.
        with torch.no_grad():
            intrinsic_reward = self.intrinsic_module.intrinsic_reward(obs)

        # Normalize the intrinsic reward (running mean/std)
        intrinsic_reward = self.normalize_intrinsic(intrinsic_reward)

        total_reward = (
            self.extrinsic_coef * extrinsic_reward +
            self.intrinsic_coef * intrinsic_reward
        )

        return total_reward, intrinsic_reward

    def update(self, trajectories):
        """Run one joint PPO + RND optimisation step.

        Args:
            trajectories: Iterable of mappings, each with an ``'obs'`` and a
                ``'reward'`` entry.
        """
        # Recompute the combined reward for every trajectory
        rewards = []
        for traj in trajectories:
            obs = traj['obs']
            ext_reward = traj['reward']
            total_reward, _ = self.compute_reward(obs, ext_reward)
            rewards.append(total_reward)

        # PPO loss on the combined rewards
        ppo_loss = self.compute_ppo_loss(trajectories, rewards)

        # RND predictor loss
        rnd_loss = self.compute_rnd_loss(trajectories)

        total_loss = ppo_loss + rnd_loss

        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()

Intrinsic Reward Normalization¶

class RunningMeanStd:
    """Running mean and variance used to normalize intrinsic rewards.

    Statistics are kept as plain Python floats. ``epsilon`` is the initial
    pseudo-count, which keeps the first update well defined.
    """

    def __init__(self, epsilon=1e-4):
        self.mean = 0.0
        self.var = 1.0
        self.count = epsilon

    def update(self, x):
        """Fold a batch of values into the running statistics.

        Args:
            x: 1-D NumPy array or torch tensor of values. An empty batch is
                ignored.
        """
        batch_count = len(x)
        if batch_count == 0:
            # Nothing to merge; mean() of an empty batch would be NaN.
            return

        batch_mean = float(x.mean())
        # Population variance, computed explicitly: torch's ``var()`` is the
        # unbiased estimator by default, which differs from NumPy's and is
        # NaN for a one-element batch.
        batch_var = float(((x - batch_mean) ** 2).mean())

        # Merge the batch moments into the running moments.
        delta = batch_mean - self.mean
        total_count = self.count + batch_count

        self.mean = self.mean + delta * batch_count / total_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta ** 2 * self.count * batch_count / total_count
        self.var = m2 / total_count
        self.count = total_count

    def normalize(self, x):
        """Return ``x`` shifted by the running mean and scaled by the std."""
        # Small constant guards against division by zero.
        return (x - self.mean) / (math.sqrt(self.var) + 1e-8)

NeurIPS 2025 Best Paper 핵심 기여¶

1. 계층적 Self-Supervised RL¶

┌─────────────────────────────────────────────────┐
│                High-Level Policy                 │
│         (스킬/서브골 선택)                        │
│                    │                            │
│         z₁   z₂   z₃   z₄   ...                 │
│                    │                            │
└────────────────────┼────────────────────────────┘
                     │
                     ▼
┌─────────────────────────────────────────────────┐
│              Low-Level Policies                  │
│         (각 스킬별 원시 행동)                     │
│                                                 │
│   π(a|s,z₁)  π(a|s,z₂)  π(a|s,z₃)  ...         │
└─────────────────────────────────────────────────┘

2. 자기 학습 (Bootstrapping)¶

class BootstrappedSSRL:
    """Learns from goals the agent proposes for itself."""

    def __init__(self, obs_dim, action_dim, goal_dim):
        self.policy = GoalConditionedPolicy(obs_dim, action_dim, goal_dim)
        self.goal_generator = GoalGenerator(obs_dim, goal_dim)
        self.goal_discriminator = GoalDiscriminator(obs_dim, goal_dim)

    def generate_curriculum(self, current_state, difficulty_level):
        """Pick one self-generated goal matched to ``difficulty_level``.

        Asks the goal generator for 100 candidates, scores them with the goal
        discriminator, and returns the candidate whose score is closest to
        ``0.5 + 0.3 * difficulty_level``.
        """
        # Propose goals from the current state
        proposals = self.goal_generator(current_state, n=100)

        # Score every proposal.
        # NOTE(review): the original calls this score "reachability"; if a
        # higher score means an easier goal, raising difficulty_level selects
        # easier goals. Confirm the intended direction.
        scores = self.goal_discriminator(current_state, proposals)

        # Aim for goals that are neither too easy nor too hard
        desired_score = 0.5 + difficulty_level * 0.3
        closest = torch.abs(scores - desired_score).argmin()

        return proposals[closest]

3. 스케일링 법칙¶

컴퓨팅 스케일	탐색 효율	다운스트림 성능
1x	기준	기준
10x	2.3x	1.8x
100x	5.1x	3.2x
1000x	9.8x	5.4x

→ Self-supervised pretraining은 더 나은 스케일링 특성 보임

벤치마크 결과¶

Atari (No Reward)¶

방법	Human Normalized Score
Random	0%
PPO + RND	32%
ICM	28%
DIAYN	21%
NeurIPS 2025 (Ours)	58%

DMControl Suite¶

Task: walker_walk
────────────────────────────────────────────────────
Method           │ Pretraining │ Fine-tuning │ Total
────────────────────────────────────────────────────
SAC (scratch)    │     -       │    1M       │ 850
DrQ-v2           │     -       │    1M       │ 920
URLB (pretrain)  │   500K      │   100K      │ 950
Ours (pretrain)  │   500K      │   100K      │ 980
────────────────────────────────────────────────────

실제 적용¶

1. 로봇 조작¶

class RobotManipulationSSRL:
    """Reward-free pretraining for robot object manipulation.

    Combines DIAYN skill discovery (48-dim observations, 7-dim actions,
    50 skills) with a world model over the same observation/action sizes.
    """

    def __init__(self):
        self.skill_discovery = DIAYN(obs_dim=48, action_dim=7, n_skills=50)
        self.world_model = WorldModel(obs_dim=48, action_dim=7)

    def pretrain(self, env, steps=1_000_000):
        """Pretrain by autonomous exploration.

        Args:
            env: Environment whose ``reset()`` returns an observation and
                whose ``step(action)`` returns a 4-tuple
                ``(next_obs, reward, done, info)``.
            steps: Number of skill rollouts. Each rollout runs up to 100
                environment steps, so this is not the raw env-step count.
        """
        # Start from a fresh episode. The original read ``obs`` before ever
        # assigning it, which raised UnboundLocalError on the first step.
        obs = env.reset()

        for step in range(steps):
            # Sample a random skill for this rollout
            skill = self.skill_discovery.select_skill()

            # Execute the skill for up to 100 steps
            for _ in range(100):
                action = self.skill_discovery.get_action(obs, skill)
                next_obs, _, done, _ = env.step(action)

                # Intrinsic reward for the skill.
                # NOTE(review): computed but not consumed anywhere in this
                # snippet; presumably it should feed a policy update.
                reward = self.skill_discovery.intrinsic_reward(obs, skill)

                # Train the world model on the observed transition
                self.world_model.update(obs, action, next_obs)

                obs = next_obs
                if done:
                    obs = env.reset()
                    break

    def finetune(self, task_reward_fn, steps=10_000):
        """Fine-tune on a specific task (not implemented yet)."""
        # Intended to reuse the pretrained skills and the world model
        pass

2. 게임 AI¶

Montezuma's Revenge: 희소 보상 환경
Go-Explore + RND 조합
인간 수준 성능 달성

코드 참조¶

# 전체 학습 루프
def train_self_supervised_rl(
    env,
    agent: "SelfSupervisedAgent",
    total_steps: int = 10_000_000,
    intrinsic_coef_schedule: str = 'constant',
    update_interval: int = 256,
    intrinsic_coef_decay: float = 0.9999,
):
    """Run the full interaction/training loop and return the agent.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(next_obs, reward, done, info)``.
        agent: Agent providing ``select_action``, ``compute_reward``,
            ``buffer.add``, ``update`` and an ``intrinsic_coef`` attribute.
        total_steps: Number of environment steps to run.
        intrinsic_coef_schedule: ``'decay'`` multiplies ``intrinsic_coef`` by
            ``intrinsic_coef_decay`` after every update; any other value
            leaves it unchanged.
        update_interval: Number of collected steps between agent updates.
        intrinsic_coef_decay: Multiplicative decay used by ``'decay'``.

    Returns:
        The same ``agent`` object that was passed in.
    """
    obs = env.reset()
    episode_intrinsic_rewards = []

    for step in range(total_steps):
        # Select an action and step the environment
        action = agent.select_action(obs)
        next_obs, extrinsic_reward, done, info = env.step(action)

        # Combined reward for this transition
        total_reward, intrinsic_reward = agent.compute_reward(obs, extrinsic_reward)
        episode_intrinsic_rewards.append(intrinsic_reward.item())

        # Store the transition
        agent.buffer.add(obs, action, total_reward, next_obs, done)

        # Update once a full interval of transitions has been collected.
        # ``step % interval == 0`` fired at step 0 with a single transition.
        if (step + 1) % update_interval == 0:
            # NOTE(review): called without arguments, whereas the
            # SelfSupervisedAgent.update shown in this document takes
            # ``trajectories``; reconcile the two signatures.
            agent.update()

            # Intrinsic reward coefficient schedule
            if intrinsic_coef_schedule == 'decay':
                agent.intrinsic_coef *= intrinsic_coef_decay

        # Log at episode end, then start a new episode
        if done:
            print(f"Step {step}: Avg Intrinsic Reward = {np.mean(episode_intrinsic_rewards):.4f}")
            episode_intrinsic_rewards = []
            obs = env.reset()
        else:
            obs = next_obs

    return agent

요약¶

핵심 포인트¶

외부 보상 불필요: 내재적 동기만으로 유의미한 행동 학습
탐색 효율: Curiosity, diversity 기반 효율적 탐색
전이 학습: 사전학습 → 미세조정 패러다임
스케일링: 컴퓨팅 증가에 따른 좋은 스케일링 특성

방법 선택 가이드¶

상황	추천 방법
희소 보상	RND, ICM
스킬 발견	DIAYN, VIC
로봇 학습	Hierarchical + Goal-conditioned
게임	RND + PPO

참고 자료¶

NeurIPS 2025 Best Paper - Self-Supervised RL
Curiosity-driven Exploration
DIAYN Paper
RND Paper

마지막 업데이트: 2026-03-04