SR-ARPOD/stellar/arpodenvs/reward_shaping.py

"""
Reward shaping for physics-embedded safe residual RL.

Implements the reward from Section 3.5 of the paper:
    r_k = -(e_k^T Q e_k + u_k^T R u_k + lambda_p ||u_k - u_ref_k||^2)

The third term penalizes safety filter corrections to encourage the policy
to internalize safety boundary geometry.

Reference: Sections 2.3 and 3.5.
"""

import numpy as np


class RewardShaping:
    """
    Computes single-step reward for the terminal rendezvous task.

    Parameters
    ----------
    Q : (6,6) ndarray
        State error weight matrix (PSD).
    R : (3,3) ndarray
        Control input weight matrix (PD).
    lambda_p : float
        Safety filter correction penalty weight.
    x_h : (6,) ndarray
        Hold point equilibrium state.
    bonus_dock : float
        Terminal docking reward.
    penalty_collision : float
        Collision penalty.
    """

    def __init__(self, Q=None, R=None, lambda_p=0.1, x_h=None,
                 bonus_dock=500_000.0, penalty_collision=-250_000.0,
                 front_soft_coef=20.0, front_soft_k=0.1,
                 shield_penalty_B=100.0, shield_penalty_C=1000.0,
                 filter_cost_cap=None):
        self.Q = Q if Q is not None else np.diag(
            [0.35, 0.15, 0.35, 0.015, 0.035, 0.015])
        self.R = R if R is not None else np.eye(3) * 0.1
        self.lambda_p = lambda_p
        self.x_h = x_h if x_h is not None else np.array(
            [0.0, -60.0, 0.0, 0.0, 0.0, 0.0])
        self.bonus_dock = bonus_dock
        self.penalty_collision = penalty_collision
        self.front_soft_coef = front_soft_coef
        self.front_soft_k = front_soft_k
        self.shield_penalty_B = shield_penalty_B
        self.shield_penalty_C = shield_penalty_C
        self.filter_cost_cap = filter_cost_cap

    @staticmethod
    def _softplus(x):
        return np.log1p(np.exp(-np.abs(x))) + np.maximum(x, 0.0)

    def compute_reward(self, x, u_applied, u_ref,
                       y_front_limit=0.0, front_margin=1.0,
                       shield_level='A'):
        """
        r_k = -(e^T Q e + u^T R u + lambda_p ||u - u_ref||^2)

        Parameters
        ----------
        x : (6,) — current state
        u_applied : (3,) — control actually applied (after safety filter)
        u_ref : (3,) — reference control (before safety filter)

        Returns
        -------
        reward : float
        info : dict
        """
        e = x - self.x_h
        state_cost = e @ self.Q @ e
        control_cost = u_applied @ self.R @ u_applied
        delta_u_sq = np.sum((u_applied - u_ref) ** 2)
        filter_cost_capped = False
        if self.filter_cost_cap is not None and self.filter_cost_cap > 0.0:
            if delta_u_sq > self.filter_cost_cap:
                delta_u_sq = self.filter_cost_cap
                filter_cost_capped = True
        filter_cost = self.lambda_p * delta_u_sq

        y_limit = y_front_limit - front_margin
        y_excess = x[1] - y_limit
        front_cost = self.front_soft_coef * self._softplus(self.front_soft_k * y_excess)

        shield_cost = 0.0
        if shield_level == 'B':
            shield_cost = self.shield_penalty_B
        elif shield_level == 'C':
            shield_cost = self.shield_penalty_C

        reward = -(state_cost + control_cost + filter_cost + front_cost + shield_cost)

        # Approach shaping: reward slow approach in terminal zone (<100m)
        pos_err = np.linalg.norm(e[:3])
        vel_norm = np.linalg.norm(e[3:])
        approach_bonus = 0.0
        if pos_err < 100.0:
            approach_bonus = 2.0 * (100.0 - pos_err) / 100.0 * max(0.0, 2.0 - vel_norm)
            reward += approach_bonus

        info = {
            'state_cost': state_cost,
            'control_cost': control_cost,
            'filter_cost': filter_cost,
            'filter_delta_u_sq': delta_u_sq,
            'filter_cost_capped': filter_cost_capped,
            'front_cost': front_cost,
            'shield_cost': shield_cost,
            'approach_bonus': approach_bonus,
            'total_cost': state_cost + control_cost + filter_cost + front_cost + shield_cost,
        }
        return reward, info

    def check_docked(self, x, pos_tol=10.0, vel_tol=2.0):
        """Check if spacecraft has reached hold point within tolerance."""
        e = x - self.x_h
        pos_err = np.linalg.norm(e[:3])
        vel_err = np.linalg.norm(e[3:])
        return pos_err < pos_tol and vel_err < vel_tol

    def check_collision(self, x, rho_safe=60.0):
        """Check if collision constraint is violated."""
        return np.linalg.norm(x[:3]) < rho_safe