""" Reward shaping for physics-embedded safe residual RL. Implements the reward from Section 3.5 of the paper: r_k = -(e_k^T Q e_k + u_k^T R u_k + lambda_p ||u_k - u_ref_k||^2) The third term penalizes safety filter corrections to encourage the policy to internalize safety boundary geometry. Reference: Sections 2.3 and 3.5. """ import numpy as np class RewardShaping: """ Computes single-step reward for the terminal rendezvous task. Parameters ---------- Q : (6,6) ndarray State error weight matrix (PSD). R : (3,3) ndarray Control input weight matrix (PD). lambda_p : float Safety filter correction penalty weight. x_h : (6,) ndarray Hold point equilibrium state. bonus_dock : float Terminal docking reward. penalty_collision : float Collision penalty. """ def __init__(self, Q=None, R=None, lambda_p=0.1, x_h=None, bonus_dock=500_000.0, penalty_collision=-250_000.0, front_soft_coef=20.0, front_soft_k=0.1, shield_penalty_B=100.0, shield_penalty_C=1000.0, filter_cost_cap=None): self.Q = Q if Q is not None else np.diag( [0.35, 0.15, 0.35, 0.015, 0.035, 0.015]) self.R = R if R is not None else np.eye(3) * 0.1 self.lambda_p = lambda_p self.x_h = x_h if x_h is not None else np.array( [0.0, -60.0, 0.0, 0.0, 0.0, 0.0]) self.bonus_dock = bonus_dock self.penalty_collision = penalty_collision self.front_soft_coef = front_soft_coef self.front_soft_k = front_soft_k self.shield_penalty_B = shield_penalty_B self.shield_penalty_C = shield_penalty_C self.filter_cost_cap = filter_cost_cap @staticmethod def _softplus(x): return np.log1p(np.exp(-np.abs(x))) + np.maximum(x, 0.0) def compute_reward(self, x, u_applied, u_ref, y_front_limit=0.0, front_margin=1.0, shield_level='A'): """ r_k = -(e^T Q e + u^T R u + lambda_p ||u - u_ref||^2) Parameters ---------- x : (6,) — current state u_applied : (3,) — control actually applied (after safety filter) u_ref : (3,) — reference control (before safety filter) Returns ------- reward : float info : dict """ e = x - self.x_h state_cost = e @ self.Q @ e control_cost = u_applied @ self.R @ u_applied delta_u_sq = np.sum((u_applied - u_ref) ** 2) filter_cost_capped = False if self.filter_cost_cap is not None and self.filter_cost_cap > 0.0: if delta_u_sq > self.filter_cost_cap: delta_u_sq = self.filter_cost_cap filter_cost_capped = True filter_cost = self.lambda_p * delta_u_sq y_limit = y_front_limit - front_margin y_excess = x[1] - y_limit front_cost = self.front_soft_coef * self._softplus(self.front_soft_k * y_excess) shield_cost = 0.0 if shield_level == 'B': shield_cost = self.shield_penalty_B elif shield_level == 'C': shield_cost = self.shield_penalty_C reward = -(state_cost + control_cost + filter_cost + front_cost + shield_cost) # Approach shaping: reward slow approach in terminal zone (<100m) pos_err = np.linalg.norm(e[:3]) vel_norm = np.linalg.norm(e[3:]) approach_bonus = 0.0 if pos_err < 100.0: approach_bonus = 2.0 * (100.0 - pos_err) / 100.0 * max(0.0, 2.0 - vel_norm) reward += approach_bonus info = { 'state_cost': state_cost, 'control_cost': control_cost, 'filter_cost': filter_cost, 'filter_delta_u_sq': delta_u_sq, 'filter_cost_capped': filter_cost_capped, 'front_cost': front_cost, 'shield_cost': shield_cost, 'approach_bonus': approach_bonus, 'total_cost': state_cost + control_cost + filter_cost + front_cost + shield_cost, } return reward, info def check_docked(self, x, pos_tol=10.0, vel_tol=2.0): """Check if spacecraft has reached hold point within tolerance.""" e = x - self.x_h pos_err = np.linalg.norm(e[:3]) vel_err = np.linalg.norm(e[3:]) return pos_err < pos_tol and vel_err < vel_tol def check_collision(self, x, rho_safe=60.0): """Check if collision constraint is violated.""" return np.linalg.norm(x[:3]) < rho_safe