SR-ARPOD/stellar/arpodenvs/reward_shaping.py
2026-04-01 22:48:53 +08:00

130 lines
4.6 KiB
Python

"""
Reward shaping for physics-embedded safe residual RL.
Implements the reward from Section 3.5 of the paper:
r_k = -(e_k^T Q e_k + u_k^T R u_k + lambda_p ||u_k - u_ref_k||^2)
The third term penalizes safety filter corrections to encourage the policy
to internalize safety boundary geometry.
Reference: Sections 2.3 and 3.5.
"""
import numpy as np
class RewardShaping:
"""
Computes single-step reward for the terminal rendezvous task.
Parameters
----------
Q : (6,6) ndarray
State error weight matrix (PSD).
R : (3,3) ndarray
Control input weight matrix (PD).
lambda_p : float
Safety filter correction penalty weight.
x_h : (6,) ndarray
Hold point equilibrium state.
bonus_dock : float
Terminal docking reward.
penalty_collision : float
Collision penalty.
"""
def __init__(self, Q=None, R=None, lambda_p=0.1, x_h=None,
bonus_dock=500_000.0, penalty_collision=-250_000.0,
front_soft_coef=20.0, front_soft_k=0.1,
shield_penalty_B=100.0, shield_penalty_C=1000.0,
filter_cost_cap=None):
self.Q = Q if Q is not None else np.diag(
[0.35, 0.15, 0.35, 0.015, 0.035, 0.015])
self.R = R if R is not None else np.eye(3) * 0.1
self.lambda_p = lambda_p
self.x_h = x_h if x_h is not None else np.array(
[0.0, -60.0, 0.0, 0.0, 0.0, 0.0])
self.bonus_dock = bonus_dock
self.penalty_collision = penalty_collision
self.front_soft_coef = front_soft_coef
self.front_soft_k = front_soft_k
self.shield_penalty_B = shield_penalty_B
self.shield_penalty_C = shield_penalty_C
self.filter_cost_cap = filter_cost_cap
@staticmethod
def _softplus(x):
return np.log1p(np.exp(-np.abs(x))) + np.maximum(x, 0.0)
def compute_reward(self, x, u_applied, u_ref,
y_front_limit=0.0, front_margin=1.0,
shield_level='A'):
"""
r_k = -(e^T Q e + u^T R u + lambda_p ||u - u_ref||^2)
Parameters
----------
x : (6,) — current state
u_applied : (3,) — control actually applied (after safety filter)
u_ref : (3,) — reference control (before safety filter)
Returns
-------
reward : float
info : dict
"""
e = x - self.x_h
state_cost = e @ self.Q @ e
control_cost = u_applied @ self.R @ u_applied
delta_u_sq = np.sum((u_applied - u_ref) ** 2)
filter_cost_capped = False
if self.filter_cost_cap is not None and self.filter_cost_cap > 0.0:
if delta_u_sq > self.filter_cost_cap:
delta_u_sq = self.filter_cost_cap
filter_cost_capped = True
filter_cost = self.lambda_p * delta_u_sq
y_limit = y_front_limit - front_margin
y_excess = x[1] - y_limit
front_cost = self.front_soft_coef * self._softplus(self.front_soft_k * y_excess)
shield_cost = 0.0
if shield_level == 'B':
shield_cost = self.shield_penalty_B
elif shield_level == 'C':
shield_cost = self.shield_penalty_C
reward = -(state_cost + control_cost + filter_cost + front_cost + shield_cost)
# Approach shaping: reward slow approach in terminal zone (<100m)
pos_err = np.linalg.norm(e[:3])
vel_norm = np.linalg.norm(e[3:])
approach_bonus = 0.0
if pos_err < 100.0:
approach_bonus = 2.0 * (100.0 - pos_err) / 100.0 * max(0.0, 2.0 - vel_norm)
reward += approach_bonus
info = {
'state_cost': state_cost,
'control_cost': control_cost,
'filter_cost': filter_cost,
'filter_delta_u_sq': delta_u_sq,
'filter_cost_capped': filter_cost_capped,
'front_cost': front_cost,
'shield_cost': shield_cost,
'approach_bonus': approach_bonus,
'total_cost': state_cost + control_cost + filter_cost + front_cost + shield_cost,
}
return reward, info
def check_docked(self, x, pos_tol=10.0, vel_tol=2.0):
"""Check if spacecraft has reached hold point within tolerance."""
e = x - self.x_h
pos_err = np.linalg.norm(e[:3])
vel_err = np.linalg.norm(e[3:])
return pos_err < pos_tol and vel_err < vel_tol
def check_collision(self, x, rho_safe=60.0):
"""Check if collision constraint is violated."""
return np.linalg.norm(x[:3]) < rho_safe