130 lines
4.6 KiB
Python
130 lines
4.6 KiB
Python
"""
|
|
Reward shaping for physics-embedded safe residual RL.
|
|
|
|
Implements the reward from Section 3.5 of the paper:
|
|
r_k = -(e_k^T Q e_k + u_k^T R u_k + lambda_p ||u_k - u_ref_k||^2)
|
|
|
|
The third term penalizes safety filter corrections to encourage the policy
|
|
to internalize safety boundary geometry.
|
|
|
|
Reference: Sections 2.3 and 3.5.
|
|
"""
|
|
|
|
import numpy as np
|
|
|
|
|
|
class RewardShaping:
|
|
"""
|
|
Computes single-step reward for the terminal rendezvous task.
|
|
|
|
Parameters
|
|
----------
|
|
Q : (6,6) ndarray
|
|
State error weight matrix (PSD).
|
|
R : (3,3) ndarray
|
|
Control input weight matrix (PD).
|
|
lambda_p : float
|
|
Safety filter correction penalty weight.
|
|
x_h : (6,) ndarray
|
|
Hold point equilibrium state.
|
|
bonus_dock : float
|
|
Terminal docking reward.
|
|
penalty_collision : float
|
|
Collision penalty.
|
|
"""
|
|
|
|
def __init__(self, Q=None, R=None, lambda_p=0.1, x_h=None,
|
|
bonus_dock=500_000.0, penalty_collision=-250_000.0,
|
|
front_soft_coef=20.0, front_soft_k=0.1,
|
|
shield_penalty_B=100.0, shield_penalty_C=1000.0,
|
|
filter_cost_cap=None):
|
|
self.Q = Q if Q is not None else np.diag(
|
|
[0.35, 0.15, 0.35, 0.015, 0.035, 0.015])
|
|
self.R = R if R is not None else np.eye(3) * 0.1
|
|
self.lambda_p = lambda_p
|
|
self.x_h = x_h if x_h is not None else np.array(
|
|
[0.0, -60.0, 0.0, 0.0, 0.0, 0.0])
|
|
self.bonus_dock = bonus_dock
|
|
self.penalty_collision = penalty_collision
|
|
self.front_soft_coef = front_soft_coef
|
|
self.front_soft_k = front_soft_k
|
|
self.shield_penalty_B = shield_penalty_B
|
|
self.shield_penalty_C = shield_penalty_C
|
|
self.filter_cost_cap = filter_cost_cap
|
|
|
|
@staticmethod
|
|
def _softplus(x):
|
|
return np.log1p(np.exp(-np.abs(x))) + np.maximum(x, 0.0)
|
|
|
|
def compute_reward(self, x, u_applied, u_ref,
|
|
y_front_limit=0.0, front_margin=1.0,
|
|
shield_level='A'):
|
|
"""
|
|
r_k = -(e^T Q e + u^T R u + lambda_p ||u - u_ref||^2)
|
|
|
|
Parameters
|
|
----------
|
|
x : (6,) — current state
|
|
u_applied : (3,) — control actually applied (after safety filter)
|
|
u_ref : (3,) — reference control (before safety filter)
|
|
|
|
Returns
|
|
-------
|
|
reward : float
|
|
info : dict
|
|
"""
|
|
e = x - self.x_h
|
|
state_cost = e @ self.Q @ e
|
|
control_cost = u_applied @ self.R @ u_applied
|
|
delta_u_sq = np.sum((u_applied - u_ref) ** 2)
|
|
filter_cost_capped = False
|
|
if self.filter_cost_cap is not None and self.filter_cost_cap > 0.0:
|
|
if delta_u_sq > self.filter_cost_cap:
|
|
delta_u_sq = self.filter_cost_cap
|
|
filter_cost_capped = True
|
|
filter_cost = self.lambda_p * delta_u_sq
|
|
|
|
y_limit = y_front_limit - front_margin
|
|
y_excess = x[1] - y_limit
|
|
front_cost = self.front_soft_coef * self._softplus(self.front_soft_k * y_excess)
|
|
|
|
shield_cost = 0.0
|
|
if shield_level == 'B':
|
|
shield_cost = self.shield_penalty_B
|
|
elif shield_level == 'C':
|
|
shield_cost = self.shield_penalty_C
|
|
|
|
reward = -(state_cost + control_cost + filter_cost + front_cost + shield_cost)
|
|
|
|
# Approach shaping: reward slow approach in terminal zone (<100m)
|
|
pos_err = np.linalg.norm(e[:3])
|
|
vel_norm = np.linalg.norm(e[3:])
|
|
approach_bonus = 0.0
|
|
if pos_err < 100.0:
|
|
approach_bonus = 2.0 * (100.0 - pos_err) / 100.0 * max(0.0, 2.0 - vel_norm)
|
|
reward += approach_bonus
|
|
|
|
info = {
|
|
'state_cost': state_cost,
|
|
'control_cost': control_cost,
|
|
'filter_cost': filter_cost,
|
|
'filter_delta_u_sq': delta_u_sq,
|
|
'filter_cost_capped': filter_cost_capped,
|
|
'front_cost': front_cost,
|
|
'shield_cost': shield_cost,
|
|
'approach_bonus': approach_bonus,
|
|
'total_cost': state_cost + control_cost + filter_cost + front_cost + shield_cost,
|
|
}
|
|
return reward, info
|
|
|
|
def check_docked(self, x, pos_tol=10.0, vel_tol=2.0):
|
|
"""Check if spacecraft has reached hold point within tolerance."""
|
|
e = x - self.x_h
|
|
pos_err = np.linalg.norm(e[:3])
|
|
vel_err = np.linalg.norm(e[3:])
|
|
return pos_err < pos_tol and vel_err < vel_tol
|
|
|
|
def check_collision(self, x, rho_safe=60.0):
|
|
"""Check if collision constraint is violated."""
|
|
return np.linalg.norm(x[:3]) < rho_safe
|