""" Physics-Embedded Safe Residual RL Gymnasium Environment. Architecture (Section 3.1): u_k = Pi_safe(x_hat, u_ref) u_ref = u_nom + u_res where: u_nom = -K (x_hat - x_h) (LQR nominal controller) u_res ~ pi_theta(· | s_k) (LTC residual unit / policy action) Pi_safe = HOCBF-QP safety filter Observation space: [e_hat (6), obs_noise (6)] = 12-dim Action space: residual control u_res (3-dim) The environment internally applies the nominal controller and safety filter, so the RL agent only outputs the residual correction. Reference: Sections 3.1–3.5. """ import numpy as np import gymnasium as gym from gymnasium import spaces from stellar.arpodenvs.dynamics import CWDynamics, NominalLQR, KalmanFilter from stellar.arpodenvs.safety_filter import HOCBFSafetyFilter from stellar.arpodenvs.reward_shaping import RewardShaping class SafeResidualARPOD(gym.Env): """ Gymnasium environment for physics-embedded safe residual RL terminal rendezvous. Parameters ---------- config : dict or None Configuration overrides. """ metadata = {"render_modes": ["human"]} REASON_CODES = { 'none': 0, 'success': 1, 'collision': 2, 'front_blocked': 3, 'qp_infeasible': 4, 'time_limit': 5, } # Default configuration DEFAULT_CONFIG = { # Orbital parameters 'mu': 3.986004418e14, 'R0': 42164000.0, 'dt': 1.0, # Disturbance 'd_bar': 0.01, # Hold point (negative y = behind target along V-bar) 'rho_h': 60.0, 'x_h': np.array([0.0, -60.0, 0.0, 0.0, 0.0, 0.0]), # Safety parameters 'rho_safe': 15.0, 'theta_los_deg': 60.0, 'u_max': 10.0, 'kappa1': {'c': 0.5, 'a': 0.5, 'l': 0.5}, 'kappa2': {'c': 0.5, 'a': 0.5, 'l': 0.5}, 'strict_front_enforce': True, 'y_front_limit': 0.0, 'front_margin': 1.0, 'penalty_front_violation': -1_000_000.0, 'penalty_qp_infeasible': -200_000.0, 'relax_noncritical': 5.0, # LQR weights 'Q_lqr': np.diag([1.0, 1.0, 1.0, 0.1, 0.1, 0.1]), 'R_lqr': np.eye(3) * 8.0, # Reward weights 'Q_reward': np.diag([0.35, 0.15, 0.35, 0.04, 0.06, 0.04]), 'R_reward': np.eye(3) * 0.1, 'lambda_p': 0.1, 'filter_cost_cap': 2500.0, # Initial condition ranges 'init_pos_center': np.array([0.0, -800.0, 0.0]), 'init_pos_range': np.array([200.0, 150.0, 200.0]), 'init_vel_range': 2.0, # Observation noise 'obs_noise_std': 1.0, # Episode 'max_steps': 3200, 'pos_dock_tol': 10.0, 'vel_dock_tol': 2.0, # Timeout penalty 'penalty_timeout': -20_000.0, # Domain randomization 'dr_n_range': 0.05, # fractional range for n 'dr_d_bar_range': 0.02, # max d_bar for randomization # Residual action scale 'residual_scale': 2.0, } def __init__(self, config=None): super().__init__() self.cfg = dict(self.DEFAULT_CONFIG) if config is not None: self.cfg.update(config) cfg = self.cfg n_nominal = np.sqrt(cfg['mu'] / cfg['R0'] ** 3) # Core modules self.dynamics = CWDynamics( n=n_nominal, dt=cfg['dt'], d_bar=cfg['d_bar'], mu=cfg['mu'], R0=cfg['R0']) self.lqr = NominalLQR( self.dynamics, Q_lqr=cfg['Q_lqr'], R_lqr=cfg['R_lqr']) self.kf = KalmanFilter( self.dynamics, Sigma_o=np.eye(6) * cfg['obs_noise_std'] ** 2) self.safety_filter = HOCBFSafetyFilter( n=n_nominal, rho_safe=cfg['rho_safe'], theta_los=np.radians(cfg['theta_los_deg']), kappa1=cfg['kappa1'], kappa2=cfg['kappa2'], u_max=cfg['u_max'], d_bar=cfg['d_bar'], dt=cfg['dt'], y_front_limit=cfg['y_front_limit'], front_margin=cfg['front_margin'], strict_front_enforce=cfg['strict_front_enforce'], relax_noncritical=cfg['relax_noncritical']) self.reward_fn = RewardShaping( Q=cfg['Q_reward'], R=cfg['R_reward'], lambda_p=cfg['lambda_p'], x_h=cfg['x_h'], filter_cost_cap=cfg.get('filter_cost_cap')) self.x_h = cfg['x_h'] self.n_nominal = n_nominal # Gym spaces res_scale = cfg['residual_scale'] self.action_space = spaces.Box( low=-res_scale, high=res_scale, shape=(3,), dtype=np.float32) # Observation: [e_hat(6), observation(6)] = 12-dim obs_bound = 2000.0 self.observation_space = spaces.Box( low=-obs_bound, high=obs_bound, shape=(12,), dtype=np.float32) # State variables self.state = None self.current_step = 0 self.episode = 0 def _domain_randomize(self, rng): """Apply domain randomization to n and d_bar.""" cfg = self.cfg n_base = self.n_nominal dr_n = cfg['dr_n_range'] dr_d = cfg['dr_d_bar_range'] n_rand = n_base * (1.0 + rng.uniform(-dr_n, dr_n)) d_rand = rng.uniform(0, dr_d) self.dynamics.update_params(n=n_rand, d_bar=d_rand) self.lqr.update(self.dynamics) self.safety_filter.update_params(n=n_rand, d_bar=d_rand, dt=cfg['dt']) def _init_state(self, rng): """Sample initial state near hold point.""" cfg = self.cfg pos = cfg['init_pos_center'] + cfg['init_pos_range'] * rng.uniform(-1, 1, 3) vel_mag = rng.uniform(0, cfg['init_vel_range']) vel_dir = rng.standard_normal(3) vel_dir = vel_dir / (np.linalg.norm(vel_dir) + 1e-8) vel = vel_mag * vel_dir return np.concatenate([pos, vel]) def _get_obs(self): """Build observation: [e_hat, o_k].""" x_hat = self.kf.x_hat e_hat = x_hat - self.x_h # Observation with noise noise = np.random.randn(6) * self.cfg['obs_noise_std'] o_k = self.state + noise return np.concatenate([e_hat, o_k]).astype(np.float32) def reset(self, seed=None, options=None): super().reset(seed=seed) rng = self.np_random self._domain_randomize(rng) self.state = self._init_state(rng) self.kf.reset(self.state) self.current_step = 0 self.episode += 1 obs = self._get_obs() info = { 'initial_state': self.state.copy(), 'episode': self.episode, } return obs, info def step(self, action): """ Execute one step of the physics-embedded safe residual controller. action = u_res (residual from policy/agent) Internally: u_ref = u_nom + u_res, then u = Pi_safe(x_hat, u_ref) """ cfg = self.cfg u_res = np.array(action, dtype=np.float64) terminate_reason = 'none' # Nominal control x_hat = self.kf.x_hat u_nom = self.lqr.compute(x_hat, self.x_h) u_ref = u_nom + u_res # Safety filter u_safe, feasible, shield_info = self.safety_filter.filter(x_hat, u_ref) # Clamp to input constraints u_applied = np.clip(u_safe, -cfg['u_max'], cfg['u_max']) # Final execution guard (strictly disallow front crossing risk) y_limit = cfg['y_front_limit'] - cfg['front_margin'] y_next_worst_exec = self.safety_filter._predict_y_next_worst(x_hat, u_applied) front_guard_triggered = cfg['strict_front_enforce'] and (y_next_worst_exec > y_limit) if front_guard_triggered: reward, reward_info = self.reward_fn.compute_reward( self.state, np.zeros(3), u_ref, y_front_limit=cfg['y_front_limit'], front_margin=cfg['front_margin'], shield_level=shield_info.get('shield_level', 'C')) reward += cfg['penalty_front_violation'] terminated = True truncated = False terminate_reason = 'front_blocked' obs = self._get_obs() info = { **reward_info, 'u_nom': u_nom, 'u_res': u_res, 'u_ref': u_ref, 'u_applied': np.zeros(3), 'feasible': feasible, 'step': self.current_step, 'pos_err': np.linalg.norm(self.state[:3] - self.x_h[:3]), 'vel_err': np.linalg.norm(self.state[3:] - self.x_h[3:]), 'terminate_reason': terminate_reason, 'reason_code': self.REASON_CODES[terminate_reason], 'shield_level': shield_info.get('shield_level', 'C'), 'qp_success': shield_info.get('qp_success', False), 'front_margin': shield_info.get('front_margin', y_limit - y_next_worst_exec), 'y_next_worst': shield_info.get('y_next_worst', y_next_worst_exec), 'hard_front_blocked': True, 'front_guard_triggered': True, 'intervention_flag': int(shield_info.get('shield_level', 'C') != 'A'), } return obs, reward, terminated, truncated, info # Sample disturbance disturbance = self.dynamics.sample_disturbance() # State transition (exact ZOH) self.state = self.dynamics.step(self.state, u_applied, disturbance) # Kalman filter update self.kf.predict(u_applied) obs_meas = self.state + np.random.randn(6) * cfg['obs_noise_std'] self.kf.update(obs_meas) self.current_step += 1 # Reward reward, reward_info = self.reward_fn.compute_reward( self.state, u_applied, u_ref, y_front_limit=cfg['y_front_limit'], front_margin=cfg['front_margin'], shield_level=shield_info.get('shield_level', 'A')) # Termination checks terminated = False truncated = False if shield_info.get('terminate_recommended', False): reward += cfg['penalty_qp_infeasible'] terminated = True terminate_reason = shield_info.get('terminate_reason_hint', 'qp_infeasible') # Collision if (not terminated) and self.reward_fn.check_collision(self.state, cfg['rho_safe']): reward += self.reward_fn.penalty_collision terminated = True terminate_reason = 'collision' # Docking success if (not terminated) and self.reward_fn.check_docked( self.state, cfg['pos_dock_tol'], cfg['vel_dock_tol']): reward += self.reward_fn.bonus_dock truncated = True terminate_reason = 'success' # Time limit if (not terminated) and self.current_step >= cfg['max_steps']: reward += cfg.get('penalty_timeout', -20_000.0) truncated = True terminate_reason = 'time_limit' obs = self._get_obs() info = { **reward_info, 'u_nom': u_nom, 'u_res': u_res, 'u_ref': u_ref, 'u_applied': u_applied, 'feasible': feasible, 'step': self.current_step, 'pos_err': np.linalg.norm(self.state[:3] - self.x_h[:3]), 'vel_err': np.linalg.norm(self.state[3:] - self.x_h[3:]), 'terminate_reason': terminate_reason, 'reason_code': self.REASON_CODES[terminate_reason], 'shield_level': shield_info.get('shield_level', 'A'), 'qp_success': shield_info.get('qp_success', False), 'front_margin': shield_info.get('front_margin', 0.0), 'y_next_worst': shield_info.get('y_next_worst', y_next_worst_exec), 'hard_front_blocked': bool(shield_info.get('hard_front_blocked', False)), 'front_guard_triggered': False, 'intervention_flag': int(shield_info.get('shield_level', 'A') != 'A'), } return obs, reward, terminated, truncated, info def render(self, mode="human"): pass