Writing Policies¶
A guide to writing effective robot control policies for BotManifold.
Policy Structure¶
Every policy must have a policy.py file with a policy() function:
import numpy as np

def policy(observation: dict) -> np.ndarray:
    """
    Your robot control policy.

    Args:
        observation: Dictionary of sensor data

    Returns:
        action: 7D numpy array of robot commands
    """
    # Your logic here
    return np.zeros(7)
Understanding Observations¶
Observations contain everything your policy needs to make decisions:
observation = {
    # Robot state
    'robot_state': np.array([...]),       # Position, orientation, velocity

    # Environment
    'object_positions': np.array([...]),  # Where objects are
    'target_positions': np.array([...]),  # Where objects should go

    # Metadata
    'time_remaining': float,              # Seconds left
    'gripper_state': float,               # 0=closed, 1=open
}
Extracting Information¶
def policy(observation):
    # Robot position (x, y, z)
    robot_pos = observation['robot_state'][:3]

    # Robot orientation (quaternion)
    robot_quat = observation['robot_state'][3:7]

    # Gripper state
    gripper_open = observation['robot_state'][13] > 0.5

    # All object positions
    objects = observation['object_positions']

    # First object position
    if len(objects) > 0:
        first_obj = objects[0]
Action Space¶
Actions are 7D vectors controlling the robot:
action = np.array([
    vx,       # X velocity (-1 to 1)
    vy,       # Y velocity (-1 to 1)
    vz,       # Z velocity (-1 to 1)
    roll,     # Roll rate (-1 to 1)
    pitch,    # Pitch rate (-1 to 1)
    yaw,      # Yaw rate (-1 to 1)
    gripper,  # -1=close, 1=open
])
Common Action Patterns¶
# Move forward (positive X)
action = np.array([0.5, 0, 0, 0, 0, 0, 0])
# Move up
action = np.array([0, 0, 0.5, 0, 0, 0, 0])
# Open gripper
action = np.array([0, 0, 0, 0, 0, 0, 1])
# Close gripper
action = np.array([0, 0, 0, 0, 0, 0, -1])
# Move toward target
direction = target - robot_pos
direction = direction / (np.linalg.norm(direction) + 1e-6)
action = np.concatenate([direction * 0.5, [0, 0, 0, 0]])
State Machine Pattern¶
A common approach is to use a state machine:
import numpy as np

class PolicyState:
    APPROACH = 0
    GRASP = 1
    LIFT = 2
    MOVE = 3
    PLACE = 4
    RELEASE = 5

# Global state (persists across calls)
state = PolicyState.APPROACH
current_target = 0

def policy(observation):
    global state, current_target

    robot_pos = observation['robot_state'][:3]
    gripper_open = observation['robot_state'][13] > 0.5
    objects = observation['object_positions']
    targets = observation['target_positions']

    if len(objects) == 0:
        return np.zeros(7)  # Done!

    obj = objects[current_target % len(objects)]
    target = targets[current_target % len(targets)]

    if state == PolicyState.APPROACH:
        dist = np.linalg.norm(robot_pos - obj)
        if dist < 0.03:
            state = PolicyState.GRASP
        direction = (obj - robot_pos) / (dist + 1e-6)
        return np.concatenate([direction * 0.8, [0, 0, 0, 1]])

    elif state == PolicyState.GRASP:
        if not gripper_open:
            state = PolicyState.LIFT
        return np.array([0, 0, 0, 0, 0, 0, -1])

    elif state == PolicyState.LIFT:
        if robot_pos[2] > 0.2:
            state = PolicyState.MOVE
        return np.array([0, 0, 0.5, 0, 0, 0, -1])

    elif state == PolicyState.MOVE:
        dist = np.linalg.norm(robot_pos[:2] - target[:2])
        if dist < 0.03:
            state = PolicyState.PLACE
        direction = (target - robot_pos)
        direction[2] = 0  # Stay at current height
        direction = direction / (np.linalg.norm(direction) + 1e-6)
        return np.concatenate([direction * 0.8, [0, 0, 0, -1]])

    elif state == PolicyState.PLACE:
        if robot_pos[2] < target[2] + 0.05:
            state = PolicyState.RELEASE
        return np.array([0, 0, -0.3, 0, 0, 0, -1])

    elif state == PolicyState.RELEASE:
        current_target += 1
        state = PolicyState.APPROACH
        return np.array([0, 0, 0.3, 0, 0, 0, 1])

    return np.zeros(7)
Best Practices¶
1. Start Simple¶
Begin with a basic policy that works, then improve:
def policy(observation):
    # Just move toward first object
    robot = observation['robot_state'][:3]
    obj = observation['object_positions'][0]
    direction = obj - robot
    return np.concatenate([direction * 0.1, [0, 0, 0, 0]])
2. Handle Edge Cases¶
def policy(observation):
    objects = observation['object_positions']

    # No objects left
    if len(objects) == 0:
        return np.zeros(7)

    # Check time
    if observation['time_remaining'] < 5:
        # Rush to complete
        pass

    # ... normal behavior here; always return a 7D action
    return np.zeros(7)
3. Smooth Movements¶
Avoid jerky movements by limiting action magnitude:
def smooth_action(action, max_vel=0.5):
    action = np.clip(action, -1, 1)
    action[:6] *= max_vel
    return action
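For example, you might cap velocities right before returning from policy(). A minimal sketch, where direction is the 3D unit vector toward your goal as in the earlier examples and the 0.5 cap is just an illustrative default:

# Inside policy(): cap velocities before returning
raw_action = np.concatenate([direction, [0, 0, 0, -1]])
return smooth_action(raw_action, max_vel=0.5)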
4. Use Numpy Efficiently¶
# Good: vectorized operations
distances = np.linalg.norm(objects - robot_pos, axis=1)
nearest = objects[np.argmin(distances)]

# Bad: Python loops
distances = []
for obj in objects:
    distances.append(np.linalg.norm(obj - robot_pos))
nearest = objects[distances.index(min(distances))]
Common Mistakes¶
1. Forgetting to Close Gripper¶
# Wrong: gripper stays open
action = np.array([0, 0, 0, 0, 0, 0, 0])
# Right: explicitly close
action = np.array([0, 0, 0, 0, 0, 0, -1])
2. Moving Too Fast¶
# Wrong: max velocity causes instability
action[:3] = direction * 1.0
# Right: moderate speed
action[:3] = direction * 0.5
3. Ignoring Orientation¶
Some tasks require specific gripper orientation:
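A minimal sketch of one way to handle this, assuming you have already computed roll/pitch/yaw errors (in radians) from the robot quaternion; the error variables and the gain are placeholders you would compute yourself, not values BotManifold provides:

# Illustrative only: roll_err, pitch_err, yaw_err are orientation errors
# derived from observation['robot_state'][3:7]
orientation_gain = 0.5
action = np.array([
    0.0, 0.0, 0.0,                                 # hold position
    np.clip(orientation_gain * roll_err, -1, 1),   # correct roll
    np.clip(orientation_gain * pitch_err, -1, 1),  # correct pitch
    np.clip(orientation_gain * yaw_err, -1, 1),    # correct yaw
    -1,                                            # keep gripper closed
])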
Performance Tips¶
- Minimize computation per step: You have ~100ms per step
- Cache expensive calculations: Store results across calls (see the caching sketch below)
- Use numpy: Avoid Python loops for array operations
- Pre-compute constants: Calculate fixed values once
# Pre-computed constants
APPROACH_SPEED = 0.5
GRASP_THRESHOLD = 0.03
def policy(observation):
    # Use constants instead of magic numbers
    ...
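Because module-level variables persist between calls (as in the state machine above), you can also cache expensive results across calls. A minimal sketch, where plan_path() stands in for any slow computation of your own (it is not a BotManifold API):

# Module-level cache; persists across policy() calls
_cached_plan = None

def policy(observation):
    global _cached_plan
    if _cached_plan is None:
        # plan_path() is a placeholder for your own expensive computation
        _cached_plan = plan_path(observation['object_positions'],
                                 observation['target_positions'])
    # ... use _cached_plan to choose the next action
    return np.zeros(7)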
Testing Locally¶
Before submitting, test your policy:
# Dry run to check for errors
import numpy as np
from policy import policy  # your policy.py

# Simulate an observation
test_obs = {
    'robot_state': np.zeros(14),
    'object_positions': np.array([[0.1, 0.2, 0.05]]),
    'target_positions': np.array([[0.3, 0.3, 0.05]]),
    'time_remaining': 60.0,
}

# Test your policy
action = policy(test_obs)
assert action.shape == (7,), f"Wrong shape: {action.shape}"
assert np.all(np.abs(action) <= 1), f"Actions out of range: {action}"
print("Policy test passed!")
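You can also get a rough check against the ~100ms per-step budget mentioned under Performance Tips. This is wall-clock timing on your own machine, not the official evaluation harness:

import time

# Average the policy's runtime over 100 calls
start = time.perf_counter()
for _ in range(100):
    policy(test_obs)
elapsed_ms = (time.perf_counter() - start) * 1000 / 100
print(f"Average step time: {elapsed_ms:.1f} ms")
assert elapsed_ms < 100, "Policy is slower than the ~100 ms step budget"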