Writing Policies¶
A guide to writing effective robot control policies for BotManifold.
Policy Structure¶
Every policy must have a policy.py file with a policy() function:
import numpy as np

def policy(observation: dict) -> np.ndarray:
    """
    Your robot control policy.

    Args:
        observation: Dictionary of sensor data

    Returns:
        action: 7D numpy array of robot commands
    """
    # Your logic here
    return np.zeros(7)
Understanding Observations¶
Observations contain everything your policy needs to make decisions:
observation = {
    # Robot state
    'robot_state': np.array([...]),       # Position, orientation, velocity

    # Environment
    'object_positions': np.array([...]),  # Where objects are
    'target_positions': np.array([...]),  # Where objects should go

    # Metadata
    'time_remaining': float,              # Seconds left
    'gripper_state': float,               # 0=closed, 1=open
}
Extracting Information¶
def policy(observation):
    # Robot position (x, y, z)
    robot_pos = observation['robot_state'][:3]

    # Robot orientation (quaternion)
    robot_quat = observation['robot_state'][3:7]

    # Gripper state
    gripper_open = observation['robot_state'][13] > 0.5

    # All object positions
    objects = observation['object_positions']

    # First object position
    if len(objects) > 0:
        first_obj = objects[0]
Action Space¶
Actions are 7D vectors controlling the robot:
action = np.array([
    vx,       # X velocity (-1 to 1)
    vy,       # Y velocity (-1 to 1)
    vz,       # Z velocity (-1 to 1)
    roll,     # Roll rate (-1 to 1)
    pitch,    # Pitch rate (-1 to 1)
    yaw,      # Yaw rate (-1 to 1)
    gripper,  # -1=close, 1=open
])
Common Action Patterns¶
# Move forward (positive X)
action = np.array([0.5, 0, 0, 0, 0, 0, 0])
# Move up
action = np.array([0, 0, 0.5, 0, 0, 0, 0])
# Open gripper
action = np.array([0, 0, 0, 0, 0, 0, 1])
# Close gripper
action = np.array([0, 0, 0, 0, 0, 0, -1])
# Move toward target
direction = target - robot_pos
direction = direction / (np.linalg.norm(direction) + 1e-6)
action = np.concatenate([direction * 0.5, [0, 0, 0, 0]])
State Machine Pattern¶
A common approach is to use a state machine:
import numpy as np

class PolicyState:
    APPROACH = 0
    GRASP = 1
    LIFT = 2
    MOVE = 3
    PLACE = 4
    RELEASE = 5

# Global state (persists across calls)
state = PolicyState.APPROACH
current_target = 0

def policy(observation):
    global state, current_target

    robot_pos = observation['robot_state'][:3]
    gripper_open = observation['robot_state'][13] > 0.5
    objects = observation['object_positions']
    targets = observation['target_positions']

    if len(objects) == 0:
        return np.zeros(7)  # Done!

    obj = objects[current_target % len(objects)]
    target = targets[current_target % len(targets)]

    if state == PolicyState.APPROACH:
        dist = np.linalg.norm(robot_pos - obj)
        if dist < 0.03:
            state = PolicyState.GRASP
        direction = (obj - robot_pos) / (dist + 1e-6)
        return np.concatenate([direction * 0.8, [0, 0, 0, 1]])

    elif state == PolicyState.GRASP:
        if not gripper_open:
            state = PolicyState.LIFT
        return np.array([0, 0, 0, 0, 0, 0, -1])

    elif state == PolicyState.LIFT:
        if robot_pos[2] > 0.2:
            state = PolicyState.MOVE
        return np.array([0, 0, 0.5, 0, 0, 0, -1])

    elif state == PolicyState.MOVE:
        dist = np.linalg.norm(robot_pos[:2] - target[:2])
        if dist < 0.03:
            state = PolicyState.PLACE
        direction = (target - robot_pos)
        direction[2] = 0  # Stay at current height
        direction = direction / (np.linalg.norm(direction) + 1e-6)
        return np.concatenate([direction * 0.8, [0, 0, 0, -1]])

    elif state == PolicyState.PLACE:
        if robot_pos[2] < target[2] + 0.05:
            state = PolicyState.RELEASE
        return np.array([0, 0, -0.3, 0, 0, 0, -1])

    elif state == PolicyState.RELEASE:
        current_target += 1
        state = PolicyState.APPROACH
        return np.array([0, 0, 0.3, 0, 0, 0, 1])

    return np.zeros(7)
Best Practices¶
1. Start Simple¶
Begin with a basic policy that works, then improve:
def policy(observation):
    # Just move toward first object
    robot = observation['robot_state'][:3]
    obj = observation['object_positions'][0]
    direction = obj - robot
    return np.concatenate([direction * 0.1, [0, 0, 0, 0]])
2. Handle Edge Cases¶
def policy(observation):
    objects = observation['object_positions']

    # No objects left
    if len(objects) == 0:
        return np.zeros(7)

    # Check time
    if observation['time_remaining'] < 5:
        # Rush to complete
        pass

    # ... normal behavior here; always return a 7D action
    return np.zeros(7)
3. Smooth Movements¶
Avoid jerky movements by limiting action magnitude:
def smooth_action(action, max_vel=0.5):
    action = np.clip(action, -1, 1)
    action[:6] *= max_vel
    return action
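For example, you might cap velocities right before returning from policy(). A minimal sketch, where direction is the 3D unit vector toward your goal as in the earlier examples and the 0.5 cap is just an illustrative default:

# Inside policy(): cap velocities before returning
raw_action = np.concatenate([direction, [0, 0, 0, -1]])
return smooth_action(raw_action, max_vel=0.5)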
4. Use Numpy Efficiently¶
# Good: vectorized operations
distances = np.linalg.norm(objects - robot_pos, axis=1)
nearest = objects[np.argmin(distances)]

# Bad: Python loops
distances = []
for obj in objects:
    distances.append(np.linalg.norm(obj - robot_pos))
nearest = objects[distances.index(min(distances))]
Common Mistakes¶
1. Forgetting to Close Gripper¶
# Wrong: gripper stays open
action = np.array([0, 0, 0, 0, 0, 0, 0])
# Right: explicitly close
action = np.array([0, 0, 0, 0, 0, 0, -1])
2. Moving Too Fast¶
# Wrong: max velocity causes instability
action[:3] = direction * 1.0
# Right: moderate speed
action[:3] = direction * 0.5
3. Ignoring Orientation¶
Some tasks require specific gripper orientation:
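A minimal sketch of one way to handle this, assuming you have already computed roll/pitch/yaw errors (in radians) from the robot quaternion; the error variables and the gain are placeholders you would compute yourself, not values BotManifold provides:

# Illustrative only: roll_err, pitch_err, yaw_err are orientation errors
# derived from observation['robot_state'][3:7]
orientation_gain = 0.5
action = np.array([
    0.0, 0.0, 0.0,                                 # hold position
    np.clip(orientation_gain * roll_err, -1, 1),   # correct roll
    np.clip(orientation_gain * pitch_err, -1, 1),  # correct pitch
    np.clip(orientation_gain * yaw_err, -1, 1),    # correct yaw
    -1,                                            # keep gripper closed
])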
Performance Tips¶
- Minimize computation per step: You have ~100ms per step
- Cache expensive calculations: Store results across calls (see the caching sketch below)
- Use numpy: Avoid Python loops for array operations
- Pre-compute constants: Calculate fixed values once
# Pre-computed constants
APPROACH_SPEED = 0.5
GRASP_THRESHOLD = 0.03
def policy(observation):
    # Use constants instead of magic numbers
    ...
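Because module-level variables persist between calls (as in the state machine above), you can also cache expensive results across calls. A minimal sketch, where plan_path() stands in for any slow computation of your own (it is not a BotManifold API):

# Module-level cache; persists across policy() calls
_cached_plan = None

def policy(observation):
    global _cached_plan
    if _cached_plan is None:
        # plan_path() is a placeholder for your own expensive computation
        _cached_plan = plan_path(observation['object_positions'],
                                 observation['target_positions'])
    # ... use _cached_plan to choose the next action
    return np.zeros(7)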
Testing Locally¶
Before submitting, test your policy:
# Dry run to check for errors
import numpy as np
from policy import policy  # your policy.py

# Simulate an observation
test_obs = {
    'robot_state': np.zeros(14),
    'object_positions': np.array([[0.1, 0.2, 0.05]]),
    'target_positions': np.array([[0.3, 0.3, 0.05]]),
    'time_remaining': 60.0,
}

# Test your policy
action = policy(test_obs)
assert action.shape == (7,), f"Wrong shape: {action.shape}"
assert np.all(np.abs(action) <= 1), f"Actions out of range: {action}"
print("Policy test passed!")
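You can also get a rough check against the ~100ms per-step budget mentioned under Performance Tips. This is wall-clock timing on your own machine, not the official evaluation harness:

import time

# Average the policy's runtime over 100 calls
start = time.perf_counter()
for _ in range(100):
    policy(test_obs)
elapsed_ms = (time.perf_counter() - start) * 1000 / 100
print(f"Average step time: {elapsed_ms:.1f} ms")
assert elapsed_ms < 100, "Policy is slower than the ~100 ms step budget"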