Author: Ilias Chrysovergis
Date created: 2021/06/24
Last modified: 2024/03/12
Description: Implementation of a Proximal Policy Optimization agent for the CartPole-v1 environment.
This code example solves the CartPole-v1 environment using a Proximal Policy Optimization (PPO) agent.
A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical or when the cart moves more than 2.4 units from the center. CartPole-v1 additionally truncates episodes after 500 steps, so the highest return a single episode can normally achieve is 500. (The training loop below ignores the truncation signal returned by the environment, which is why the logged mean returns can exceed this value.)
PPO is a policy-gradient method that can be used for environments with either discrete or continuous action spaces. It trains a stochastic policy in an on-policy way, and it uses the Actor-Critic approach: the Actor maps observations to actions, while the Critic estimates the expected return of the agent for a given observation. First, it collects a set of trajectories for each epoch by sampling from the latest version of the stochastic policy. Then, the rewards-to-go and the advantage estimates are computed in order to update the policy and fit the value function. The policy is updated via a stochastic gradient ascent optimizer, while the value function is fitted via some gradient descent algorithm. This procedure is applied for many epochs until the environment is solved.
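As a reference point (standard notation from the PPO paper, not reproduced from the original text), the clipped surrogate objective that the policy update below maximizes is

$$
L^{\mathrm{CLIP}}(\theta) = \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}(r_t(\theta),\, 1-\epsilon,\, 1+\epsilon)\,\hat{A}_t\big)\Big],
\qquad
r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)},
$$

where $\hat{A}_t$ is the advantage estimate and $\epsilon$ corresponds to the `clip_ratio` hyperparameter in the code below.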
This code example uses Keras and TensorFlow v2. It is based on the original PPO paper, OpenAI's Spinning Up docs for PPO, and OpenAI's PPO implementation in TensorFlow v1.
OpenAI Spinning Up Github - PPO
This example uses the following libraries:

- numpy for n-dimensional arrays
- tensorflow and keras for building the deep RL PPO agent
- gymnasium for obtaining all the necessary information about the environment
- scipy.signal for calculating the discounted cumulative sums of vectors

import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
from keras import layers
import numpy as np
import tensorflow as tf
import gymnasium as gym
import scipy.signal
def discounted_cumulative_sums(x, discount):
    # Discounted cumulative sums of vectors for computing rewards-to-go and advantage estimates
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
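# For example (illustrative values, not from the original example):
#   discounted_cumulative_sums(np.array([1.0, 1.0, 1.0]), 0.9)
# returns approximately [2.71, 1.9, 1.0]: each entry is the discounted sum of the current
# and all subsequent entries, which is exactly what rewards-to-go and GAE require.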
class Buffer:
    # Buffer for storing trajectories
    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        # Buffer initialization
        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, logprobability):
        # Append one step of agent-environment interaction
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        # Finish the trajectory by computing advantage estimates and rewards-to-go
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)
        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]
        self.advantage_buffer[path_slice] = discounted_cumulative_sums(
            deltas, self.gamma * self.lam
        )
        self.return_buffer[path_slice] = discounted_cumulative_sums(
            rewards, self.gamma
        )[:-1]
        self.trajectory_start_index = self.pointer

    def get(self):
        # Get all data of the buffer and normalize the advantages
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean, advantage_std = (
            np.mean(self.advantage_buffer),
            np.std(self.advantage_buffer),
        )
        self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std
        return (
            self.observation_buffer,
            self.action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.logprobability_buffer,
        )
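# Note on finish_trajectory above (a restatement of the math it implements, not new behaviour):
# it computes the TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and then the
# Generalized Advantage Estimates A_t = sum_l (gamma * lam)^l * delta_{t+l}, along with the
# rewards-to-go R_t = sum_l gamma^l * r_{t+l} that serve as regression targets for the critic.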
def mlp(x, sizes, activation=keras.activations.tanh, output_activation=None):
    # Build a feedforward neural network
    for size in sizes[:-1]:
        x = layers.Dense(units=size, activation=activation)(x)
    return layers.Dense(units=sizes[-1], activation=output_activation)(x)
def logprobabilities(logits, a):
    # Compute the log-probabilities of taking actions a by using the logits (i.e. the output of the actor)
    logprobabilities_all = keras.ops.log_softmax(logits)
    logprobability = keras.ops.sum(
        keras.ops.one_hot(a, num_actions) * logprobabilities_all, axis=1
    )
    return logprobability
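# For each row, one_hot(a) * log_softmax(logits) keeps only the log-probability of the action
# that was actually taken, so logprobabilities() returns log pi(a | s) under the current policy.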
seed_generator = keras.random.SeedGenerator(1337)
# Sample action from actor
@tf.function
def sample_action(observation):
    logits = actor(observation)
    action = keras.ops.squeeze(
        keras.random.categorical(logits, 1, seed=seed_generator), axis=1
    )
    return logits, action
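# keras.random.categorical treats each row of `logits` as unnormalized log-probabilities and
# draws one action index per row, so data collection stays stochastic (i.e. on-policy sampling).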
# Train the policy by maximizing the PPO-Clip objective
@tf.function
def train_policy(
    observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
):
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        ratio = keras.ops.exp(
            logprobabilities(actor(observation_buffer), action_buffer)
            - logprobability_buffer
        )
        min_advantage = keras.ops.where(
            advantage_buffer > 0,
            (1 + clip_ratio) * advantage_buffer,
            (1 - clip_ratio) * advantage_buffer,
        )
        policy_loss = -keras.ops.mean(
            keras.ops.minimum(ratio * advantage_buffer, min_advantage)
        )
    policy_grads = tape.gradient(policy_loss, actor.trainable_variables)
    policy_optimizer.apply_gradients(zip(policy_grads, actor.trainable_variables))

    kl = keras.ops.mean(
        logprobability_buffer
        - logprobabilities(actor(observation_buffer), action_buffer)
    )
    kl = keras.ops.sum(kl)
    return kl
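# Note: the `min_advantage` expression above is an equivalent rewrite of the usual
# clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * advantage term: for positive advantages the
# clipped term equals (1 + clip_ratio) * advantage and for negative ones (1 - clip_ratio) * advantage,
# so taking the elementwise minimum with ratio * advantage reproduces the PPO-Clip objective.
# The returned (approximate) KL divergence is only used for early stopping of the policy updates.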
# Train the value function by regression on mean-squared error
@tf.function
def train_value_function(observation_buffer, return_buffer):
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        value_loss = keras.ops.mean((return_buffer - critic(observation_buffer)) ** 2)
    value_grads = tape.gradient(value_loss, critic.trainable_variables)
    value_optimizer.apply_gradients(zip(value_grads, critic.trainable_variables))
# Hyperparameters of the PPO algorithm
steps_per_epoch = 4000
epochs = 30
gamma = 0.99
clip_ratio = 0.2
policy_learning_rate = 3e-4
value_function_learning_rate = 1e-3
train_policy_iterations = 80
train_value_iterations = 80
lam = 0.97
target_kl = 0.01
hidden_sizes = (64, 64)
# True if you want to render the environment
render = False
# Initialize the environment and get the dimensionality of the
# observation space and the number of possible actions
env = gym.make("CartPole-v1")
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n
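# For CartPole-v1, observation_dimensions is 4 (cart position, cart velocity, pole angle,
# pole angular velocity) and num_actions is 2 (push the cart to the left or to the right).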
# Initialize the buffer
buffer = Buffer(observation_dimensions, steps_per_epoch)
# Initialize the actor and the critic as keras models
observation_input = keras.Input(shape=(observation_dimensions,), dtype="float32")
logits = mlp(observation_input, list(hidden_sizes) + [num_actions])
actor = keras.Model(inputs=observation_input, outputs=logits)
value = keras.ops.squeeze(mlp(observation_input, list(hidden_sizes) + [1]), axis=1)
critic = keras.Model(inputs=observation_input, outputs=value)
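# Note that `actor` and `critic` are built on the same Input tensor, but each call to mlp()
# creates fresh Dense layers, so the two networks do not share any weights.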
# Initialize the policy and the value function optimizers
policy_optimizer = keras.optimizers.Adam(learning_rate=policy_learning_rate)
value_optimizer = keras.optimizers.Adam(learning_rate=value_function_learning_rate)
# Initialize the observation, episode return and episode length
observation, _ = env.reset()
episode_return, episode_length = 0, 0
# Iterate over the number of epochs
for epoch in range(epochs):
    # Initialize the sum of the returns, lengths and number of episodes for each epoch
    sum_return = 0
    sum_length = 0
    num_episodes = 0

    # Iterate over the steps of each epoch
    for t in range(steps_per_epoch):
        if render:
            env.render()

        # Get the logits, action, and take one step in the environment
        observation = observation.reshape(1, -1)
        logits, action = sample_action(observation)
        observation_new, reward, done, _, _ = env.step(action[0].numpy())
        episode_return += reward
        episode_length += 1

        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)

        # Store obs, act, rew, v_t, logp_pi_t
        buffer.store(observation, action, reward, value_t, logprobability_t)

        # Update the observation
        observation = observation_new

        # Finish the trajectory if a terminal state is reached
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            last_value = 0 if done else critic(observation.reshape(1, -1))
            buffer.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, _ = env.reset()
            episode_return, episode_length = 0, 0

    # Get values from the buffer
    (
        observation_buffer,
        action_buffer,
        advantage_buffer,
        return_buffer,
        logprobability_buffer,
    ) = buffer.get()

    # Update the policy and implement early stopping using KL divergence
    for _ in range(train_policy_iterations):
        kl = train_policy(
            observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
        )
        if kl > 1.5 * target_kl:
            # Early Stopping
            break

    # Update the value function
    for _ in range(train_value_iterations):
        train_value_function(observation_buffer, return_buffer)

    # Print mean return and length for each epoch
    print(
        f" Epoch: {epoch + 1}. Mean Return: {sum_return / num_episodes}. Mean Length: {sum_length / num_episodes}"
    )
Epoch: 1. Mean Return: 20.512820512820515. Mean Length: 20.512820512820515
Epoch: 2. Mean Return: 24.84472049689441. Mean Length: 24.84472049689441
Epoch: 3. Mean Return: 33.333333333333336. Mean Length: 33.333333333333336
Epoch: 4. Mean Return: 38.46153846153846. Mean Length: 38.46153846153846
Epoch: 5. Mean Return: 59.701492537313435. Mean Length: 59.701492537313435
Epoch: 6. Mean Return: 80.0. Mean Length: 80.0
Epoch: 7. Mean Return: 111.11111111111111. Mean Length: 111.11111111111111
Epoch: 8. Mean Return: 200.0. Mean Length: 200.0
Epoch: 9. Mean Return: 266.6666666666667. Mean Length: 266.6666666666667
Epoch: 10. Mean Return: 444.44444444444446. Mean Length: 444.44444444444446
Epoch: 11. Mean Return: 400.0. Mean Length: 400.0
Epoch: 12. Mean Return: 1000.0. Mean Length: 1000.0
Epoch: 13. Mean Return: 2000.0. Mean Length: 2000.0
Epoch: 14. Mean Return: 444.44444444444446. Mean Length: 444.44444444444446
Epoch: 15. Mean Return: 2000.0. Mean Length: 2000.0
Epoch: 16. Mean Return: 4000.0. Mean Length: 4000.0
Epoch: 17. Mean Return: 2000.0. Mean Length: 2000.0
Epoch: 18. Mean Return: 4000.0. Mean Length: 4000.0
Epoch: 19. Mean Return: 4000.0. Mean Length: 4000.0
Epoch: 20. Mean Return: 2000.0. Mean Length: 2000.0
Epoch: 21. Mean Return: 2000.0. Mean Length: 2000.0
Epoch: 22. Mean Return: 4000.0. Mean Length: 4000.0
Epoch: 23. Mean Return: 4000.0. Mean Length: 4000.0
Epoch: 24. Mean Return: 4000.0. Mean Length: 4000.0
Epoch: 25. Mean Return: 4000.0. Mean Length: 4000.0
Epoch: 26. Mean Return: 4000.0. Mean Length: 4000.0
Epoch: 27. Mean Return: 4000.0. Mean Length: 4000.0
Epoch: 28. Mean Return: 4000.0. Mean Length: 4000.0
Epoch: 29. Mean Return: 4000.0. Mean Length: 4000.0
Epoch: 30. Mean Return: 4000.0. Mean Length: 4000.0
Before training:
After 8 epochs of training:
After 20 epochs of training:
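These animations come from rolling out the policy in the environment at different stages of training. As a minimal sketch of how such a rollout could be produced with the trained actor (the eval_env name and the greedy argmax action choice are assumptions, not part of the original example):

eval_env = gym.make("CartPole-v1", render_mode="human")  # or render_mode="rgb_array" to collect frames
observation, _ = eval_env.reset(seed=0)
done = False
episode_return = 0.0
while not done:
    logits = actor(observation.reshape(1, -1))
    # Act greedily at evaluation time instead of sampling from the stochastic policy.
    action = int(keras.ops.argmax(logits, axis=1)[0])
    observation, reward, terminated, truncated, _ = eval_env.step(action)
    episode_return += reward
    done = terminated or truncated
print(f"Evaluation return: {episode_return}")
eval_env.close()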