Запуск обучения с подкреплением на TensorFlow в Машинном обучении Azure.
Предварительное требование
- Учетная запись Azure
- Машинное обучение Azure
- Хранилище Azure
- Нужен TensorFlow > 2.2.0
Шаги
- Создать новый вычислительный экземпляр
- Создать новый блокнот
- Я использую пример TensorFlow RL из Colab
# Notebook cell: every shell command needs its own line, otherwise only the
# first `!` command runs and the rest become its arguments.
# Upgrade TensorFlow in the notebook environment.
!pip install --upgrade tensorflow
# System packages (xvfb for the virtual display started later; ffmpeg is
# presumably what imageio uses to encode the mp4 videos - confirm).
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg
# Python packages imported by the cells below.
!pip install 'imageio==2.4.0'
!pip install pyvirtualdisplay
!pip install tf-agents
- Импортировать библиотеки
from __future__ import absolute_import, division, print_function
import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay
import tensorflow as tf
- Вывести версию TensorFlow
import tensorflow as tf
print(tf.__version__)
- Импортировать модули TF-Agents
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
- Настроить виртуальный дисплей
# Rendering OpenAI gym environments needs a display, so create a virtual one
# (1400x900, not visible) and start it.
virtual_screen = pyvirtualdisplay.Display(visible=0, size=(1400, 900))
display = virtual_screen.start()
# Echo the TensorFlow version string as the cell output.
tf.version.VERSION
- у меня версия 2.5.0
- Гипер параметры
# --- Training -------------------------------------------------------------
# Number of training iterations; also the upper bound of the plotted x-axis.
num_iterations = 20000  # @param {type:"integer"}
# Step size handed to the Adam optimizer.
learning_rate = 0.001  # @param {type:"number"}
# Number of trajectories sampled from the replay buffer per batch.
batch_size = 64  # @param {type:"integer"}

# --- Data collection ------------------------------------------------------
# Steps collected with the random policy before training starts.
initial_collect_steps = 100  # @param {type:"integer"}
collect_steps_per_iteration = 1  # @param {type:"integer"}
# Passed to the replay buffer as its max_length.
replay_buffer_max_length = 100000  # @param {type:"integer"}

# --- Logging and evaluation -----------------------------------------------
log_interval = 200  # @param {type:"integer"}
# Episodes averaged by each evaluation.
num_eval_episodes = 10  # @param {type:"integer"}
# Spacing, in iterations, between the evaluation points that get plotted.
eval_interval = 1000  # @param {type:"integer"}
- Среда
# Name of the gym environment used throughout the notebook.
env_name = 'CartPole-v0'
# Load it through the TF-Agents gym suite.
env = suite_gym.load(env_name)

#@test {"skip": true}
# Reset the environment and show the rendered frame as the cell output.
env.reset()
PIL.Image.fromarray(env.render())
Вывод
- Настройка агента Tensorflow
# Inspect the reward and action specs of the loaded environment.
print('Reward Spec:')
print(env.time_step_spec().reward)

print('Action Spec:')
print(env.action_spec())

# Take a single step with action 1 and print the time steps before and after.
time_step = env.reset()
print('Time step:')
print(time_step)

action = np.array(1, dtype=np.int32)

next_time_step = env.step(action)
print('Next time step:')
print(next_time_step)

# Separate Python environments for training and for evaluation ...
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

# ... each wrapped in a TFPyEnvironment for use with the TF agent and policies.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
- Оптимизатор
# Q-network. The agent below requires `q_net`, so it is built here from a
# stack of dense layers followed by one output unit per discrete action.
fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
# Number of discrete actions, derived from the action spec's bounds.
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1


def dense_layer(num_units):
    """Return a ReLU Dense layer with `num_units` units."""
    return tf.keras.layers.Dense(
        num_units,
        activation=tf.keras.activations.relu,
        kernel_initializer=tf.keras.initializers.VarianceScaling(
            scale=2.0, mode='fan_in', distribution='truncated_normal'))


dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
# Output layer: one linear unit (no activation) per action.
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])

# Adam optimizer using the notebook-level learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Variable handed to the agent as its train-step counter.
train_step_counter = tf.Variable(0)

# DQN agent built from the training environment's specs, the Q-network and
# the optimizer, using common.element_wise_squared_loss as the TD-error loss.
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()
- Политика
# The agent exposes two policies: one for evaluation, one for data collection.
eval_policy = agent.policy
collect_policy = agent.collect_policy

# Random policy built from the training environment's specs.
random_policy = random_tf_policy.RandomTFPolicy(
    train_env.time_step_spec(), train_env.action_spec())

# Ask the random policy for an action on a freshly reset example environment;
# the resulting policy step is the cell output.
example_environment = tf_py_environment.TFPyEnvironment(
    suite_gym.load('CartPole-v0'))
time_step = example_environment.reset()
random_policy.action(time_step)
- Метрики и оценка
#@test {"skip": true}
def compute_avg_return(environment, policy, num_episodes=10):
    """Return the average episode return of `policy` on `environment`.

    Runs `num_episodes` episodes. Each episode starts with
    `environment.reset()` and steps with the action chosen by
    `policy.action(...)` until the time step reports `is_last()`. The rewards
    of the time steps returned by `environment.step(...)` are summed per
    episode.

    Args:
        environment: Object providing `reset()` and `step(action)`, both
            returning a time step with `is_last()` and `reward`.
        policy: Object whose `action(time_step)` result has an `action`
            attribute.
        num_episodes: Number of episodes to average over.

    Returns:
        The first element of `.numpy()` of the mean episode return.
    """
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    # The accumulated reward exposes `.numpy()`; take its first element.
    return avg_return.numpy()[0]


# See also the metrics module for standard implementations of different metrics.
# https://github.com/tensorflow/agents/tree/master/tf_agents/metrics
# Baseline: average return of the random policy on the evaluation environment
# over num_eval_episodes episodes (shown as the cell output).
compute_avg_return(eval_env, random_policy, num_eval_episodes)
- Буфер воспроизведения
# Uniform replay buffer laid out according to the agent's collect_data_spec,
# batched like the training environment and sized by replay_buffer_max_length.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length,
)

# Inspect the spec of the stored items ...
agent.collect_data_spec

# ... and the names of its fields (cell output).
agent.collect_data_spec._fields
- Сбор данных
#@test {"skip": true}
def collect_step(environment, policy, buffer):
    """Take one step in `environment` with `policy` and store it in `buffer`.

    Args:
        environment: Environment providing `current_time_step()` and
            `step(action)`.
        policy: Policy whose `action(time_step)` result has an `action`
            attribute.
        buffer: Replay buffer providing `add_batch(trajectory)`.
    """
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    buffer.add_batch(traj)


def collect_data(env, policy, buffer, steps):
    """Call `collect_step` `steps` times with the same env, policy and buffer."""
    for _ in range(steps):
        collect_step(env, policy, buffer)


# Seed the replay buffer with steps taken by the random policy.
collect_data(train_env, random_policy, replay_buffer, initial_collect_steps)

# This loop is so common in RL, that we provide standard implementations.
# For more details see tutorial 4 or the drivers module.
# https://github.com/tensorflow/agents/blob/master/docs/tutorials/4_drivers_tutorial.ipynb
# https://www.tensorflow.org/agents/api_docs/python/tf_agents/drivers
# Dataset generates trajectories with shape [Bx2x...]
# (B = batch_size; num_steps=2 samples two adjacent steps per item.)
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)

# Show the dataset object as the cell output.
dataset

# Iterator from which batches of experience are drawn.
iterator = iter(dataset)
print(iterator)
- Обучение агента
iterator = iter(dataset)
print(iterator)

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step.
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training; `returns` collects one
# average return per evaluation point and is what the plot below draws.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    collect_data(train_env, agent.collect_policy, replay_buffer,
                 collect_steps_per_iteration)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(
            eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)
- Визуализация
#@test {"skip": true}
# x-axis values: 0, eval_interval, 2 * eval_interval, ..., num_iterations.
# `returns` is expected to hold one value per entry of `iterations`.
iterations = range(0, num_iterations + 1, eval_interval)
plt.plot(iterations, returns)
plt.ylabel('Average Return')
plt.xlabel('Iterations')
# Cap the top of the y-axis at 250.
plt.ylim(top=250)
- Видео
def embed_mp4(filename):
    """Embeds an mp4 file in the notebook.

    Reads `filename` as bytes, base64-encodes it and returns an
    `IPython.display.HTML` object whose <video> tag carries the data inline.

    Args:
        filename: Path of the mp4 file to embed.

    Returns:
        An `IPython.display.HTML` object wrapping the <video> tag.
    """
    # Context manager so the file handle is closed once the bytes are read.
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    b64 = base64.b64encode(video)
    tag = (
        ' <video width="640" height="480" controls>'
        ' <source src="data:video/mp4;base64,{0}" type="video/mp4">'
        ' Your browser does not support the video tag.'
        ' </video>'
    ).format(b64.decode())

    return IPython.display.HTML(tag)
def create_policy_eval_video(policy, filename, num_episodes=5, fps=30):
    """Record `policy` acting in the evaluation environment as an mp4.

    Uses the module-level `eval_env` to step and `eval_py_env` to render. One
    frame is appended after each reset and after each step.

    Args:
        policy: Policy whose `action(time_step)` result has an `action`
            attribute.
        filename: Output path without extension; ".mp4" is appended.
        num_episodes: Number of episodes to record.
        fps: Frame rate passed to the imageio writer.

    Returns:
        The result of `embed_mp4` for the written file.
    """
    filename = filename + ".mp4"
    with imageio.get_writer(filename, fps=fps) as video:
        for _ in range(num_episodes):
            time_step = eval_env.reset()
            video.append_data(eval_py_env.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = eval_env.step(action_step.action)
                video.append_data(eval_py_env.render())
    # Embed only after the writer has been closed by the `with` block.
    return embed_mp4(filename)


# Video of the agent's policy, written to "trained-agent.mp4".
create_policy_eval_video(agent.policy, "trained-agent")
- Изображение
# Same recording, but driven by the random policy; written to "random-agent.mp4".
create_policy_eval_video(random_policy, "random-agent")
Первоначально опубликовано на https://github.com.