How to accumulate and apply gradients for an async n-step DQNetwork update in TensorFlow?

I am trying to implement Asynchronous Methods for Deep Reinforcement Learning, and one of the steps requires accumulating the gradient over several steps and then applying it. What is the best way to do this in TensorFlow? I have gotten as far as accumulating the gradients, but I don't think this is the fastest way to do it (lots of transfers from TensorFlow to Python and back). Any suggestions are welcome. This is the code of a toy NN: it does not model or compute anything, it just exercises the operations I want to use.
import numpy as np
import tensorflow as tf

from model import *

graph = tf.Graph()

with graph.as_default():

    state = tf.placeholder(tf.float32, shape=[None, 80, 80, 1])

    with tf.variable_scope('layer1'):
        W = weight_variable([8, 8, 1, 32])
        variable_summaries(W, "layer1/W")
        b = bias_variable([32])
        variable_summaries(b, "layer1/b")
        h = conv2d(state, W, 4) + b
        activation = tf.nn.relu(h)
        pool1 = max_pool_2x2(activation)

    print(pool1.get_shape())
    pool1 = tf.reshape(pool1, [-1, 3200])

    with tf.variable_scope('readout'):
        W = weight_variable([3200, 3])
        b = bias_variable([3])
        logits = tf.matmul(pool1, W) + b
        variable_summaries(h, "y")

    action_indexes = tf.placeholder(tf.int32, shape=[None], name="action_indexes")

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, action_indexes)

    starter_learning_rate = 1e-6

    global_step = tf.Variable(0, trainable=False)

    # decay every 10000 steps with a base of 0.96:
    learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                               global_step,
                                               10000, 0.96, staircase=True)

    optimizer = tf.train.RMSPropOptimizer(learning_rate)

    gradients_and_variables = optimizer.compute_gradients(loss, tf.trainable_variables())

    discounted_values = tf.placeholder(tf.float32, shape=[None, 1])

    with tf.Session(graph=graph) as s:

        for v in tf.trainable_variables():
            print(v.name, v.dtype, v.get_shape())

        s.run(tf.initialize_all_variables())

        feed_dict = {
            state: np.zeros([1, 80, 80, 1]),
            action_indexes: [1],
        }

        var_to_grad = dict((var.name, grad) for grad, var in gradients_and_variables)
        keys = sorted(var_to_grad.keys())
        print(keys)

        name_to_var = dict((var.name, var) for _, var in gradients_and_variables)

        for i in range(10):
            # fetch the gradients as numpy arrays and accumulate them on the Python side
            gradients = s.run([var_to_grad[k] for k in keys], feed_dict=feed_dict)
            for k, v in zip(keys, gradients):
                var_to_grad[k] += v

        for k in keys:
            print(var_to_grad[k])

        # feed the accumulated gradients back in and apply them
        s.run(optimizer.apply_gradients((g, name_to_var[v]) for v, g in var_to_grad.items()),
              feed_dict=feed_dict)
Updated code after @yaroslav's suggestion:
import numpy as np
import tensorflow as tf

from model import *

graph = tf.Graph()

with graph.as_default():

    minibatch = 32
    state = tf.placeholder(tf.float32, shape=[minibatch, 80, 80, 1], name="input")

    with tf.variable_scope('layer1'):
        W = weight_variable([8, 8, 1, 32])
        variable_summaries(W, "layer1/W")
        b = bias_variable([32])
        variable_summaries(b, "layer1/b")
        h = conv2d(state, W, 4) + b
        activation = tf.nn.relu(h)
        pool1 = max_pool_2x2(activation)

    print(pool1.get_shape())
    pool1 = tf.reshape(pool1, [-1, 3200])

    with tf.variable_scope('readout'):
        W = weight_variable([3200, 3])
        b = bias_variable([3])
        logits = tf.matmul(pool1, W) + b
        variable_summaries(h, "y")

    action_indexes = tf.placeholder(tf.int32, shape=[minibatch], name="action_indexes")

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, action_indexes)

    starter_learning_rate = 1e-6

    global_step = tf.Variable(0, trainable=False)

    # decay every 10000 steps with a base of 0.96:
    learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                               global_step,
                                               10000, 0.96, staircase=True)

    optimizer = tf.train.RMSPropOptimizer(learning_rate)

    trainable_variables = tf.trainable_variables()
    varname_to_var = dict((v.name, v) for v in trainable_variables)
    keys = sorted(varname_to_var.keys())

    gradients_and_variables = optimizer.compute_gradients(loss, [varname_to_var[k] for k in keys])
    var_to_grad = dict((var.name, grad) for grad, var in gradients_and_variables)
    name_to_var = dict((var.name, var) for _, var in gradients_and_variables)

    # save the gradients in graph memory: one accumulator variable per gradient
    var_to_ref_grad = {}
    for k in keys:
        grad = var_to_grad[k]
        print(k, grad.get_shape())
        ref = tf.Variable(tf.zeros_like(grad))
        ref = ref.assign_add(grad)
        var_to_ref_grad[k] = ref

    discounted_values = tf.placeholder(tf.float32, shape=[None, 1], name='discounted_values')

    # control when to apply gradients
    compute_gradients_flag = tf.placeholder(tf.int32, name="compute_gradients")

    def fn1():
        var_grad_list = []
        for k in keys:
            grad = var_to_ref_grad[k]
            var = varname_to_var[k]
            var_grad_list.append((grad, var))
        optimizer.apply_gradients(var_grad_list)
        return tf.no_op()

    fn2 = lambda: tf.no_op()

    last_op = tf.cond(tf.equal(compute_gradients_flag, 1), fn1, fn2)

    with tf.Session(graph=graph) as s:

        feed_dict = {
            state: np.zeros([minibatch, 80, 80, 1]),
            action_indexes: [1] * minibatch,
            compute_gradients_flag: 0,
        }

        s.run(tf.initialize_all_variables())

        for i in range(10):
            # accumulate gradients
            s.run(last_op, feed_dict=feed_dict)
You could keep everything in TF by running 'assign' ops that save the gradients into Variables instead of fetching the values, and then run 'assign_add' instead of accumulating –
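A minimal sketch of what that comment suggests, reusing the `loss`, `optimizer`, session `s`, and `feed_dict` defined above; the names `accumulators`, `accum_ops`, `apply_op`, `zero_ops`, and `n_steps` are illustrative, not part of the original code:

# one non-trainable accumulator per trainable variable, same shape, initialized to zero
tvars = tf.trainable_variables()
accumulators = [tf.Variable(tf.zeros_like(v.initialized_value()), trainable=False)
                for v in tvars]

grads_and_vars = optimizer.compute_gradients(loss, tvars)

# run once per step: add the current gradient into the accumulators, entirely in-graph
accum_ops = [acc.assign_add(grad)
             for acc, (grad, _) in zip(accumulators, grads_and_vars)]

# run once every n steps: apply the summed gradients to the trainable variables
apply_op = optimizer.apply_gradients(list(zip(accumulators, tvars)))

# run right after applying: reset the accumulators
zero_ops = [acc.assign(tf.zeros_like(acc)) for acc in accumulators]

for step in range(n_steps):
    s.run(accum_ops, feed_dict=feed_dict)  # gradient values never cross into Python
s.run(apply_op)
s.run(zero_ops)

These ops would need to be created inside the `graph.as_default()` context, and the accumulator variables initialized along with everything else, before the loop runs.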
@YaroslavBulatov Do you think TF will add more interfaces for implementations like this? It would be nice to be able to use TF for reinforcement learning as well. –
@SungKim There is an imperative execution interface [in the works](https://github.com/tensorflow/tensorflow/pull/2595), so you can use standard Python constructs while keeping the data on the GPU –