Je suis en train de construire un modèle de réseau neuronal récurrent simple en utilisant tensorflow sur Mac OS X. Il est juste un modèle de jouet et la taille des données d'entrée ne dépasse pas 3 Mo donc il ne devrait pas consomme beaucoup de mémoire. Cependant, lorsque j'exécute un modèle, l'utilisation de la mémoire augmente considérablement chaque lot d'entraînement et dépasse 10 Go. C'était pour seulement deux itérations. Je ne pouvais pas courir plus.Fuite de mémoire Tensorflow avec réseau neuronal récurrent
Voici l'intégralité du code.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
from pympler import summary
class RNN():
"""The RNN model."""
#@profile
def inference(self):
"""calculate outputs and loss for a single batch"""
total_loss = 0.0
outputs = []
for i in range(self.batch_size):
state = self.init_state
outputs.append([])
loss = 0.0
for j in range(self.num_steps):
state, output = self.next_state(self.x[i,j,:],state)
outputs[i].append(output)
loss += tf.square(self.y[i,j,:]-output)
total_loss+=loss
return outputs, total_loss/(self.batch_size*self.num_steps)
def __init__(self, is_training, config):
self.sess = sess = tf.Session()
self.prev_see = prev_see = config.prev_see
self.num_steps = num_steps = config.num_steps
#maybe "self.num_hidden =" part could be removed
self.num_hidden = num_hidden = config.num_hidden
self.batch_size = config.batch_size
self.epoch = config.epoch
self.learning_rate = config.learning_rate
self.summaries_dir = config.summaries_dir
with tf.name_scope('placeholders'):
self.x = tf.placeholder(tf.float32,[None,num_steps,config.prev_see],
name='input-x')
self.y = tf.placeholder(tf.float32, [None,num_steps,1],name='input-y')
default_init_state = tf.zeros([num_hidden])
self.init_state = tf.placeholder_with_default(default_init_state,[num_hidden],
name='state_placeholder')
def weight_variable(self,shape):
"""Create a weight variable with appropriate initialization."""
initial = tf.truncated_normal(shape,stddev=0.1)
return tf.Variable(initial)
def bias_variable(self,shape):
"""Create a bias variable with appropriate initialization."""
initial = tf.constant(0.1,shape=shape)
return tf.Variable(initial)
def variable_summaries(self,var,name):
"""Attach a lot of summaries to a Tensor."""
with tf.name_scope('summaries'):
mean = tf.reduce_mean(var)
tf.scalar_summary('mean/'+name,mean)
with tf.name_scope('stddev'):
stddev = tf.sqrt(tf.reduce_sum(tf.square(var-mean)))
tf.scalar_summary('stddev/'+name,stddev)
tf.scalar_summary('max/'+name, tf.reduce_max(var))
tf.scalar_summary('min/'+name, tf.reduce_min(var))
tf.histogram_summary(name, var)
#declare weight variables as property
layer_name = 'rnn_layer'
with tf.name_scope(layer_name):
with tf.name_scope('U'):
self.U = U = weight_variable(self,[prev_see,num_hidden])
variable_summaries(self,U,layer_name+'/U')
with tf.name_scope('W'):
self.W = W = weight_variable(self,[num_hidden,num_hidden])
variable_summaries(self,W,layer_name+'/W')
with tf.name_scope('b_W'):
self.b_W = b_W = bias_variable(self,[num_hidden])
variable_summaries(self,b_W,layer_name+'/b_W')
with tf.name_scope('V'):
self.V = V = weight_variable(self,[num_hidden,1])
variable_summaries(self,V,layer_name+'/V')
with tf.name_scope('b_V'):
self.b_V = b_V = bias_variable(self,[1])
variable_summaries(self,b_V,layer_name+'/b_V')
self.merged = tf.merge_all_summaries()
self.train_writer = tf.train.SummaryWriter(config.summaries_dir,sess.graph)
tf.initialize_all_variables().run(session=sess)
_,self.loss = self.inference()
def next_state(self,x,s_prev):
"""calculate next state and output"""
x = tf.reshape(x,[1,-1])
s_prev = tf.reshape(s_prev,[1,-1])
s_next = tf.tanh(tf.matmul(x,self.U)+tf.matmul(s_prev,self.W)+self.b_W)
output = tf.matmul(s_next,self.V)+self.b_V
return s_next, output
#@profile
def batch_train(self,feed_dict):
"""train the network for a single batch"""
loss = self.loss
train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss)
summary,loss_value, _ = self.sess.run([self.merged,loss, train_step],feed_dict=feed_dict)
#self.train_writer.add_summary(summary)
print(loss_value)
class TrainConfig():
"""Train Config."""
total_steps = 245
test_ratio = 0.3
prev_see = 100
num_steps = int(round((total_steps-prev_see)*(1-test_ratio)))
num_hidden = 10
batch_size = 5
epoch = 3
learning_rate = 0.1
summaries_dir = '/Users/Kyungsu/StockPrediction/log'
class DebugConfig():
"""For debugging memory leak."""
total_steps = 100
test_ratio = 0.3
prev_see = 100
num_steps = 10
num_hidden = 10
batch_size = 5
epoch = 2
learning_rate = 0.1
summaries_dir = '/Users/Kyungsu/StockPrediction/log'
#@profile
def run_epoch(m,x_data,y_data):
num_batch = ((len(x_data)-1) // m.batch_size)+1
#num_batch = 100
for i in range(num_batch):
x_batch = x_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
y_batch = y_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
feed_dict = {m.x:x_batch,m.y:y_batch}
print("%dth/%dbatches"%(i+1,num_batch))
m.batch_train(feed_dict)
def process_data(data,config):
data_size = len(data)
prev_see = config.prev_see
num_steps = config.num_steps
x = np.zeros((data_size,num_steps,prev_see))
y = np.zeros((data_size,num_steps,1))
for i in range(data_size):
for j in range(num_steps-prev_see):
x[i,j,:] = data[i,i:i+prev_see]
y[i,j,0] = data[i,i+prev_see]
return x,y
#@profile
def main():
train_config = TrainConfig()
debug_config = DebugConfig()
data = np.load('processed_data.npy')
x,y = process_data(data,train_config)
rnn_model = RNN(True,train_config)
#training phase
for i in range(rnn_model.epoch):
print("%dth epoch"%(i+1))
run_epoch(rnn_model,x,y)
main()
Et voici le résultat de memory_profiler. Chose étrange, la majeure partie de la mémoire est allouée en pour la boucle. (Voir ligne 163, 135) Je suppose que cela signifie que la mémoire fuit.
Line # Mem usage Increment Line Contents
================================================
11 53.062 MiB 0.000 MiB @profile
12 def __init__(self, is_training, config):
13 53.875 MiB 0.812 MiB self.sess = sess = tf.Session()
14
15 53.875 MiB 0.000 MiB self.prev_see = prev_see = config.prev_see
16 53.875 MiB 0.000 MiB self.num_steps = num_steps = config.num_steps
17 #maybe "self.num_hidden =" part could be removed
18 53.875 MiB 0.000 MiB self.num_hidden = num_hidden = config.num_hidden
19 53.875 MiB 0.000 MiB self.batch_size = config.batch_size
20 53.875 MiB 0.000 MiB self.epoch = config.epoch
21 53.875 MiB 0.000 MiB self.learning_rate = config.learning_rate
22 53.875 MiB 0.000 MiB self.summaries_dir = config.summaries_dir
23
24 53.875 MiB 0.000 MiB with tf.name_scope('input'):
25 53.875 MiB 0.000 MiB self.x = tf.placeholder(tf.float32,[None,num_steps,config.prev_see],
26 53.957 MiB 0.082 MiB name='input-x')
27 53.973 MiB 0.016 MiB self.y = tf.placeholder(tf.float32, [None,num_steps,1],name='input-y')
28
29 55.316 MiB 1.344 MiB def weight_variable(self,shape):
30 """Create a weight variable with appropriate initialization."""
31 55.371 MiB 0.055 MiB initial = tf.truncated_normal(shape,stddev=0.1)
32 55.414 MiB 0.043 MiB return tf.Variable(initial)
33
34 55.707 MiB 0.293 MiB def bias_variable(self,shape):
35 """Create a bias variable with appropriate initialization."""
36 55.727 MiB 0.020 MiB initial = tf.constant(0.1,shape=shape)
37 55.754 MiB 0.027 MiB return tf.Variable(initial)
38
39 55.754 MiB 0.000 MiB def variable_summaries(self,var,name):
40 """Attach a lot of summaries to a Tensor."""
41 55.754 MiB 0.000 MiB with tf.name_scope('summaries'):
42 55.801 MiB 0.047 MiB mean = tf.reduce_mean(var)
43 55.824 MiB 0.023 MiB tf.scalar_summary('mean/'+name,mean)
44 55.824 MiB 0.000 MiB with tf.name_scope('stddev'):
45 55.883 MiB 0.059 MiB stddev = tf.sqrt(tf.reduce_sum(tf.square(var-mean)))
46 55.906 MiB 0.023 MiB tf.scalar_summary('stddev/'+name,stddev)
47 55.969 MiB 0.062 MiB tf.scalar_summary('max/'+name, tf.reduce_max(var))
48 56.027 MiB 0.059 MiB tf.scalar_summary('min/'+name, tf.reduce_min(var))
49 56.055 MiB 0.027 MiB tf.histogram_summary(name, var)
50
51 #declare weight variables as property
52 53.973 MiB -2.082 MiB layer_name = 'rnn_layer'
53 53.973 MiB 0.000 MiB with tf.name_scope(layer_name):
54 53.973 MiB 0.000 MiB with tf.name_scope('U'):
55 54.230 MiB 0.258 MiB self.U = U = weight_variable(self,[prev_see,num_hidden])
56 54.598 MiB 0.367 MiB variable_summaries(self,U,layer_name+'/U')
57 54.598 MiB 0.000 MiB with tf.name_scope('W'):
58 54.691 MiB 0.094 MiB self.W = W = weight_variable(self,[num_hidden,num_hidden])
59 54.961 MiB 0.270 MiB variable_summaries(self,W,layer_name+'/W')
60 54.961 MiB 0.000 MiB with tf.name_scope('b_W'):
61 55.012 MiB 0.051 MiB self.b_W = b_W = bias_variable(self,[num_hidden])
62 55.316 MiB 0.305 MiB variable_summaries(self,b_W,layer_name+'/b_W')
63 55.316 MiB 0.000 MiB with tf.name_scope('V'):
64 55.414 MiB 0.098 MiB self.V = V = weight_variable(self,[num_hidden,1])
65 55.707 MiB 0.293 MiB variable_summaries(self,V,layer_name+'/V')
66 55.707 MiB 0.000 MiB with tf.name_scope('b_V'):
67 55.754 MiB 0.047 MiB self.b_V = b_V = bias_variable(self,[1])
68 56.055 MiB 0.301 MiB variable_summaries(self,b_V,layer_name+'/b_V')
69 56.055 MiB 0.000 MiB self.merged = tf.merge_all_summaries()
70 60.348 MiB 4.293 MiB self.train_writer = tf.train.SummaryWriter(config.summaries_dir,sess.graph)
71 62.496 MiB 2.148 MiB tf.initialize_all_variables().run(session=sess)
Filename: rnn.py
Line # Mem usage Increment Line Contents
================================================
82 3013.336 MiB 0.000 MiB @profile
83 def inference(self):
84 """calculate outputs and loss for a single batch"""
85 3013.336 MiB 0.000 MiB total_loss = 0.0
86 3013.336 MiB 0.000 MiB outputs = []
87 3022.352 MiB 9.016 MiB for i in range(self.batch_size):
88 3020.441 MiB -1.910 MiB state = tf.zeros([self.num_hidden])
89 3020.441 MiB 0.000 MiB outputs.append([])
90 3020.441 MiB 0.000 MiB loss = 0.0
91 3022.348 MiB 1.906 MiB for j in range(self.num_steps):
92 3022.285 MiB -0.062 MiB state, output = self.next_state(self.x[i,j,:],state)
93 3022.285 MiB 0.000 MiB outputs[i].append(output)
94 3022.348 MiB 0.062 MiB loss += tf.square(self.y[i,j,:]-output)
95 3022.352 MiB 0.004 MiB total_loss+=loss
96 3022.371 MiB 0.020 MiB return outputs, total_loss/(self.batch_size*self.num_steps)
Filename: rnn.py
Line # Mem usage Increment Line Contents
================================================
97 3013.336 MiB 0.000 MiB @profile
98 def batch_train(self,feed_dict):
99 """train the network for a single batch"""
100 3022.371 MiB 9.035 MiB _, loss = self.inference()
101 3051.781 MiB 29.410 MiB train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss)
102 3149.891 MiB 98.109 MiB summary,loss_value, _ = self.sess.run([self.merged,loss, train_step],feed_dict=feed_dict)
103 #self.train_writer.add_summary(summary)
104 3149.891 MiB 0.000 MiB print(loss_value)
Filename: rnn.py
Line # Mem usage Increment Line Contents
================================================
131 1582.758 MiB 0.000 MiB @profile
132 def run_epoch(m,x_data,y_data):
133 1582.758 MiB 0.000 MiB num_batch = ((len(x_data)-1) // m.batch_size)+1
134 #num_batch = 100
135 3149.895 MiB 1567.137 MiB for i in range(num_batch):
136 3013.336 MiB -136.559 MiB x_batch = x_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
137 3013.336 MiB 0.000 MiB y_batch = y_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
138 3013.336 MiB 0.000 MiB feed_dict = {m.x:x_batch,m.y:y_batch}
139 3013.336 MiB 0.000 MiB print("%dth/%dbatches"%(i+1,num_batch))
140 3149.891 MiB 136.555 MiB m.batch_train(feed_dict)
Filename: rnn.py
Line # Mem usage Increment Line Contents
================================================
154 52.914 MiB 0.000 MiB @profile
155 def main():
156 52.914 MiB 0.000 MiB train_config = TrainConfig()
157 52.914 MiB 0.000 MiB debug_config = DebugConfig()
158 53.059 MiB 0.145 MiB data = np.load('processed_data.npy')
159 53.062 MiB 0.004 MiB x,y = process_data(data,debug_config)
160 62.496 MiB 9.434 MiB rnn_model = RNN(True,debug_config)
161
162 #training phase
163 3149.898 MiB 3087.402 MiB for i in range(rnn_model.epoch):
164 1582.758 MiB -1567.141 MiB print("%dth epoch"%(i+1))
165 3149.898 MiB 1567.141 MiB run_epoch(rnn_model,x,y)
Ce problème n'a pas été produite lors j'ai essayé simplement MNIST model de tutoriel tensorflow. Cela devrait donc être lié au modèle RNN. En outre, je pourrais reproduire ce problème sur Ubuntu 14.04, donc je ne pense pas que ce problème est causé par des choses OS X. Merci d'avoir lu.
Cela a fonctionné pour moi merci! –