1

Je suis en train de construire un modèle de réseau neuronal récurrent simple en utilisant tensorflow sur Mac OS X. Il est juste un modèle de jouet et la taille des données d'entrée ne dépasse pas 3 Mo donc il ne devrait pas consomme beaucoup de mémoire. Cependant, lorsque j'exécute un modèle, l'utilisation de la mémoire augmente considérablement chaque lot d'entraînement et dépasse 10 Go. C'était pour seulement deux itérations. Je ne pouvais pas courir plus.Fuite de mémoire Tensorflow avec réseau neuronal récurrent

Voici l'intégralité du code.

from __future__ import absolute_import 
from __future__ import division 
from __future__ import print_function 

import tensorflow as tf 
import numpy as np 
from pympler import summary 

class RNN(): 
    """The RNN model.""" 
    #@profile 
    def inference(self): 
     """calculate outputs and loss for a single batch""" 
     total_loss = 0.0 
     outputs = [] 
     for i in range(self.batch_size): 
      state = self.init_state 
      outputs.append([]) 
      loss = 0.0 
      for j in range(self.num_steps): 
       state, output = self.next_state(self.x[i,j,:],state) 
       outputs[i].append(output) 
       loss += tf.square(self.y[i,j,:]-output) 
      total_loss+=loss 
     return outputs, total_loss/(self.batch_size*self.num_steps) 

    def __init__(self, is_training, config): 
     self.sess = sess = tf.Session() 

     self.prev_see = prev_see = config.prev_see 
     self.num_steps = num_steps = config.num_steps 
     #maybe "self.num_hidden =" part could be removed 
     self.num_hidden = num_hidden = config.num_hidden 
     self.batch_size = config.batch_size 
     self.epoch = config.epoch 
     self.learning_rate = config.learning_rate 
     self.summaries_dir = config.summaries_dir 

     with tf.name_scope('placeholders'): 
      self.x = tf.placeholder(tf.float32,[None,num_steps,config.prev_see], 
            name='input-x') 
      self.y = tf.placeholder(tf.float32, [None,num_steps,1],name='input-y') 
      default_init_state = tf.zeros([num_hidden]) 
      self.init_state = tf.placeholder_with_default(default_init_state,[num_hidden], 
                name='state_placeholder') 

     def weight_variable(self,shape): 
      """Create a weight variable with appropriate initialization.""" 
      initial = tf.truncated_normal(shape,stddev=0.1) 
      return tf.Variable(initial) 

     def bias_variable(self,shape): 
      """Create a bias variable with appropriate initialization.""" 
      initial = tf.constant(0.1,shape=shape) 
      return tf.Variable(initial) 

     def variable_summaries(self,var,name): 
      """Attach a lot of summaries to a Tensor.""" 
      with tf.name_scope('summaries'): 
       mean = tf.reduce_mean(var) 
       tf.scalar_summary('mean/'+name,mean) 
       with tf.name_scope('stddev'): 
        stddev = tf.sqrt(tf.reduce_sum(tf.square(var-mean))) 
       tf.scalar_summary('stddev/'+name,stddev) 
       tf.scalar_summary('max/'+name, tf.reduce_max(var)) 
       tf.scalar_summary('min/'+name, tf.reduce_min(var)) 
       tf.histogram_summary(name, var) 

     #declare weight variables as property 
     layer_name = 'rnn_layer' 
     with tf.name_scope(layer_name): 
      with tf.name_scope('U'): 
       self.U = U = weight_variable(self,[prev_see,num_hidden]) 
       variable_summaries(self,U,layer_name+'/U') 
      with tf.name_scope('W'): 
       self.W = W = weight_variable(self,[num_hidden,num_hidden]) 
       variable_summaries(self,W,layer_name+'/W') 
      with tf.name_scope('b_W'): 
       self.b_W = b_W = bias_variable(self,[num_hidden]) 
       variable_summaries(self,b_W,layer_name+'/b_W') 
      with tf.name_scope('V'): 
       self.V = V = weight_variable(self,[num_hidden,1]) 
       variable_summaries(self,V,layer_name+'/V') 
      with tf.name_scope('b_V'): 
       self.b_V = b_V = bias_variable(self,[1]) 
       variable_summaries(self,b_V,layer_name+'/b_V') 
     self.merged = tf.merge_all_summaries() 
     self.train_writer = tf.train.SummaryWriter(config.summaries_dir,sess.graph) 
     tf.initialize_all_variables().run(session=sess) 
     _,self.loss = self.inference() 


    def next_state(self,x,s_prev): 
     """calculate next state and output""" 
     x = tf.reshape(x,[1,-1]) 
     s_prev = tf.reshape(s_prev,[1,-1])   
     s_next = tf.tanh(tf.matmul(x,self.U)+tf.matmul(s_prev,self.W)+self.b_W) 
     output = tf.matmul(s_next,self.V)+self.b_V 
     return s_next, output 

    #@profile 
    def batch_train(self,feed_dict): 
     """train the network for a single batch""" 
     loss = self.loss 
     train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss) 
     summary,loss_value, _ = self.sess.run([self.merged,loss, train_step],feed_dict=feed_dict) 
     #self.train_writer.add_summary(summary) 
     print(loss_value) 


class TrainConfig(): 
    """Train Config.""" 
    total_steps = 245 
    test_ratio = 0.3 
    prev_see = 100 
    num_steps = int(round((total_steps-prev_see)*(1-test_ratio))) 
    num_hidden = 10 
    batch_size = 5 
    epoch = 3 
    learning_rate = 0.1 
    summaries_dir = '/Users/Kyungsu/StockPrediction/log' 

class DebugConfig(): 
    """For debugging memory leak.""" 
    total_steps = 100 
    test_ratio = 0.3 
    prev_see = 100 
    num_steps = 10 
    num_hidden = 10 
    batch_size = 5 
    epoch = 2 
    learning_rate = 0.1 
    summaries_dir = '/Users/Kyungsu/StockPrediction/log' 

#@profile 
def run_epoch(m,x_data,y_data): 
    num_batch = ((len(x_data)-1) // m.batch_size)+1 
    #num_batch = 100 
    for i in range(num_batch): 
     x_batch = x_data[i*m.batch_size:(i+1)*m.batch_size,:,:] 
     y_batch = y_data[i*m.batch_size:(i+1)*m.batch_size,:,:] 
     feed_dict = {m.x:x_batch,m.y:y_batch} 
     print("%dth/%dbatches"%(i+1,num_batch)) 
     m.batch_train(feed_dict) 

def process_data(data,config): 
    data_size = len(data) 
    prev_see = config.prev_see 
    num_steps = config.num_steps 
    x = np.zeros((data_size,num_steps,prev_see)) 
    y = np.zeros((data_size,num_steps,1)) 
    for i in range(data_size): 
     for j in range(num_steps-prev_see): 
      x[i,j,:] = data[i,i:i+prev_see] 
      y[i,j,0] = data[i,i+prev_see] 
    return x,y 

#@profile 
def main(): 
    train_config = TrainConfig() 
    debug_config = DebugConfig() 
    data = np.load('processed_data.npy') 
    x,y = process_data(data,train_config) 
    rnn_model = RNN(True,train_config) 

    #training phase 
    for i in range(rnn_model.epoch): 
     print("%dth epoch"%(i+1)) 
     run_epoch(rnn_model,x,y) 

main() 

Et voici le résultat de memory_profiler. Chose étrange, la majeure partie de la mémoire est allouée en pour la boucle. (Voir ligne 163, 135) Je suppose que cela signifie que la mémoire fuit.

Line # Mem usage Increment Line Contents 
================================================ 
    11 53.062 MiB 0.000 MiB  @profile 
    12         def __init__(self, is_training, config): 
    13 53.875 MiB 0.812 MiB   self.sess = sess = tf.Session() 
    14          
    15 53.875 MiB 0.000 MiB   self.prev_see = prev_see = config.prev_see 
    16 53.875 MiB 0.000 MiB   self.num_steps = num_steps = config.num_steps 
    17          #maybe "self.num_hidden =" part could be removed 
    18 53.875 MiB 0.000 MiB   self.num_hidden = num_hidden = config.num_hidden 
    19 53.875 MiB 0.000 MiB   self.batch_size = config.batch_size 
    20 53.875 MiB 0.000 MiB   self.epoch = config.epoch 
    21 53.875 MiB 0.000 MiB   self.learning_rate = config.learning_rate 
    22 53.875 MiB 0.000 MiB   self.summaries_dir = config.summaries_dir 
    23        
    24 53.875 MiB 0.000 MiB   with tf.name_scope('input'): 
    25 53.875 MiB 0.000 MiB    self.x = tf.placeholder(tf.float32,[None,num_steps,config.prev_see], 
    26 53.957 MiB 0.082 MiB          name='input-x') 
    27 53.973 MiB 0.016 MiB    self.y = tf.placeholder(tf.float32, [None,num_steps,1],name='input-y') 
    28          
    29 55.316 MiB 1.344 MiB   def weight_variable(self,shape): 
    30           """Create a weight variable with appropriate initialization.""" 
    31 55.371 MiB 0.055 MiB    initial = tf.truncated_normal(shape,stddev=0.1) 
    32 55.414 MiB 0.043 MiB    return tf.Variable(initial) 
    33        
    34 55.707 MiB 0.293 MiB   def bias_variable(self,shape): 
    35           """Create a bias variable with appropriate initialization.""" 
    36 55.727 MiB 0.020 MiB    initial = tf.constant(0.1,shape=shape) 
    37 55.754 MiB 0.027 MiB    return tf.Variable(initial) 
    38          
    39 55.754 MiB 0.000 MiB   def variable_summaries(self,var,name): 
    40           """Attach a lot of summaries to a Tensor.""" 
    41 55.754 MiB 0.000 MiB    with tf.name_scope('summaries'): 
    42 55.801 MiB 0.047 MiB     mean = tf.reduce_mean(var) 
    43 55.824 MiB 0.023 MiB     tf.scalar_summary('mean/'+name,mean) 
    44 55.824 MiB 0.000 MiB     with tf.name_scope('stddev'): 
    45 55.883 MiB 0.059 MiB      stddev = tf.sqrt(tf.reduce_sum(tf.square(var-mean))) 
    46 55.906 MiB 0.023 MiB     tf.scalar_summary('stddev/'+name,stddev) 
    47 55.969 MiB 0.062 MiB     tf.scalar_summary('max/'+name, tf.reduce_max(var)) 
    48 56.027 MiB 0.059 MiB     tf.scalar_summary('min/'+name, tf.reduce_min(var)) 
    49 56.055 MiB 0.027 MiB     tf.histogram_summary(name, var) 
    50          
    51          #declare weight variables as property 
    52 53.973 MiB -2.082 MiB   layer_name = 'rnn_layer' 
    53 53.973 MiB 0.000 MiB   with tf.name_scope(layer_name): 
    54 53.973 MiB 0.000 MiB    with tf.name_scope('U'): 
    55 54.230 MiB 0.258 MiB     self.U = U = weight_variable(self,[prev_see,num_hidden]) 
    56 54.598 MiB 0.367 MiB     variable_summaries(self,U,layer_name+'/U') 
    57 54.598 MiB 0.000 MiB    with tf.name_scope('W'): 
    58 54.691 MiB 0.094 MiB     self.W = W = weight_variable(self,[num_hidden,num_hidden]) 
    59 54.961 MiB 0.270 MiB     variable_summaries(self,W,layer_name+'/W') 
    60 54.961 MiB 0.000 MiB    with tf.name_scope('b_W'): 
    61 55.012 MiB 0.051 MiB     self.b_W = b_W = bias_variable(self,[num_hidden]) 
    62 55.316 MiB 0.305 MiB     variable_summaries(self,b_W,layer_name+'/b_W') 
    63 55.316 MiB 0.000 MiB    with tf.name_scope('V'): 
    64 55.414 MiB 0.098 MiB     self.V = V = weight_variable(self,[num_hidden,1]) 
    65 55.707 MiB 0.293 MiB     variable_summaries(self,V,layer_name+'/V') 
    66 55.707 MiB 0.000 MiB    with tf.name_scope('b_V'): 
    67 55.754 MiB 0.047 MiB     self.b_V = b_V = bias_variable(self,[1]) 
    68 56.055 MiB 0.301 MiB     variable_summaries(self,b_V,layer_name+'/b_V') 
    69 56.055 MiB 0.000 MiB   self.merged = tf.merge_all_summaries() 
    70 60.348 MiB 4.293 MiB   self.train_writer = tf.train.SummaryWriter(config.summaries_dir,sess.graph) 
    71 62.496 MiB 2.148 MiB   tf.initialize_all_variables().run(session=sess) 


Filename: rnn.py 

Line # Mem usage Increment Line Contents 
================================================ 
    82 3013.336 MiB 0.000 MiB  @profile 
    83         def inference(self): 
    84          """calculate outputs and loss for a single batch""" 
    85 3013.336 MiB 0.000 MiB   total_loss = 0.0 
    86 3013.336 MiB 0.000 MiB   outputs = [] 
    87 3022.352 MiB 9.016 MiB   for i in range(self.batch_size): 
    88 3020.441 MiB -1.910 MiB    state = tf.zeros([self.num_hidden]) 
    89 3020.441 MiB 0.000 MiB    outputs.append([]) 
    90 3020.441 MiB 0.000 MiB    loss = 0.0 
    91 3022.348 MiB 1.906 MiB    for j in range(self.num_steps): 
    92 3022.285 MiB -0.062 MiB     state, output = self.next_state(self.x[i,j,:],state) 
    93 3022.285 MiB 0.000 MiB     outputs[i].append(output) 
    94 3022.348 MiB 0.062 MiB     loss += tf.square(self.y[i,j,:]-output) 
    95 3022.352 MiB 0.004 MiB    total_loss+=loss 
    96 3022.371 MiB 0.020 MiB   return outputs, total_loss/(self.batch_size*self.num_steps) 


Filename: rnn.py 

Line # Mem usage Increment Line Contents 
================================================ 
    97 3013.336 MiB 0.000 MiB  @profile 
    98         def batch_train(self,feed_dict): 
    99          """train the network for a single batch""" 
    100 3022.371 MiB 9.035 MiB   _, loss = self.inference() 
    101 3051.781 MiB 29.410 MiB   train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss) 
    102 3149.891 MiB 98.109 MiB   summary,loss_value, _ = self.sess.run([self.merged,loss, train_step],feed_dict=feed_dict) 
    103          #self.train_writer.add_summary(summary) 
    104 3149.891 MiB 0.000 MiB   print(loss_value) 


Filename: rnn.py 

Line # Mem usage Increment Line Contents 
================================================ 
    131 1582.758 MiB 0.000 MiB @profile 
    132        def run_epoch(m,x_data,y_data): 
    133 1582.758 MiB 0.000 MiB  num_batch = ((len(x_data)-1) // m.batch_size)+1 
    134         #num_batch = 100 
    135 3149.895 MiB 1567.137 MiB  for i in range(num_batch): 
    136 3013.336 MiB -136.559 MiB   x_batch = x_data[i*m.batch_size:(i+1)*m.batch_size,:,:] 
    137 3013.336 MiB 0.000 MiB   y_batch = y_data[i*m.batch_size:(i+1)*m.batch_size,:,:] 
    138 3013.336 MiB 0.000 MiB   feed_dict = {m.x:x_batch,m.y:y_batch} 
    139 3013.336 MiB 0.000 MiB   print("%dth/%dbatches"%(i+1,num_batch)) 
    140 3149.891 MiB 136.555 MiB   m.batch_train(feed_dict) 


Filename: rnn.py 

Line # Mem usage Increment Line Contents 
================================================ 
    154 52.914 MiB 0.000 MiB @profile 
    155        def main(): 
    156 52.914 MiB 0.000 MiB  train_config = TrainConfig() 
    157 52.914 MiB 0.000 MiB  debug_config = DebugConfig() 
    158 53.059 MiB 0.145 MiB  data = np.load('processed_data.npy') 
    159 53.062 MiB 0.004 MiB  x,y = process_data(data,debug_config) 
    160 62.496 MiB 9.434 MiB  rnn_model = RNN(True,debug_config) 
    161        
    162         #training phase 
    163 3149.898 MiB 3087.402 MiB  for i in range(rnn_model.epoch): 
    164 1582.758 MiB -1567.141 MiB   print("%dth epoch"%(i+1)) 
    165 3149.898 MiB 1567.141 MiB   run_epoch(rnn_model,x,y) 

Ce problème n'a pas été produite lors j'ai essayé simplement MNIST model de tutoriel tensorflow. Cela devrait donc être lié au modèle RNN. En outre, je pourrais reproduire ce problème sur Ubuntu 14.04, donc je ne pense pas que ce problème est causé par des choses OS X. Merci d'avoir lu.

Répondre

2

Je pense que le problème est que cette ligne

train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss) 

se produit dans votre fonction batch_train, donc à chaque itération, une nouvelle GradientDescentOptimizer est créé. Essayez de déplacer ceci vers la fonction init de votre modèle juste après avoir défini la perte et faites référence à self.train_step dans votre fonction batch_train à la place.

+0

Cela a fonctionné pour moi merci! –