
I trained a model using the Yelp Data Challenge data and I have a pickled file, 399850by50reviews_words_index.pkl, but I am stuck on how this pickle file is used to test new data in Keras, i.e. to make predictions on new data.

This is my code for training on the data and saving it as part of building the model.

How can I use this model on test data?

I am using Keras with Theano 1.0.0 here.

''' 
train cnn model for sentiment classification on yelp data set 
author: hao peng 
''' 
import pickle 
import pandas as pd 
import numpy as np 
from sklearn.cross_validation import train_test_split 
from Word2VecUtility import Word2VecUtility 
from keras.preprocessing import sequence 
from keras.models import Sequential 
from keras.layers.core import Dense, Dropout, Activation, Flatten 
from keras.layers.embeddings import Embedding 
from keras.layers.convolutional import Convolution1D, MaxPooling1D 


def get_volcabulary_and_list_words(data):
    reviews_words = []
    volcabulary = []
    for review in data["text"]:
        review_words = Word2VecUtility.review_to_wordlist(
            review, remove_stopwords=True)
        reviews_words.append(review_words)
        for word in review_words:
            volcabulary.append(word)
    volcabulary = set(volcabulary)
    return volcabulary, reviews_words

def get_reviews_word_index(reviews_words, volcabulary, max_words, max_length): 
    word2index = {word: i for i, word in enumerate(volcabulary)} 
    # prepend the start token and shift every word index by index_from
    reviews_words_index = [[start] + [(word2index[w] + index_from) for w in review] for review in reviews_words]
    # clip the vocabulary: any index at or above max_words is replaced by the oov token.
    # (with a word2vec embedding you would compare against max_words + index_from instead, because every
    # word needs its exact index to map to its vector; the effective max_words would then be 5003, not 5000.)
    reviews_words_index = [[i if (i < max_words) else oov for i in index] for index in reviews_words_index]
    # padding with 0, each review has max_length now. 
    reviews_words_index = sequence.pad_sequences(reviews_words_index, maxlen=max_length, padding='post', truncating='post') 
    return reviews_words_index 

def vectorize_labels(labels, nums): 
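    # turn the 1-5 star ratings into one-hot rows, e.g. 3 stars -> [0, 0, 1, 0, 0]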
    labels = np.asarray(labels, dtype='int32') 
    length = len(labels) 
    Y = np.zeros((length, nums)) 
    for i in range(length):
        Y[i, (labels[i]-1)] = 1.
    return Y
# data processing parameters
max_words = 5000 
max_length = 50 

# model training parameters 
batch_size = 32 
embedding_dims = 100 
nb_filter = 250 
filter_length = 3 
hidden_dims = 250 
nb_epoch = 2 

# index trick parameters 
index_from = 3 
start = 1 
# padding = 0 
oov = 2 
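# (same convention as the Keras IMDB example: index 0 is reserved for padding, 1 marks the
#  start of a review, 2 stands for out-of-vocabulary words, so real word indices begin at 3)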

data = pd.read_csv(
    'review_sub_399850.tsv', header=0, delimiter="\t", quoting=3, encoding='utf-8') 
print('get volcabulary...') 
volcabulary, reviews_words = get_volcabulary_and_list_words(data) 
print('get reviews_words_index...') 
reviews_words_index = get_reviews_word_index(reviews_words, volcabulary, max_words, max_length) 

print reviews_words_index[:20, :12] 
print reviews_words_index.shape 

labels = data["stars"] 

pickle.dump((reviews_words_index, labels), open("399850by50reviews_words_index.pkl", 'wb')) 
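# (this dump produces the 399850by50reviews_words_index.pkl file mentioned above; note that it
#  stores only the encoded matrix and the labels, not the word2index mapping or the trained model)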

(reviews_words_index, labels) = pickle.load(open("399850by50reviews_words_index.pkl", 'rb')) 

index = np.arange(reviews_words_index.shape[0]) 
train_index, valid_index = train_test_split(
    index, train_size=0.8, random_state=100) 

labels = vectorize_labels(labels, 5) 
train_data = reviews_words_index[train_index] 
valid_data = reviews_words_index[valid_index] 
train_labels = labels[train_index] 
valid_labels = labels[valid_index] 
print train_data.shape 
print valid_data.shape 
print train_labels[:10] 

del(labels, train_index, valid_index) 

print "start training model..." 

model = Sequential() 

# we start off with an efficient embedding layer which maps 
# our vocab indices into embedding_dims dimensions 
model.add(Embedding(max_words + index_from, embedding_dims, \ 
        input_length=max_length)) 
model.add(Dropout(0.25)) 

# we add a Convolution1D, which will learn nb_filter 
# word group filters of size filter_length: 

# filter_length is like filter size, subsample_length is like step in 2D CNN. 
model.add(Convolution1D(nb_filter=nb_filter, 
         filter_length=filter_length, 
         border_mode='valid', 
         activation='relu', 
         subsample_length=1)) 
# we use standard max pooling (halving the output of the previous layer): 
model.add(MaxPooling1D(pool_length=2)) 

# We flatten the output of the conv layer, 
# so that we can add a vanilla dense layer: 
model.add(Flatten()) 

# We add a vanilla hidden layer: 
model.add(Dense(hidden_dims)) 
model.add(Dropout(0.25)) 
model.add(Activation('relu')) 

# We project onto 5 unit output layer, and activate it with softmax: 
model.add(Dense(5)) 
model.add(Activation('softmax')) 

model.compile(loss='categorical_crossentropy', 
       optimizer='sgd', 
       class_mode='categorical') 
model.fit(train_data, train_labels, batch_size=batch_size, 
      nb_epoch=nb_epoch, show_accuracy=True, 
      validation_data=(valid_data, valid_labels)) 
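
Note that the script above pickles only the encoded review matrix and the labels; it never saves the fitted word2index mapping or the trained model itself, and both are needed to score new reviews later. A minimal sketch of persisting them at the end of training, assuming the same Keras 1.x API used above (word2index is local to get_reviews_word_index in the original code, so it would have to be returned or rebuilt before it can be dumped):

# --- sketch: persist what prediction will need later (not in the original script) ---
# pickle.dump(word2index, open('word2index.pkl', 'wb'))   # assumes word2index is in scope

# save the architecture as JSON and the weights as HDF5 (the HDF5 part requires h5py)
open('yelp_cnn_architecture.json', 'w').write(model.to_json())
model.save_weights('yelp_cnn_weights.h5', overwrite=True)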

Answer


Your input data for testing must have exactly the same shape as train_data and valid_data, except for the first dimension, which is the batch size.
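
As a concrete illustration (a minimal sketch, not part of the original answer), this is how one new review could be encoded into that shape, reusing the same preprocessing as the training script; it assumes the word2index mapping built during training is still available, since the pickle file above stores only the already-encoded training matrix:

new_texts = ["The food was great but the service was painfully slow."]
new_words = [Word2VecUtility.review_to_wordlist(t, remove_stopwords=True) for t in new_texts]
# same index trick as in training: start token first, word indices shifted by index_from,
# anything unseen or beyond max_words mapped to the oov token
new_index = []
for words in new_words:
    row = [start]
    for w in words:
        i = word2index.get(w)
        row.append(oov if i is None or i + index_from >= max_words else i + index_from)
    new_index.append(row)
new_data = sequence.pad_sequences(new_index, maxlen=max_length, padding='post', truncating='post')
print(new_data.shape)   # (1, 50), so new_data.shape[1:] == train_data.shape[1:]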

So you need to build a numpy array with the input data you want to test, and make sure that array is structured exactly like train_data, i.e. yourTestArray.shape[1:] is exactly the same as train_data.shape[1:], which is also equal to valid_data.shape[1:]. Once you have that array, call results = model.predict(yourTestArray).
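
Continuing the sketch above, the predict call the answer describes, plus mapping the softmax output back to a star rating (vectorize_labels put star s in column s-1, so argmax + 1 recovers the rating):

results = model.predict(new_data, batch_size=batch_size)
# results has shape (1, 5): one softmax distribution over the 5 star classes per review
predicted_stars = results.argmax(axis=-1) + 1
print(predicted_stars)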