2016-05-03 1 views
0

Je suis assez nouveau dans ce domaine et j'ai vu que d'autres ont rencontré la même erreur, mais je ne vois pas comment mettre en œuvre les solutions proposées. J'essaie d'écrire une méthode d'apprentissage automatique par forêt aléatoire en utilisant la recherche aléatoire sur grille (RandomizedSearchCV) de scikit-learn. Cela fonctionne bien avec une recherche sur grille standard, mais échoue avec une erreur étrange dans la fonction fit de scikit-learn lorsque j'utilise la recherche aléatoire. Toute suggestion sur la façon d'aborder ce problème serait la bienvenue.

Titre : Python TypeError: range() integer end argument expected, got float, avec la fonction fit

Voici un exemple qui affiche l'erreur.

import scipy 
import math 
import numpy as np 
import pandas as pd 
import plotly.plotly as py 

from time import time 
from sklearn import preprocessing, metrics, cross_validation 
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV 
from sklearn.cross_validation import KFold 

# Load the data set and replace the NA values with the column means.
# (The original line referenced an undefined name, SubFeAll, instead of data.)
data = pd.read_csv("data.csv", sep=",")
data = data.fillna(data.mean())
header = data.columns.values  # Use the column headers as the descriptor labels
data.head()  # result is discarded when run as a script

# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)

# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []

# Predictions results initialised
RFpredictions = []

metcount = 0

# Give the array from pandas to numpy
npArray = np.array(data)
print(header.shape)
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape

# Split the data into: names labels of the molecules (first column);
# y the True results (last column); X the descriptors for each data point
# (every column in between).
names = npArray[:, 0]
X = npArray[:, 1:-1].astype(float)
y = npArray[:, -1].astype(float)
X = preprocessing.scale(X)
print(X.shape)

# Open output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"

# Start the training log with its header lines. The with-statement closes
# the file on exit, so no explicit close() call is needed afterwards.
with open(train_name, 'w') as ftrain:
    ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
    ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Fold %d ,\n" % (metcount + 1))

# Start the feature importance file with its header line.
with open(fi_name, 'w') as ffeatimp:
    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")

# Begin the K-fold cross validation over ten folds 
# For every fold: log the fold's true test values, run a randomized
# hyper-parameter search for a random forest on the training part, then
# print and log the best parameters found.
kf = KFold(datax, n_folds=10) 
print "------------------- Begining Ten Fold Cross Validation -------------------" 
for train, test in kf: 
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test] 
    ytestdim = yTest.shape[0] 
    i = 0 
    # Append each true test value of this fold (rounded to 2 decimals),
    # one per line, to the training log.
    with open (train_name, 'a') as ftrain: 
     while i< ytestdim : 
       ftrain.write(str(round(yTest[i],2))+',\n') 
       i += 1 
    # The with-block above has already closed the file; this call is redundant.
    ftrain.close() 

    print "\n" 
    # random forest grid search parameters 
    print "------------------- Begining Random Forest Grid Search -------------------" 
    # NOTE(review): "n_estimators" and "max_depth" are sampled from
    # scipy.stats.expon, which yields float values. The traceback reported
    # for this script fails inside the forest's fit() on
    # range(n_more_estimators) with "integer end argument expected, got
    # float", which points at the float n_estimators drawn here.
    rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)} 
    rf = RandomForestRegressor(random_state=0,n_jobs=2) 
    RfGridSearch = RandomizedSearchCV(rf,param_distributions=rfparamgrid,scoring='mean_squared_error',n_iter=20) 
    start = time() 
    RfGridSearch.fit(XTrain,yTrain) 

    # Get best random forest parameters 
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_))) 
    # Because of the comma, RFtime is a tuple: (elapsed seconds, number of
    # candidate settings); its str() is what gets written to the log below.
    RFtime = time() - start,len(RfGridSearch.grid_scores_) 
    # NOTE(review): report is neither defined nor imported in this snippet;
    # presumably a helper defined elsewhere - confirm.
    report(RfGridSearch.grid_scores_) 
    print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators']) 
    ne = RfGridSearch.best_params_['n_estimators'] 
    print("max_features = %s " % RfGridSearch.best_params_['max_features']) 
    mf = RfGridSearch.best_params_['max_features'] 
    print("max_depth = %d " % RfGridSearch.best_params_['max_depth']) 
    md = RfGridSearch.best_params_['max_depth'] 
    # Append the search time and the best parameters of this fold to the log.
    with open (train_name, 'a') as ftrain: 
      ftrain.write("Random Forest") 
      ftrain.write("RF search time, %s ,\n" % (str(RFtime))) 
      ftrain.write("Number of Trees, %s ,\n" % str(ne)) 
      ftrain.write("Number of feature at split, %s ,\n" % str(mf)) 
      ftrain.write("Max depth of tree, %s ,\n" % str(md)) 
    # Redundant: the with-block has already closed the file.
    ftrain.close() 

L'erreur obtenue est donnée ci-dessous :

Traceback (most recent call last): 
    File "rgscv.py", line 81, in <module> 
    RfGridSearch.fit(XTrain,yTrain) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 996, in fit 
    return self._fit(X, y, sampled_params) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 553, in _fit 
    for parameters in parameter_iterable 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__ 
    while self.dispatch_one_batch(iterator): 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch 
    self._dispatch(tasks) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch 
    job = ImmediateComputeBatch(batch) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__ 
    self.results = batch() 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__ 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score 
    estimator.fit(X_train, y_train, **fit_params) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 276, in fit 
    for i in range(n_more_estimators): 
TypeError: range() integer end argument expected, got float. 

Au début, je pensais avoir simplement oublié un paramètre, mais exactement la même méthode avec une recherche sur grille classique (GridSearchCV) semble fonctionner sans problème. Le code correspondant est ci-dessous. Quelqu'un peut-il m'indiquer ce qui cause cette erreur ?

import scipy 
import math 
import numpy as np 
import pandas as pd 
import plotly.plotly as py 

from time import time 
from sklearn import preprocessing, metrics, cross_validation 
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV 
from sklearn.cross_validation import KFold 

# Load the data set and replace the NA values with the column means.
# (The original line took the means from an undefined name, SubFeAll.)
data = pd.read_csv("data.csv", sep=",")
data = data.fillna(data.mean())
header = data.columns.values  # Use the column headers as the descriptor labels
data.head()  # result is discarded when run as a script

# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)

# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []

# Predictions results initialised
RFpredictions = []

metcount = 0

# Give the array from pandas to numpy
npArray = np.array(data)
print(header.shape)
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape

# Split the data into: names labels of the molecules (first column);
# y the True results (last column); X the descriptors for each data point
# (every column in between).
names = npArray[:, 0]
X = npArray[:, 1:-1].astype(float)
y = npArray[:, -1].astype(float)
X = preprocessing.scale(X)
print(X.shape)

# Open output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"

# Start the training log with its header lines. The with-statement closes
# the file on exit, so no explicit close() call is needed afterwards.
with open(train_name, 'w') as ftrain:
    ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
    ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Fold %d ,\n" % (metcount + 1))

# Start the feature importance file with its header line.
with open(fi_name, 'w') as ffeatimp:
    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")

# Begin the K-fold cross validation over ten folds
# For every fold: log the fold's true test values, run an exhaustive grid
# search for a random forest on the training part, then print and log the
# best parameters found.
kf = KFold(datax, n_folds=10)
print("------------------- Begining Ten Fold Cross Validation -------------------")
for train, test in kf:
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]

    # Append each true test value of this fold (rounded to 2 decimals), one
    # per line. The with-statement closes the file, so no close() is needed.
    with open(train_name, 'a') as ftrain:
        for target in yTest:
            ftrain.write(str(round(target, 2)) + ',\n')

    print("\n")
    # random forest grid search parameters
    print("------------------- Begining Random Forest Grid Search -------------------")
    # Kept for comparison: the scipy.stats.expon distributions used by the
    # randomized-search variant of this script (they sample floats).
    #rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)}
    # Explicit lists of integer values: every candidate n_estimators and
    # max_depth handed to the forest is an int.
    rfparamgrid = {
        "n_estimators": [10, 20, 25, 50, 100, 1000],
        "max_features": ["auto", "sqrt", "log2"],
        "max_depth": [1, 2, 3, 5, 7, 10],
    }
    rf = RandomForestRegressor(random_state=0, n_jobs=2)
    RfGridSearch = GridSearchCV(rf, param_grid=rfparamgrid, scoring='mean_squared_error')
    start = time()
    RfGridSearch.fit(XTrain, yTrain)

    # Get best random forest parameters
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" % (time() - start, len(RfGridSearch.grid_scores_)))
    # RFtime is a tuple: (elapsed seconds, number of candidate settings);
    # its str() is what gets written to the log below.
    RFtime = time() - start, len(RfGridSearch.grid_scores_)
    # NOTE(review): report is neither defined nor imported in this snippet;
    # presumably a helper defined elsewhere - confirm.
    report(RfGridSearch.grid_scores_)
    best_params = RfGridSearch.best_params_
    ne = best_params['n_estimators']
    mf = best_params['max_features']
    md = best_params['max_depth']
    print("n_estimators = %d " % ne)
    print("max_features = %s " % mf)
    print("max_depth = %d " % md)

    # Append the search time and the best parameters of this fold to the log.
    with open(train_name, 'a') as ftrain:
        ftrain.write("Random Forest")
        ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
        ftrain.write("Number of Trees, %s ,\n" % str(ne))
        ftrain.write("Number of feature at split, %s ,\n" % str(mf))
        ftrain.write("Max depth of tree, %s ,\n" % str(md))
+0

@coralv Il utilise clairement une librairie, c'est dans le répertoire site-packages. Ne posez pas de questions absurdes. – Natecat

+1

Cette section de code est la fonction d'ajustement standard de la bibliothèque scikit learn. Ce n'est pas un code que j'ai édité du tout et la même fonction est utilisée avec succès dans la version GridSearchCV, qui fonctionne comme prévu. – James

+0

[scipy.stats.expon](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.expon.html#scipy.stats.expon) semble renvoyer un objet expon, qui ne semble pas fonctionner comme une liste, contrairement au second exemple. Est-ce que le fait de le remplacer par une liste corrige le problème ? – Natecat

Répondre

1

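Le nombre d'estimateurs doit être un entier, or votre code produit des flottants. Créez une liste valide de valeurs pour n_estimators ne contenant que des entiers, et tout ira bien. Par exemple, pour conserver un tirage aléatoire, une distribution entière telle que scipy.stats.randint(10, 1000) peut remplacer scipy.stats.expon(scale=100).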

+0

Merci, cela a résolu le problème. – James