Je suis assez nouveau dans ce domaine et j'ai vu que d'autres ont rencontré la même erreur, mais je ne vois pas comment mettre en œuvre les solutions proposées. J'essaie d'écrire une méthode d'apprentissage automatique par forêt aléatoire en utilisant la recherche aléatoire d'hyperparamètres (RandomizedSearchCV) de scikit-learn. Cela fonctionne bien avec une recherche sur grille standard (GridSearchCV), mais échoue avec une erreur étrange dans la fonction d'ajustement (fit) de scikit-learn lorsque j'utilise la recherche aléatoire. Toute suggestion sur la façon d'aborder ce problème serait la bienvenue. Titre de la question : Python « TypeError: range() integer end argument expected, got float. » avec la fonction d'ajustement (fit).
Voici un exemple qui affiche l'erreur.
import scipy
import math
import numpy as np
import pandas as pd
import plotly.plotly as py
from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import KFold
# Script body: load the descriptor table from data.csv, then for each of ten
# K-fold splits run a RandomizedSearchCV over a RandomForestRegressor and
# append the selected hyper-parameters to Training.csv.
# NOTE(review): the indentation of this listing was lost, so the nesting of the
# with/for/while bodies cannot be read off the text as shown.
data = pd.read_csv("data.csv", sep=",")
# NOTE(review): `SubFeAll` is not defined anywhere in this listing; presumably
# it should be `data` -- confirm.
data = SubFeAll.fillna(SubFeAll.mean()) # Replace the NA values with the mean of the descriptor
header = data.columns.values # Use the column headers as the descriptor labels
data.head()
# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)
# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []
# Predictions results initialised
RFpredictions = []
metcount = 0
# Give the array from pandas to numpy
npArray = np.array(data)
print header.shape
# Descriptor labels: every header except the first and the last column
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
print X.shape
# Open output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"
# Mode 'w' truncates/creates the training log and writes its header lines.
with open(train_name,'w') as ftrain:
ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
ftrain.write("Fold %d ,\n" %(metcount+1))
# NOTE(review): the `with` statement already closes the file; this close() is redundant.
ftrain.close()
with open(fi_name,'w') as ffeatimp:
ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")
# NOTE(review): redundant for the same reason as above.
ffeatimp.close()
# Begin the K-fold cross validation over ten folds
kf = KFold(datax, n_folds=10)
print "------------------- Begining Ten Fold Cross Validation -------------------"
for train, test in kf:
# `train` and `test` index the rows of X and y for this fold.
XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
ytestdim = yTest.shape[0]
i = 0
# Append each true test value of this fold, rounded to 2 decimals, one per line.
with open (train_name, 'a') as ftrain:
while i< ytestdim :
ftrain.write(str(round(yTest[i],2))+',\n')
i += 1
ftrain.close()
print "\n"
# random forest grid search parameters
print "------------------- Begining Random Forest Grid Search -------------------"
# NOTE(review): "n_estimators" and "max_depth" are sampled from
# scipy.stats.expon, a continuous distribution, so the sampled values are
# floats. The traceback reported for this script fails in the forest's fit at
# `range(n_more_estimators)` with "integer end argument expected, got float",
# which is consistent with a float n_estimators. An integer-valued
# distribution (e.g. scipy.stats.randint) or a list of ints would avoid this.
rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)}
rf = RandomForestRegressor(random_state=0,n_jobs=2)
# n_iter=20: twenty parameter settings are sampled from the distributions above.
RfGridSearch = RandomizedSearchCV(rf,param_distributions=rfparamgrid,scoring='mean_squared_error',n_iter=20)
start = time()
RfGridSearch.fit(XTrain,yTrain)
# Get best random forest parameters
print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_)))
# Because of the comma, RFtime is a 2-tuple: (elapsed seconds, number of candidates).
RFtime = time() - start,len(RfGridSearch.grid_scores_)
# NOTE(review): `report` is not defined in this listing; presumably a helper
# defined elsewhere -- confirm.
report(RfGridSearch.grid_scores_)
print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators'])
ne = RfGridSearch.best_params_['n_estimators']
print("max_features = %s " % RfGridSearch.best_params_['max_features'])
mf = RfGridSearch.best_params_['max_features']
print("max_depth = %d " % RfGridSearch.best_params_['max_depth'])
md = RfGridSearch.best_params_['max_depth']
# Append the search time and the best parameters found to the training log.
with open (train_name, 'a') as ftrain:
ftrain.write("Random Forest")
ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
ftrain.write("Number of Trees, %s ,\n" % str(ne))
ftrain.write("Number of feature at split, %s ,\n" % str(mf))
ftrain.write("Max depth of tree, %s ,\n" % str(md))
ftrain.close()
L'erreur obtenue est donnée ci-dessous :
Traceback (most recent call last):
File "rgscv.py", line 81, in <module>
RfGridSearch.fit(XTrain,yTrain)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 996, in fit
return self._fit(X, y, sampled_params)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 553, in _fit
for parameters in parameter_iterable
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__
while self.dispatch_one_batch(iterator):
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch
job = ImmediateComputeBatch(batch)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__
self.results = batch()
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 276, in fit
for i in range(n_more_estimators):
TypeError: range() integer end argument expected, got float.
Au début, je pensais avoir simplement oublié un paramètre, mais exactement la même méthode avec une recherche sur grille classique (GridSearchCV) semble fonctionner sans problème. Le code correspondant est ci-dessous. Quelqu'un peut-il me suggérer ce qui cause cette erreur ?
import scipy
import math
import numpy as np
import pandas as pd
import plotly.plotly as py
from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import KFold
# Script body (exhaustive grid-search variant): load the descriptor table from
# data.csv, then for each of ten K-fold splits run a GridSearchCV over a
# RandomForestRegressor and append the selected hyper-parameters to
# Training.csv.
# NOTE(review): the original listing had lost its indentation; the nesting
# below (everything after the `for` line belongs to the fold loop) is the
# reconstruction that matches the per-fold log output -- confirm.
data = pd.read_csv("data.csv", sep=",")
# Replace the NA values with the mean of the descriptor (column).
# The original called SubFeAll.mean(), but `SubFeAll` is never defined in this
# script, which raises NameError; the frame loaded above is used instead.
data = data.fillna(data.mean())
header = data.columns.values  # Use the column headers as the descriptor labels
data.head()

# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)

# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []

# Predictions results initialised
RFpredictions = []
metcount = 0

# Give the array from pandas to numpy
npArray = np.array(data)
print(header.shape)
# Descriptor labels: every header except the first and the last column
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape

# Split the data into: names labels of the molecules ; y the True results ;
# X the descriptors for each data point
names = npArray[:, 0]
X = npArray[:, 1:-1].astype(float)
y = npArray[:, -1].astype(float)
X = preprocessing.scale(X)
print(X.shape)

# Open output files. Mode 'w' truncates/creates each file; the `with`
# statement closes it, so no explicit close() is needed.
train_name = "Training.csv"
fi_name = "Feature_importance.csv"
with open(train_name, 'w') as ftrain:
    ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
    ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Fold %d ,\n" % (metcount + 1))
with open(fi_name, 'w') as ffeatimp:
    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")

# Begin the K-fold cross validation over ten folds
kf = KFold(datax, n_folds=10)
print("------------------- Begining Ten Fold Cross Validation -------------------")
for train, test in kf:
    # `train` and `test` index the rows of X and y for this fold.
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]

    # Append each true test value of this fold, rounded to 2 decimals.
    with open(train_name, 'a') as ftrain:
        for true_value in yTest:
            ftrain.write(str(round(true_value, 2)) + ',\n')
    print("\n")

    # Random forest grid search parameters: explicit integer candidates for
    # n_estimators and max_depth, every combination is evaluated.
    print("------------------- Begining Random Forest Grid Search -------------------")
    rfparamgrid = {
        "n_estimators": [10, 20, 25, 50, 100, 1000],
        "max_features": ["auto", "sqrt", "log2"],
        "max_depth": [1, 2, 3, 5, 7, 10],
    }
    rf = RandomForestRegressor(random_state=0, n_jobs=2)
    RfGridSearch = GridSearchCV(rf, param_grid=rfparamgrid, scoring='mean_squared_error')
    start = time()
    RfGridSearch.fit(XTrain, yTrain)

    # Measure once so the printed and the logged search time agree.
    elapsed = time() - start
    n_candidates = len(RfGridSearch.grid_scores_)
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" % (elapsed, n_candidates))
    # Logged as a 2-tuple (elapsed seconds, number of candidates), as before.
    RFtime = (elapsed, n_candidates)
    # NOTE(review): `report` is not defined in this listing; presumably a
    # helper defined elsewhere -- confirm.
    report(RfGridSearch.grid_scores_)

    # Get best random forest parameters
    best = RfGridSearch.best_params_
    ne = best['n_estimators']
    mf = best['max_features']
    md = best['max_depth']
    print("n_estimators = %d " % ne)
    print("max_features = %s " % mf)
    print("max_depth = %d " % md)

    # Append the search time and the best parameters to the training log.
    with open(train_name, 'a') as ftrain:
        ftrain.write("Random Forest")
        ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
        ftrain.write("Number of Trees, %s ,\n" % str(ne))
        ftrain.write("Number of feature at split, %s ,\n" % str(mf))
        ftrain.write("Max depth of tree, %s ,\n" % str(md))
@coralv Il utilise clairement une librairie, c'est dans le répertoire site-packages. Ne posez pas de questions absurdes. – Natecat
Cette section de code est la fonction d'ajustement (fit) standard de la bibliothèque scikit-learn. Je n'ai pas du tout modifié ce code, et la même fonction est utilisée avec succès dans la version GridSearchCV, qui fonctionne comme prévu. – James
[scipy.stats.expon](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.expon.html#scipy.stats.expon) semble renvoyer un objet expon, qui ne semble pas fonctionner comme une liste, contrairement au second exemple. Est-ce que le fait de le remplacer par une liste corrige le problème ? – Natecat