2017-05-20 3 views
1

C'est ma première classification multiclasse. J'ai des données Xtrn et Ytrn ; Ytrn prend 5 valeurs distinctes [0,1,2,3,4]. Mais quand je lance l'apprentissage, j'obtiens l'erreur « multiclass format is not supported » (le format multiclasse n'est pas supporté). Voici un exemple de valeurs (sujet : Classification multi-classe dans xgboost (python)) :

      Xtrn        Ytrn 
-1.35173485 1.50224188 2.04951167 0.43759658 0.24381777  2 
2.81047260 1.31259056 1.39265240 0.16384002 0.65438366  3 
2.32878809 -1.92845940 -2.06453246 0.73132270 0.11771229  2 
-0.12810555 -2.07268765 -2.40760215 0.97855042 0.11144164  1 
1.88682063 0.75792329 -0.09754671 0.46571931 0.62111648  2 
-1.09361266 1.74758304 2.49960891 0.36679883 0.88895562  2 
0.71760095 -1.30711698 -2.15681966 0.33700593 0.07171119  2 
4.60060308 -1.60544855 -1.88996123 0.94500124 0.63776116  4 
-0.84223064 2.78233537 3.07299711 0.31470071 0.34424704  1 
-0.71236435 0.53140549 0.46677096 0.12320728 0.58829090  2 
-0.35333909 1.12463059 1.70104349 0.89084673 0.16585229  2 
3.04322100 -1.36878116 -2.31056167 0.81178387 0.04095645  1 
-1.04088918 -1.97497570 -1.93285343 0.54101882 0.02528487  1 
-0.41624939 0.54592833 0.95458283 0.40004902 0.55062705  2 
-1.77706795 0.29061278 0.68186697 0.17430716 0.75095729  0 

Voici le code :

#import data 
import pandas as pd 
import numpy as np 
from sklearn.cross_validation import train_test_split 
import xgboost as xgb 
from sklearn import metrics, cross_validation, grid_search, preprocessing 
# Load the training features, the training labels and the unlabeled test
# features. The feature files are ';'-separated and encode missing values as '?'.
Xtrn = pd.read_csv('x_train_secret.csv', header=None, delimiter=';', na_values='?')
Ytrn = pd.read_csv('y_train_secret.csv', header=None)
Test = pd.read_csv('x_test_secret.csv', header=None, delimiter=';', na_values='?')

# Number of distinct class labels present in Ytrn.
# NOTE: this is informational only; it is not passed to the model below
# (XGBClassifier is expected to derive the class count from the labels it is
# fitted on -- confirm against the installed xgboost version).
n_classes_ = len(np.unique(Ytrn))

# Hold out 30% of the labeled data for a final evaluation; the grid search
# below only ever sees the remaining 70%.
X_train, X_test, y_train, y_test = train_test_split(Xtrn, Ytrn, test_size=0.30, random_state=42)

xgb_model = xgb.XGBClassifier(objective='multi:softmax')

# Hyper-parameter grid: number of boosting rounds from 50 to 500 in steps of 50.
xgb_params = [
    {
        "n_estimators": range(50, 501, 50),
    }
]

# Stratified shuffle-split so every fold keeps the class proportions of y_train.
cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.3, random_state=42)

# 'roc_auc' only accepts binary / multilabel-indicator targets and raises
# "ValueError: multiclass format is not supported" on a 5-class problem,
# so score with a metric that is defined for multiclass targets.
xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='accuracy', cv=cv, n_jobs=-1, verbose=3)
xgb_grid.fit(X_train, y_train)

Voici l'erreur :

Fitting 5 folds for each of 10 candidates, totalling 50 fits 
[CV] n_estimators=50 ................................................. 
--------------------------------------------------------------------------- 
ValueError        Traceback (most recent call last) 
<ipython-input-233-77d3e8d4b8c3> in <module>() 
    10 
    11 xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3) 
---> 12 xgb_grid.fit(X_train, y_train) 

/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y) 
    827 
    828   """ 
--> 829   return self._fit(X, y, ParameterGrid(self.param_grid)) 
    830 
    831 

/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable) 
    571          self.fit_params, return_parameters=True, 
    572          error_score=self.error_score) 
--> 573     for parameters in parameter_iterable 
    574     for train, test in cv) 
    575 

/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable) 
    756    # was dispatched. In particular this covers the edge 
    757    # case of Parallel used with an exhausted iterator. 
--> 758    while self.dispatch_one_batch(iterator): 
    759     self._iterating = True 
    760    else: 

/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator) 
    606     return False 
    607    else: 
--> 608     self._dispatch(tasks) 
    609     return True 
    610 

/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch) 
    569   dispatch_timestamp = time.time() 
    570   cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) 
--> 571   job = self._backend.apply_async(batch, callback=cb) 
    572   self._jobs.append(job) 
    573 

/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback) 
    107  def apply_async(self, func, callback=None): 
    108   """Schedule a func to be run""" 
--> 109   result = ImmediateResult(func) 
    110   if callback: 
    111    callback(result) 

/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch) 
    324   # Don't delay the application, to avoid keeping the input 
    325   # arguments in memory 
--> 326   self.results = batch() 
    327 
    328  def get(self): 

/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self) 
    129 
    130  def __call__(self): 
--> 131   return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    132 
    133  def __len__(self): 

/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score) 
    1682 
    1683  else: 
-> 1684   test_score = _score(estimator, X_test, y_test, scorer) 
    1685   if return_train_score: 
    1686    train_score = _score(estimator, X_train, y_train, scorer) 

/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer) 
    1739   score = scorer(estimator, X_test) 
    1740  else: 
-> 1741   score = scorer(estimator, X_test, y_test) 
    1742  if hasattr(score, 'item'): 
    1743   try: 

/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, clf, X, y, sample_weight) 
    169   y_type = type_of_target(y) 
    170   if y_type not in ("binary", "multilabel-indicator"): 
--> 171    raise ValueError("{0} format is not supported".format(y_type)) 
    172 
    173   if is_regressor(clf): 

ValueError: multiclass format is not supported 

Répondre

1

J'ai trouvé la réponse. scoring='roc_auc' ne fonctionne que pour la classification binaire ; il faut utiliser une autre métrique (par ex. l'exactitude, scoring='accuracy'). De plus, la ligne xgb_params = [{'num_class': n_classes_}] doit être supprimée.