I think I'm having trouble getting my vectorizer to work in a gridsearch pipeline: ValueError in pipeline - FeatureHasher not working?

Data in the form of a pandas df, x_train:

        bathrooms  bedrooms  price                       building_id                        manager_id
10            1.5         3   3000  53a5b119ba8f7b61d4e010512e0dfc85  5ba989232d0489da1b5f2c45f6688adc
10000         1.0         2   5465  c5c8a357cba207596b04d1afd1e4f130  7533621a882f71e25173b27e3139d83d
100004        1.0         1   2850  c3ba40552e2120b0acfc3cb5730bb2aa  d9039c43983f6e564b1482b273bd7b01
100007        1.0         1   3275  28d9ad350afeaab8027513a3e52ac8d5  1067e078446a7897d2da493d2f741316
100013        1.0         4   3350                                 0  98e13ad4b495b9613cef886d79a6291f

numeric_predictors = ['bathrooms', 'bedrooms', 'price'] 
categorical_predictors = ['building_id', 'manager_id'] 

MinMaxScaler fit & transform:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler

class MyScaler(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        # Column names of the numeric variables to scale
        self.cols = cols

    def fit(self, X, y=None):
        self.scaler = MinMaxScaler()
        self.scaler.fit(X[self.cols])
        return self

    def transform(self, X):
        return self.scaler.transform(X[self.cols])
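
A quick sanity-check sketch of its behaviour on the toy frame above (hypothetical usage, assuming x_train is loaded as shown):

# fit_transform comes from TransformerMixin; each numeric column
# is rescaled to the [0, 1] range.
scaler = MyScaler(cols=numeric_predictors)
print(scaler.fit_transform(x_train))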

My categorical hashing vectorizer:

import pandas as pd
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer

class MyVectorizer(BaseEstimator, TransformerMixin):
    """
    Vectorize a set of categorical variables
    """

    def __init__(self, cols, hashing=None):
        """
        args:
            cols: a list of column names of the categorical variables
            hashing:
                If None, then vectorization is a simple one-hot-encoding.
                If an integer, then hashing is the number of features in the output.
        """
        self.cols = cols
        self.hashing = hashing

    def fit(self, X, y=None):
        data = X[self.cols]

        # Choose a vectorizer
        if self.hashing is None:
            self.myvec = HashingVectorizer()
        else:
            self.myvec = FeatureHasher(n_features=self.hashing)

        self.myvec.fit(X[self.cols].to_dict(orient='records'))
        return self

    def transform(self, X):
        # Vectorize input
        if self.hashing is None:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')),
                columns=self.myvec.feature_names_
            )
        else:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')).toarray()
            )
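
A quick usage sketch on the toy frame (hypothetical, assuming the definitions above):

# With an integer hashing, the output is a dense frame of that many columns.
vec = MyVectorizer(cols=categorical_predictors, hashing=5)
print(vec.fit_transform(x_train))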

GridSearch hyperparameters:

search_params = { 
    'preprocess__vectorize__hashing': [20, 40, 80], 
    'predict__alpha': [.01, .1, 1, 2, 10] 
} 
Pipeline:

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('preprocess', FeatureUnion([
        ('scale', MyScaler(cols=numeric_predictors)),
        ('vectorize', MyVectorizer(cols=categorical_predictors, hashing=5))
    ])),
    ('predict', MultinomialNB())
])

Finally, calling this with the GridSearchCV classifier:

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipeline, search_params)
grid_search.fit(x_train, y_train)

I get a ValueError: Input X must be non-negative. I've checked, and the data in my numeric_predictors columns are all non-negative, so I've narrowed it down to a problem with the hashing of the categorical predictors.

ValueError        Traceback (most recent call last) 
<ipython-input-62-50522376d1e5> in <module>() 
     1 grid_search = GridSearchCV(pipeline, search_params) 
----> 2 grid_search.fit(x_train, y_train) 
     3 grid_search.best_params_ 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc in fit(self, X, y, groups, **fit_params) 
    636         error_score=self.error_score) 
    637   for parameters, (train, test) in product(candidate_params, 
--> 638             cv.split(X, y, groups))) 
    639 
    640   # if one choose to see train score, "out" will contain train score info 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable) 
    777    # was dispatched. In particular this covers the edge 
    778    # case of Parallel used with an exhausted iterator. 
--> 779    while self.dispatch_one_batch(iterator): 
    780     self._iterating = True 
    781    else: 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator) 
    623     return False 
    624    else: 
--> 625     self._dispatch(tasks) 
    626     return True 
    627 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch) 
    586   dispatch_timestamp = time.time() 
    587   cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) 
--> 588   job = self._backend.apply_async(batch, callback=cb) 
    589   self._jobs.append(job) 
    590 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback) 
    109  def apply_async(self, func, callback=None): 
    110   """Schedule a func to be run""" 
--> 111   result = ImmediateResult(func) 
    112   if callback: 
    113    callback(result) 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch) 
    330   # Don't delay the application, to avoid keeping the input 
    331   # arguments in memory 
--> 332   self.results = batch() 
    333 
    334  def get(self): 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self) 
    129 
    130  def __call__(self): 
--> 131   return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    132 
    133  def __len__(self): 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score) 
    435    estimator.fit(X_train, **fit_params) 
    436   else: 
--> 437    estimator.fit(X_train, y_train, **fit_params) 
    438 
    439  except Exception as e: 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params) 
    257   Xt, fit_params = self._fit(X, y, **fit_params) 
    258   if self._final_estimator is not None: 
--> 259    self._final_estimator.fit(Xt, y, **fit_params) 
    260   return self 
    261 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in fit(self, X, y, sample_weight) 
    602   self.feature_count_ = np.zeros((n_effective_classes, n_features), 
    603          dtype=np.float64) 
--> 604   self._count(X, Y) 
    605   alpha = self._check_alpha() 
    606   self._update_feature_log_prob(alpha) 

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _count(self, X, Y) 
    706   """Count and smooth feature occurrences.""" 
    707   if np.any((X.data if issparse(X) else X) < 0): 
--> 708    raise ValueError("Input X must be non-negative") 
    709   self.feature_count_ += safe_sparse_dot(Y.T, X) 
    710   self.class_count_ += Y.sum(axis=0) 

ValueError: Input X must be non-negative 

Answer

Yes: when hashing is not None, FeatureHasher() is chosen, and it can output negative values.
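
For illustration, a minimal standalone sketch (with made-up feature values) of the signed hashing that produces those negatives:

from sklearn.feature_extraction import FeatureHasher

# FeatureHasher applies a signed hash: each feature is accumulated with a
# pseudo-random sign of +1 or -1, so output entries can come out negative,
# and MultinomialNB rejects any negative input.
hasher = FeatureHasher(n_features=5)
hashed = hasher.transform([{'building_id': 'abc', 'manager_id': 'xyz'}])
print(hashed.toarray())  # may contain -1.0 entries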

But you can convert those negative values to positive using the non_negative parameter of FeatureHasher, as described in the documentation:

non_negative : boolean, optional, default False

When True, an absolute value is applied to the features matrix prior to returning it. When used in conjunction with alternate_sign=True, this significantly reduces the inner product preservation property.

So change this line in MyVectorizer:

self.myvec = FeatureHasher(n_features = self.hashing) 

to this:

self.myvec = FeatureHasher(n_features = self.hashing, non_negative=True) 

Note:

  • This parameter has been deprecated since version 0.19 and will be removed in 0.21.
  • You should study how this parameter will affect your classification problem.
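
Given that deprecation, a forward-compatible alternative on scikit-learn >= 0.19 is to turn off the signed hash instead, which also keeps the output non-negative (a sketch against the 0.19+ API):

# Same change in MyVectorizer, using alternate_sign instead of non_negative:
# all features are then accumulated with sign +1.
self.myvec = FeatureHasher(n_features=self.hashing, alternate_sign=False)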

Thanks. I've started looking at LabelEncoder instead.