Je suis très nouveau à ce sujet, donc toute sorte d'information serait utile. Excuses, si j'ai posé une question très triviale. Je travaille sur un ensemble de données de taille moyenne avec beaucoup de zéros. Nous avons appliqué beaucoup de modèles et le score de cv-skf pour k = 10 a franchi 0.85 mais le score de roc_auc est bloqué autour de 0.5. J'utilise sklearn. Voici l'extrait de code ci-dessous.Obtenir un bon score de validation croisée mais un très mauvais score Roc_auc
train_dataset = pd.read_csv('./input/train.csv', index_col='ID')
test_dataset = pd.read_csv('./input/test.csv', index_col='ID')
#print_shapes()
# How many nulls are there in the datasets?
nulls_train = (train_dataset.isnull().sum()==1).sum()
nulls_test = (test_dataset.isnull().sum()==1).sum()
#print('There are {} nulls in TRAIN and {} nulls in TEST dataset.'.format(nulls_train, nulls_test))
# Remove constant features
def identify_constant_features(dataframe):
count_uniques = dataframe.apply(lambda x: len(x.unique()))
constants = count_uniques[count_uniques == 1].index.tolist()
return constants
constant_features_train = set(identify_constant_features(train_dataset))
#print('There were {} constant features in TRAIN dataset.'.format(len(constant_features_train)))
# Drop the constant features
train_dataset.drop(constant_features_train, inplace=True, axis=1)
#print_shapes()
# Remove equals features
def identify_equal_features(dataframe):
features_to_compare = list(combinations(dataframe.columns.tolist(),2))
equal_features = []
for compare in features_to_compare:
is_equal = array_equal(dataframe[compare[0]],dataframe[compare[1]])
if is_equal:
equal_features.append(list(compare))
return equal_features
equal_features_train = identify_equal_features(train_dataset)
#print('There were {} pairs of equal features in TRAIN dataset.'.format(len(equal_features_train)))
# Remove the second feature of each pair.
features_to_drop = array(equal_features_train)[:,1]
train_dataset.drop(features_to_drop, axis=1, inplace=True)
#print_shapes()
# Define the variables model.
y_name = 'TARGET'
feature_names = train_dataset.columns.tolist()
feature_names.remove(y_name)
X = train_dataset[feature_names]
y = train_dataset[y_name]
# Save the features selected for later use.
pd.Series(feature_names).to_csv('features_selected_step1.csv', index=False)
#print('Features selected\n{}'.format(feature_names))
# Proportion of classes
y.value_counts()/len(y)
skf = cv.StratifiedKFold(y, n_folds=10, shuffle=True)
score_metric = 'roc_auc'
scores = {}
def score_model(model):
return cv.cross_val_score(model, X, y, cv=skf, scoring=score_metric)
clfxgb = xgb.XGBClassifier()
clfxgb = clfxgb.fit(X, y)
probxgb = clfxgb.predict(X)
# #print 'XGB', np.shape(probxgb)
print metrics.roc_auc_score(y, probxgb)
sortie - Remplissage de l'espace de noms interactive de numpy et matplotlib test.csv train.csv
0,502140359687
Pour cv-skf -
cv.cross_val_score(xgb.XGBClassifier(), X, y, cv=skf, scoring=score_metric)
sortie - array ([0,83124251, 0,84162387, 0,83580491])
Nous soumettons le fichier .csv que -
test_dataset.drop(constant_features_train, inplace=True, axis=1)
test_dataset.drop(features_to_drop, axis=1, inplace=True)
print test_dataset.shape
X_SubTest = test_dataset
df_test = pd.read_csv('./input/test.csv')
id_test = df_test['ID']
predTest = model.predict(X_SubTest)
submission = pd.DataFrame({"ID":id_test, "TARGET":predTest})
submission.to_csv("submission_svm_23-3.csv", index=False)