Grid searching data pipelines for wine quality

I collected a small set of crowd sourced wine data from the web to show how pipelines and grid search can be used together to run data analysis while being careful not to leak information from the test set into the training data.

##Load the data

data = response.json()  ## The data is in JSON format
df = pd.DataFrame(data)
df.head(10)
Color Consumed In Country Grape Name Price Region Score Vintage Vinyard
0 W 2015 Portugal Portugal 4 2013 Vinho Verde
1 W 2015 France 17.8 France 3 2013 Peyruchet
2 W 2015 Oregon 20 Oregon 3 2013 Abacela
3 W 2015 Spain chardonay 7 Spain 2.5 2012 Ochoa
4 R 2015 US chiraz, cab Spice Trader 6 3 2012 Heartland
5 R 2015 US cab 13 California 3.5 2012 Crow Canyon
6 R 2015 US #14 21 Oregon 2.5 2013 Abacela
7 R 2015 France merlot, cab 12 Bordeaux 3.5 2012 David Beaulieu
8 R 2015 France merlot, cab 11.99 Medoc 3.5 2011 Chantemerle
9 R 2015 US merlot 13 Washington 4 2011 Hyatt

Split the data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.2, random_state=42)

Set up classes for custom pipeline

class ModelTransformer(BaseEstimator,TransformerMixin):

    def __init__(self, model=None):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return self.model.transform(X)
    
class SampleExtractor(BaseEstimator, TransformerMixin):
    """Takes in varaible names as a **list**"""

    def __init__(self, vars):
        self.vars = vars  # e.g. pass in a column names to extract

    def transform(self, X, y=None):
        if len(self.vars) > 1:
            return pd.DataFrame(X[self.vars]) # where the actual feature extraction happens
        else:
            return pd.Series(X[self.vars[0]])

    def fit(self, X, y=None):
        return self  # generally does nothing
    
    
class DenseTransformer(BaseEstimator,TransformerMixin):

    def transform(self, X, y=None, **fit_params):
#         print X.todense()
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

Run data with KFolds cross validation

kf_shuffle = StratifiedKFold(n_splits=3,shuffle=True,random_state=777)

binary = True
feats = 5

pipeline = Pipeline([
    ('features', FeatureUnion([
        ('Color', Pipeline([
                      ('text',SampleExtractor(['Color'])),
                      ('dummify', CountVectorizer(binary=binary, max_features=feats)),
                      ('densify', DenseTransformer()),
                     ])),
        ('Country', Pipeline([
                      ('text',SampleExtractor(['Country'])),
                      ('dummify', CountVectorizer(binary=binary, max_features=feats)),
                      ('densify', DenseTransformer()),
                     ])),
        ('Grape', Pipeline([
                      ('text',SampleExtractor(['Grape'])),
                      ('dummify', CountVectorizer(binary=binary, max_features=feats)),
                      ('densify', DenseTransformer()),
                     ])),
        ('Name', Pipeline([
                      ('text',SampleExtractor(['Name'])),
                      ('dummify', CountVectorizer(binary=binary, max_features=feats)),
                      ('densify', DenseTransformer()),
                     ])),
        ('Region', Pipeline([
                      ('text',SampleExtractor(['Region'])),
                      ('dummify', CountVectorizer(binary=binary, max_features=feats)),
                      ('densify', DenseTransformer()),
                     ])),
        ('Vinyard', Pipeline([
                      ('text',SampleExtractor(['Vinyard'])),
                      ('dummify', CountVectorizer(binary=binary, max_features=feats)),
                      ('densify', DenseTransformer()),
                     ])),
        ('cont_features', Pipeline([
                      ('continuous', SampleExtractor(['Consumed In', 'Price', 'Vintage'])),
                      ('impute',Imputer()),
                      ])),
        ])),
        ('scale', ModelTransformer()),
        ('tree', tree.DecisionTreeRegressor()),
])


parameters = {
    'features__Color__dummify__analyzer':['char'],
    'scale__model': (StandardScaler(),MinMaxScaler()),
    'tree__max_depth': (2,3,4,None),
    'tree__min_samples_split': (2,3,4,5),
}

grid_search = GridSearchCV(pipeline, parameters, verbose=False, cv=kf_shuffle)

Execute the pipeline

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)


grid_search.fit(X_train, y_train)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


cv_pred = pd.Series(grid_search.predict(X_test))
Performing grid search...
('pipeline:', ['features', 'scale', 'tree'])
parameters:
{'tree__min_samples_split': (2, 3, 4, 5), 'tree__max_depth': (2, 3, 4, None), 'scale__model': (StandardScaler(copy=True, with_mean=True, with_std=True), MinMaxScaler(copy=True, feature_range=(0, 1))), 'features__Color__dummify__analyzer': ['char']}
Best score: 0.964
Best parameters set:
	features__Color__dummify__analyzer: 'char'
	scale__model: StandardScaler(copy=True, with_mean=True, with_std=True)
	tree__max_depth: 2
	tree__min_samples_split: 3

Take a peek at the results

pd.DataFrame(zip(grid_search.cv_results_['mean_test_score'],\
                 grid_search.cv_results_['std_test_score']\
                )).sort_values(0,ascending=False).head(10)

0 1
17 0.963752 0.009070
1 0.963752 0.009070
20 0.958414 0.010273
13 0.955670 0.017514
15 0.955079 0.007642
27 0.955020 0.010817
6 0.954595 0.010570
26 0.952498 0.011002
9 0.838367 0.134405
22 0.835760 0.139155
grid_search.score(X_test,y_test) # prints the R2 for the best predictor
0.46448936196721591
plt.scatter(y_test,cv_pred,color='r')
plt.plot(y_test,y_test,color='k')
plt.xlabel("True value")
plt.ylabel("Predicted Value")
plt.show()

png

plt.scatter(y_test,y_test.values-cv_pred.values,color='r')
plt.plot(y_test,y_test-y_test,color='k')
plt.xlabel("True value")
plt.ylabel("Residual")
plt.show()

png

Results Summary

I was able to run a grid search on the data, and found that two models fit the training data equally well. When applied as a predictor on the test set, I get an R-squared value of 46.4%. Looking at the plots, I seem to be over fit and predicting a wild outlier at the right side. There is also a linear pattern to my residuals from one model, but the small sample size makes it hard to predict if that is by chance.