https://www.kaggle.com/c/dsi-us-5-project-2-regression-challenge/data
For this project we are tasked with predicting the sale prices of houses in Ames, Iowa, based on their characteristics.
As with all data science projects, the first step is to manage the data that we are given.
# Let's get the administrative stuff done first
# import all the libraries and set up the plotting
import datetime
import pandas as pd
import numpy as np
import scipy.stats as stats
from itertools import combinations
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')
sns.set(style="white", color_codes=True)
colors_palette = sns.color_palette("GnBu_d")
sns.set_palette(colors_palette)
# "GnBu_d" color palette
colors_str = ['#37535e', '#3b748a', '#4095b5', '#52aec9', '#72bfc4', '#93d0bf', '#c5e5eaff']
# Seaborn lags behind matplotlib on updates, so silence the resulting warnings
import warnings
warnings.filterwarnings('ignore')
These functions, when called, check the consistency of the data and determine how many NaNs we have to deal with.
# Check that sale prices are strictly positive
def check_y(y):
    y_min = y.min()
    if y_min <= 0:
        print("y should be strictly positive.")
    n_nan = y.isnull().sum()
    if n_nan > 0:
        print("y has {} NaNs".format(n_nan))
    print("y: min: {} mean: {} max: {} NaNs: {}".format(y_min, y.mean(), y.max(), n_nan))
    return
# Check that all the PIDs and Ids are unique
def check_ids(df):
    # Unique PIDs
    n_rows = df.shape[0]
    n_pids = len(df['PID'].unique())
    if n_pids != n_rows:
        print("# PIDs {} but {} rows.".format(n_pids, n_rows))
    # Unique Ids
    n_ids = len(df['Id'].unique())
    if n_ids != n_rows:
        print("# Ids {} but {} rows.".format(n_ids, n_rows))
    return
# Check which non-numeric columns are missing values and what the possible values are for each object column
def check_cols(df):
    check_cols = df.select_dtypes([np.object]).columns
    for col in check_cols:
        # print("Values of {} are {}.".format(col, df[col].unique()))
        n_nan = df[col].isnull().sum()
        if n_nan > 0:
            print("{} has {} NaNs".format(col, n_nan))
    return
# Check which numeric columns are missing values
def check_data(df):
    s = df.shape
    print("Rows: {} Cols: {}".format(s[0], s[1]))
    # Check for null values
    null_data = df.isnull().sum()
    null_data_count = sum(df.isnull().sum())
    if null_data_count > 0:
        print("There are {} null data.".format(null_data_count))
        print("Columns with NaN: {}".format(list(null_data[null_data > 0].index)))
        check_cols(df)
    return
These functions fill the NaNs with appropriate values. Given more time, we could make these functions more robust.
# Change the NaNs in numeric columns to a specified value
def clean_data(df, val):
    numeric_cols = df.select_dtypes([np.int64, np.float64, np.uint64]).columns
    df[numeric_cols] = df[numeric_cols].fillna(val)
    return
# Check the values of a categorical column and fill the NaNs with a specified value
def clean_check_col(df, col, possible_values, fill_value):
    df[col] = df[col].fillna(fill_value)
    values = set(df[col])
    if not values.issubset(possible_values):
        print("Problem with {}.".format(col))
    return
# Turn "quality" columns into numeric ones
def clean_quality_col(df, col, dict_values,fill_value):
possible_values = dict_values.keys()
clean_check_col(df, col,possible_values,fill_value)
for key,value in dict_values.items():
df[col] = df[col].map(lambda cell: cell.replace(key,str(value)))
df[col] = df[col].astype(int)
return
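As a quick sanity check of the quality mapping, here is what clean_quality_col does to a small throwaway DataFrame (df_demo is purely illustrative and not part of the pipeline):
df_demo = pd.DataFrame({'Exter Qual': ['Gd', 'TA', np.nan]})
clean_quality_col(df_demo, 'Exter Qual', {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': -999}, 'NA')
print(list(df_demo['Exter Qual']))  # [4, 3, -999]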
The training data is in ../data/train.csv and the test data is in ../data/test.csv.
# Read in the data from the csv files
# Check the Ids and set Id as the index
train_csv = '../data/train.csv'
df_train = pd.read_csv(train_csv)
check_ids(df_train)
df_train.set_index("Id", inplace=True)
kaggle_csv = '../data/test.csv'
df_kaggle = pd.read_csv(kaggle_csv)
check_ids(df_kaggle)
df_kaggle.set_index("Id", inplace=True)
# For the training data:
# Check y: warn if y has negative values or NaNs
y_label = 'SalePrice'
y = df_train[y_label]
check_y(y)
df_train.drop(columns=y_label, inplace=True)
print("Training data:")
check_data(df_train)
print("")
print("Target Kaggle data:")
check_data(df_kaggle)
df_train.describe()
df_kaggle.describe()
We see that there are lots of NaNs to deal with as well as some outliers in the data.
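Before cleaning, a one-off snippet like the following (not part of the pipeline itself) is a quick way to rank the columns by how many NaNs they contain:
nan_counts = df_train.isnull().sum().sort_values(ascending=False)
print(nan_counts[nan_counts > 0].head(15))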
# Keep track of columns that have been cleaned
cleaned_cols = []
na_val = -999
# These could probably be even more generalized ...
quality_cols = ['Exter Qual', 'Exter Cond', 'Kitchen Qual', 'Bsmt Qual', 'Heating QC', 'Fireplace Qu',
                'Garage Qual', 'Garage Cond', 'Bsmt Cond', 'Pool QC']
dict_values = {'Ex': 5,
               'Gd': 4,
               'TA': 3,
               'Fa': 2,
               'Po': 1,
               'NA': na_val}
for col in quality_cols:
    clean_quality_col(df_train, col, dict_values, "NA")
    clean_quality_col(df_kaggle, col, dict_values, "NA")
cleaned_cols = cleaned_cols + quality_cols
col = 'Land Slope'
dict_values = {'Gtl': 3,
               'Mod': 2,
               'Sev': 1,
               'NA': na_val}
clean_quality_col(df_train, col, dict_values, "NA")
clean_quality_col(df_kaggle, col, dict_values, "NA")
cleaned_cols.append(col)
col = 'Bsmt Exposure'
dict_values = {'Gd': 4,
               'Av': 3,
               'Mn': 2,
               'No': 1,
               'NA': na_val}
clean_quality_col(df_train, col, dict_values, "NA")
clean_quality_col(df_kaggle, col, dict_values, "NA")
cleaned_cols.append(col)
col = 'Garage Finish'
dict_values = {'Fin': 3,
               'RFn': 2,
               'Unf': 1,
               'NA': na_val}
clean_quality_col(df_train, col, dict_values, "NA")
clean_quality_col(df_kaggle, col, dict_values, "NA")
cleaned_cols.append(col)
col = 'Central Air'
dict_values = {'Y': 1,
               'N': na_val}
clean_quality_col(df_train, col, dict_values, "N")
clean_quality_col(df_kaggle, col, dict_values, "N")
cleaned_cols.append(col)
# Check categories
# Could be expanded to all categorical columns
col = 'Mas Vnr Type'
clean_check_col(df_train,col, {'None', 'BrkCmn', 'BrkFace', 'Stone', 'CBlock'},'None')
clean_check_col(df_kaggle,col, {'None', 'BrkCmn', 'BrkFace', 'Stone', 'CBlock'},'None')
cleaned_cols.append(col)
col = 'Alley'
clean_check_col(df_train,col,{'None', 'Pave', 'Grvl'},'None')
clean_check_col(df_kaggle,col,{'None', 'Pave', 'Grvl'},'None')
cleaned_cols.append(col)
col = 'BsmtFin Type 1'
clean_check_col(df_train,'BsmtFin Type 1',{'None', 'ALQ', 'BLQ', 'GLQ', 'Rec', 'LwQ', 'Unf'},'None')
clean_check_col(df_kaggle,'BsmtFin Type 1',{'None', 'ALQ', 'BLQ', 'GLQ', 'Rec', 'LwQ', 'Unf'},'None')
cleaned_cols.append(col)
col = 'BsmtFin Type 2'
clean_check_col(df_train,col,{'None', 'ALQ', 'BLQ', 'GLQ', 'Rec', 'LwQ', 'Unf'},'None')
clean_check_col(df_kaggle,col,{'None', 'ALQ', 'BLQ', 'GLQ', 'Rec', 'LwQ', 'Unf'},'None')
cleaned_cols.append(col)
col = 'Garage Type'
clean_check_col(df_train,col,{'None', 'Detchd', 'Attchd', 'BuiltIn', 'Basment', '2Types', 'CarPort'},'None')
clean_check_col(df_kaggle,col,{'None', 'Detchd', 'Attchd', 'BuiltIn', 'Basment', '2Types', 'CarPort'},'None')
cleaned_cols.append(col)
col = 'Fence'
clean_check_col(df_train,col,{'None', 'MnWw', 'MnPrv', 'GdWo', 'GdPrv'},'None')
clean_check_col(df_kaggle,col,{'None', 'MnWw', 'MnPrv', 'GdWo', 'GdPrv'},'None')
cleaned_cols.append(col)
col = 'Electrical'
clean_check_col(df_train,col,{'SBrkr', 'FuseP', 'FuseF', 'Mix', 'FuseA','NA'},'NA')
clean_check_col(df_kaggle,col,{'SBrkr', 'FuseP', 'FuseF', 'Mix', 'FuseA','NA'},'NA')
cleaned_cols.append(col)
col = 'Misc Feature'
clean_check_col(df_train,col,{'Elev', 'Gar2', 'Othr', 'Shed', 'TenC'},'Othr')
clean_check_col(df_kaggle,col,{'Elev', 'Gar2', 'Othr', 'Shed', 'TenC'},'Othr')
cleaned_cols.append(col)
# Update Lot Frontage
# Probably should have a better way to impute the value for missing data
col = 'Lot Frontage'
lf_mean = df_train[col].mean()
df_train.loc[:, col] = df_train.loc[:, col].fillna(lf_mean)
df_kaggle.loc[:, col] = df_kaggle.loc[:, col].fillna(lf_mean)
cleaned_cols.append(col)
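Given more time, a better imputation might use each house's Neighborhood; the sketch below is hypothetical (it would replace the mean fill above) and falls back to the overall mean for neighborhoods with no known frontage:
# Sketch only - impute Lot Frontage from the neighborhood median instead of the overall mean
nbhd_medians = df_train.groupby('Neighborhood')['Lot Frontage'].median()
df_train[col] = df_train[col].fillna(df_train['Neighborhood'].map(nbhd_medians)).fillna(lf_mean)
df_kaggle[col] = df_kaggle[col].fillna(df_kaggle['Neighborhood'].map(nbhd_medians)).fillna(lf_mean)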
# Change year, SF to float - should generalize this
cols = ['Gr Liv Area', '1st Flr SF', 'Year Built', 'Year Remod/Add']
for c in cols:
    df_train[c] = df_train[c].astype(float)
# Determine which columns haven't been cleaned and make sure that's ok
obj_cols = set(df_train.select_dtypes([np.object]).columns)
all_cleaned_cols = set(cleaned_cols)
not_cleaned = list(obj_cols - all_cleaned_cols)
print("You worked hard on the cleaning, but not hard enough. Here's what's left....\n {}".format(not_cleaned))
# Drop Utilities, MS SubClass, Garage Yr Blt, and PID
cols_drop = ['Utilities', 'MS SubClass', 'Garage Yr Blt','PID']
df_train.drop(cols_drop, axis=1, inplace=True)
df_kaggle.drop(cols_drop, axis=1, inplace=True)
check_data(df_train)
check_data(df_kaggle)
clean_data(df_train, 0) # or 0
clean_data(df_kaggle, 0) # or 0
check_data(df_train)
check_data(df_kaggle)
df_train.head()
# check_cols = df.select_dtypes([np.object]).columns
# df_train.select_dtypes([np.object]).columns
print("There are {} columns and {} categorial columns."
.format(df_train.shape[1],
df_train.select_dtypes([np.object]).shape[1]))
# Let's make dummies for the categorical data
df_train = pd.get_dummies(df_train)
df_kaggle = pd.get_dummies(df_kaggle)
# get_dummies could leave us with mismatched columns
# Make sure we have matching columns in the train and Kaggle DataFrames
cols_train = df_train.columns
cols_kaggle = df_kaggle.columns
for c in cols_train:
    if c not in cols_kaggle:
        df_kaggle[c] = 0
for c in cols_kaggle:
    if c not in cols_train:
        df_train[c] = 0
columns = sorted(df_train.columns)
df_train = df_train[columns]
df_kaggle = df_kaggle[columns]
if df_train.shape[1] != df_kaggle.shape[1]:
    print("Train and Kaggle don't have same # columns: {} {}".format(df_train.shape[1], df_kaggle.shape[1]))
# Having made dummies, we now have lots and lots of columns
print("There are {} columns and {} categorial columns."
.format(df_train.shape[1],
df_train.select_dtypes([np.object]).shape[1]))
# Let's find columns that are most highly correlated with price
top_corr = list(pd.concat([df_train, y], axis=1).corr()['SalePrice'].sort_values(ascending=False).index[1:16])
# Heat map of the columns most highly correlated with SalePrice
plt.figure(figsize=(10, 10))
sns.heatmap(pd.concat([df_train, y], axis=1)[['SalePrice'] + top_corr].corr().abs().sort_values(by='SalePrice', ascending=False),
            vmin=-1, vmax=1, cmap=colors_str, annot=True);
# Pair plot of features to see any trends
top_corr_sub = list(pd.concat([df_train, y], axis=1).corr()['SalePrice'].sort_values(ascending=False).index[1:8])
sns.pairplot(df_train[top_corr_sub]);
We can plot some of the columns to see if there are any trends in the distributions of values.
# Plots for data analysis
df_y = pd.concat([y, df_train[top_corr]], axis=1)
cols_y = ['SalePrice'] + top_corr
# Set up the matplotlib figure
f, axes = plt.subplots(5, 2, figsize=(15, 10))  # , sharex=True)
f.suptitle('Distributions of features correlated with Sales Price')
sns.despine(left=True)
plt.subplots_adjust(hspace=.5)
color = colors_str[3]
n_rows = 5
n_cols = 2
for r in range(n_rows):
    for c in range(n_cols):
        idx = r * 2 + c
        ax = axes[r, c]
        ax.set_title(cols_y[idx])
        sns.distplot(df_y.iloc[:, idx], kde=False, color=color, ax=ax)
        ax.set_xlabel('')
        ax.set_ylabel('');
What are the trends? Are there any outliers?
# Set up the matplotlib figure
f, axes = plt.subplots(5, 2)  # , sharex=True)
f.set_figheight(15)
f.set_figwidth(20)
f.suptitle('Comparisons')
sns.despine(left=True)
plt.subplots_adjust(hspace=.5)
color = colors_str[3]
n_rows = 5
n_cols = 2
for r in range(n_rows):
    for c in range(n_cols):
        idx = r * 2 + c
        ax = axes[r, c]
        ax.set_title(top_corr[idx] + " / Price")
        # Scatter or box plot based on the column type
        if df_train[top_corr[idx]].dtype == "float64":
            sns.regplot(x=top_corr[idx], y=y, data=df_train, color=color, ax=ax)
        else:
            sns.boxplot(x=top_corr[idx], y=y, data=df_train, color=color, ax=ax)
            sns.regplot(x=top_corr[idx], y=y, data=df_train, scatter=False, color=color, ax=ax)
        ax.set_xlabel(top_corr[idx])
        ax.set_ylabel('Price')
# Let's drop the outliers
df_train[y_label] = y
df_train = df_train[df_train['1st Flr SF'] <= 4000]
y = df_train[y_label]
df_train.drop(columns=y_label, inplace=True)
# Total square footage
df_train["Total SF"] = df_train["1st Flr SF"] + df_train["2nd Flr SF"] + df_train["BsmtFin SF 1"] + df_train["BsmtFin SF 2"]
df_kaggle["Total SF"] = df_kaggle["1st Flr SF"] + df_kaggle["2nd Flr SF"] + df_kaggle["BsmtFin SF 1"] + df_kaggle["BsmtFin SF 2"]
# Log of year built?
df_train["Log year"] = np.log(df_train["Year Built"])
df_kaggle["Log year"] = np.log(df_kaggle["Year Built"])
# Train the regression models on a subset of columns
def model_pipeline(df_train, df_kaggle, X_cols, y, reg_type, poly=False, plot=False):
    X = df_train[X_cols]
    # Transform the prices?
    transform_y = False
    if transform_y:
        y = np.log1p(y)
    # Test/train split of the "full training" data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # Set up the Pipeline based on input arguments
    pf = ('pf', PolynomialFeatures())
    ss = ('ss', StandardScaler())
    lr = ('lr', LinearRegression())
    rcv = ('rcv', RidgeCV(alphas=np.logspace(0, 5, 200), cv=8))
    lcv = ('lcv', LassoCV(n_alphas=500, max_iter=100000))
    ecv = ('ecv', ElasticNetCV(n_alphas=100, l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=10))
    pipe_list = []
    # Polynomial features?
    if poly:
        pipe_list.append(pf)
    # Scale the features
    pipe_list.append(ss)
    # Choose a regression algorithm
    if reg_type == 'lr':
        print("LinearRegression:")
        model = lr
    elif reg_type == 'rcv':
        print("RidgeCV:")
        model = rcv
    elif reg_type == 'lcv':
        print("LassoCV:")
        model = lcv
    else:  # ElasticNet
        print("ElasticNetCV:")
        model = ecv
    pipe_list.append(model)
    # Fit the pipeline to the training data
    pipe = Pipeline(pipe_list)
    pipe.fit(X_train, y_train)
    train_score = pipe.score(X_train, y_train)
    # Score the pipeline model on the test set
    test_score = pipe.score(X_test, y_test)
    y_test_hat = pipe.predict(X_test)
    # Refit on the FULL training set
    pipe.fit(X, y)
    full_score = pipe.score(X, y)
    y_hat = pipe.predict(X)
    y_hat_train = pipe.predict(X_train)
    y_hat_test = pipe.predict(X_test)
    print(" Train: {} Test: {} Full: {}".format(train_score, test_score, full_score))
    # Predict the sale price of the Kaggle data
    X_kaggle = df_kaggle[X_cols]
    y_kaggle = pipe.predict(X_kaggle)
    # "Un"-transform y
    if transform_y:
        y_kaggle = np.expm1(y_kaggle)
    # Create a DataFrame for the Kaggle submission
    df_soln = pd.DataFrame(df_kaggle.index)
    df_soln['SalePrice'] = y_kaggle
    df_soln.set_index(['Id'], inplace=True)
    if plot:
        # Plot predictions
        plt.figure(figsize=(10, 8))
        plt.title("Predictions for training and test sets")
        if transform_y:
            sns.regplot(x=np.expm1(y_hat_train), y=np.expm1(y_train), data=X_train, color=colors_str[0], label="Training data")
            sns.regplot(x=np.expm1(y_hat_test), y=np.expm1(y_test), data=X_test, color=colors_str[5], label="Testing data")
        else:
            sns.regplot(x=y_hat_train, y=y_train, data=X_train, color=colors_str[0], label="Training data")
            sns.regplot(x=y_hat_test, y=y_test, data=X_test, color=colors_str[5], label="Testing data")
        plt.xlabel("Predicted prices")
        plt.ylabel("Actual sale price")
        plt.legend()
    return pipe, df_soln
# Feature choices - all the columns, or the ones most highly correlated with price
X_cols = df_train.columns
X_cols_corr_16 = list(pd.concat([df_train[X_cols], y], axis=1).corr()['SalePrice'].sort_values(ascending=False).index[1:16])
X_cols_corr_32 = list(pd.concat([df_train[X_cols], y], axis=1).corr()['SalePrice'].sort_values(ascending=False).index[1:32])
# Try the pipeline with four types of models
reg_type = ['lr', 'rcv', 'lcv', 'ecv']
print("Test with 16 correlated columns and PolynomialFeatures.")
poly = True
for reg in reg_type:
    pipe, df_soln = model_pipeline(df_train, df_kaggle, X_cols_corr_16, y, reg, poly)
print("")
print("Test with 32 correlated columns but not PolynomialFeatures.")
poly = False
for reg in reg_type:
    pipe, df_soln = model_pipeline(df_train, df_kaggle, X_cols_corr_32, y, reg, poly)
# Test with 16 correlated columns and PolynomialFeatures.
# LinearRegression:
# Train: 0.9237060276826285 Test: 0.9104119554144706 Full: 0.9240686101171751
# RidgeCV:
# Train: 0.9082049558489937 Test: 0.9182713100564817 Full: 0.9126412891489258
# LassoCV:
# Train: 0.9075836962668186 Test: 0.9174018718403385 Full: 0.9119122701289253
# ElasticNetCV:
# Train: 0.9085340145571048 Test: 0.9176748695103848 Full: 0.9128014954661423
# Test with 32 correlated columns but not PolynomialFeatures.
# LinearRegression:
# Train: 0.887891014167355 Test: 0.8837176230519301 Full: 0.8887715167205408
# RidgeCV:
# Train: 0.8871598323113536 Test: 0.8854053254428426 Full: 0.8883266576770845
# LassoCV:
# Train: 0.8874196067096932 Test: 0.8846977558669055 Full: 0.8883275258306575
# ElasticNetCV:
# Train: 0.8874917522365515 Test: 0.8846394212576741 Full: 0.8884179846426734
# Run Lasso and Plot the residuals
poly=False
pipe, df_soln = model_pipeline(df_train, df_kaggle, X_cols_corr_32, y, "lcv", poly, True)
# LassoCV:
# Train: 0.8874196067096932 Test: 0.8846977558669055 Full: 0.8883275258306575
# Drop the columns that Lasso zeros out and run all the models again
df_lcv_coefs = pd.DataFrame({
    'coefs': X_cols_corr_32,
    'vals': pipe.named_steps['lcv'].coef_
}).set_index('coefs').sort_values('vals', ascending=False)
# Resample the correlated columns without the ones we are dropping
X_cols_drop = list(df_lcv_coefs[abs(df_lcv_coefs["vals"]) <= .001].index)
print("Dropping {} cols.".format(len(X_cols_drop)))
X_cols = [c for c in X_cols if c not in X_cols_drop]
X_cols_corr_16 = list(pd.concat([df_train[X_cols], y], axis=1).corr()['SalePrice'].sort_values(ascending=False).index[1:16])
X_cols_corr_32 = list(pd.concat([df_train[X_cols], y], axis=1).corr()['SalePrice'].sort_values(ascending=False).index[1:32])
print("Test with 16 correlated columns and PolynomialFeatures.")
poly = True
for reg in reg_type:
pipe, df_soln = model_pipeline(df_train, df_kaggle, X_cols_corr_16, y, reg, poly)
print("")
print("Test with 32 correlated columns but not PolynomialFeatures.")
poly = False
for reg in reg_type:
pipe, df_soln = model_pipeline(df_train, df_kaggle, X_cols_corr_32, y, reg, poly)
# Dropping 5 cols.
# Test with 16 correlated columns and PolynomialFeatures.
# LinearRegression:
# Train: 0.924656172862715 Test: 0.9102131066565704 Full: 0.9247377205942611
# RidgeCV:
# Train: 0.9126125651209006 Test: 0.9226946127601622 Full: 0.9161536786862323
# LassoCV:
# Train: 0.9007090632975836 Test: 0.9168593529560066 Full: 0.915073748596963
# ElasticNetCV:
# Train: 0.9124750143765084 Test: 0.9212219388967573 Full: 0.9162140013589439
# Test with 32 correlated columns but not PolynomialFeatures.
# LinearRegression:
# Train: 0.8878501960725624 Test: 0.8846577925136798 Full: 0.8887389940166835
# RidgeCV:
# Train: 0.8875644474569013 Test: 0.8860700482417241 Full: 0.8884603811274605
# LassoCV:
# Train: 0.887497488209082 Test: 0.8850722158098991 Full: 0.8886017100179591
# ElasticNetCV:
# Train: 0.887582287156061 Test: 0.8850357397814944 Full: 0.8886518360157969
# Run Lasso again
poly=False
pipe, df_soln = model_pipeline(df_train, df_kaggle, X_cols_corr_32, y, "lcv", poly)
df_lcv_coefs = pd.DataFrame({
    'coefs': X_cols_corr_32,
    'vals': pipe.named_steps['lcv'].coef_
}).set_index('coefs').sort_values('vals', ascending=False)
# Resample the correlated columns without the ones we are dropping
X_cols_drop = list(df_lcv_coefs[abs(df_lcv_coefs["vals"]) <= .001].index)
print("Dropping {} cols.".format(len(X_cols_drop)))
X_cols = [c for c in X_cols if c not in X_cols_drop]
X_cols_corr_16 = list(pd.concat([df_train[X_cols], y], axis=1).corr()['SalePrice'].sort_values(ascending=False).index[1:16])
X_cols_corr_32 = list(pd.concat([df_train[X_cols], y], axis=1).corr()['SalePrice'].sort_values(ascending=False).index[1:32])
# LassoCV:
# Train: 0.887497488209082 Test: 0.8850722158098991 Full: 0.8886017100179591
# Dropping 2 cols.
# Run one really big model now.
print("Test with 32 columns, LassoCV, and Polynomial Features.")
poly = True
pipe, df_soln = model_pipeline(df_train, df_kaggle, X_cols_corr_32, y, "lcv", poly, True)
# Test with 32 columns, LassoCV, and Polynomial Features.
# LassoCV:
# Train: 0.9295135127811229 Test: 0.9280722123044919 Full: 0.9303993065460205
print("The columns used were: {}.".format(X_cols_corr_32))
# Write the predicted Kaggle solution out to a file
now = str(datetime.datetime.now())
df_soln.to_csv(f'../data/model_preds_{now}.csv')
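Note that str(datetime.datetime.now()) puts spaces and colons into the filename; a filename-friendlier variant (just a sketch, not run here) would be:
# now = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
# df_soln.to_csv(f'../data/model_preds_{now}.csv')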
For some reason, my Kaggle scores were never very good with this model, and I am not sure why. I should investigate this further, maybe by entering the public Kaggle competition for the Ames housing data.
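One way to start investigating would be to look at the cross-validated RMSE of the final pipeline rather than just the R² scores printed above (a sketch, assuming the leaderboard scores on RMSE):
scores = cross_val_score(pipe, df_train[X_cols_corr_32], y, cv=5, scoring='neg_mean_squared_error')
print("CV RMSE: {:.0f}".format(np.sqrt(-scores).mean()))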
The best model I had used 32 columns, PolynomialFeatures, and LassoCV. The columns were:
['Overall Qual', 'Total SF', 'Gr Liv Area', 'Exter Qual', 'Kitchen Qual', 'Total Bsmt SF', 'Garage Area', '1st Flr SF', 'Log year', 'Year Remod/Add', 'Full Bath', 'Foundation_PConc', 'Mas Vnr Area', 'Fireplaces', 'BsmtFin Type 1_GLQ', 'Heating QC', 'Neighborhood_NridgHt', 'BsmtFin SF 1', 'Sale Type_New', 'Garage Type_Attchd', 'Lot Frontage', 'Open Porch SF', 'Wood Deck SF', 'Mas Vnr Type_Stone', 'Lot Area', 'Lot Shape_IR1', 'Roof Style_Hip', 'Neighborhood_NoRidge', 'Mas Vnr Type_BrkFace', 'Neighborhood_StoneBr', 'Electrical_SBrkr']