import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
bike_df_clean = pd.read_csv("bike_rental_clean.csv")
bike_df_clean.head(10)
 | season | yr | mnth | holiday | weathersit | temp | hum | windspeed | cnt
---|---|---|---|---|---|---|---|---|---
0 | 1 | 0 | 1 | 0 | 2 | 0.344167 | 0.805833 | 0.160446 | 985 |
1 | 1 | 0 | 1 | 0 | 2 | 0.363478 | 0.696087 | 0.248539 | 801 |
2 | 1 | 0 | 1 | 0 | 1 | 0.196364 | 0.437273 | 0.248309 | 1349 |
3 | 1 | 0 | 1 | 0 | 1 | 0.200000 | 0.590435 | 0.160296 | 1562 |
4 | 1 | 0 | 1 | 0 | 1 | 0.226957 | 0.436957 | 0.186900 | 1600 |
5 | 1 | 0 | 1 | 0 | 1 | 0.204348 | 0.518261 | 0.089565 | 1606 |
6 | 1 | 0 | 1 | 0 | 2 | 0.196522 | 0.498696 | 0.168726 | 1510 |
7 | 1 | 0 | 1 | 0 | 2 | 0.165000 | 0.535833 | 0.266804 | 959 |
8 | 1 | 0 | 1 | 0 | 1 | 0.138333 | 0.434167 | 0.361950 | 822 |
9 | 1 | 0 | 1 | 0 | 1 | 0.150833 | 0.482917 | 0.223267 | 1321 |
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   season      731 non-null    int64
 1   yr          731 non-null    int64
 2   mnth        731 non-null    int64
 3   holiday     731 non-null    int64
 4   weathersit  731 non-null    int64
 5   temp        731 non-null    float64
 6   hum         731 non-null    float64
 7   windspeed   731 non-null    float64
 8   cnt         731 non-null    int64
dtypes: float64(3), int64(6)
memory usage: 51.5 KB
# Convert season, yr, mnth, holiday and weathersit to categorical variables
bike_df_clean['season'] = bike_df_clean['season'].astype('category')
bike_df_clean['mnth'] = bike_df_clean['mnth'].astype('category')
bike_df_clean['yr'] = bike_df_clean['yr'].astype('category')
bike_df_clean['holiday'] = bike_df_clean['holiday'].astype('category')
bike_df_clean['weathersit'] = bike_df_clean['weathersit'].astype('category')
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   season      731 non-null    category
 1   yr          731 non-null    category
 2   mnth        731 non-null    category
 3   holiday     731 non-null    category
 4   weathersit  731 non-null    category
 5   temp        731 non-null    float64
 6   hum         731 non-null    float64
 7   windspeed   731 non-null    float64
 8   cnt         731 non-null    int64
dtypes: category(5), float64(3), int64(1)
memory usage: 27.5 KB
*Note: a PCA step is included in the model evaluation section below, as part of the tuned pipeline.*
# Separate target
features_name = bike_df_clean.columns[:8]
X = bike_df_clean[features_name]
y = bike_df_clean["cnt"]
print("The features are", X.columns)
print("The target variable is", y.name)
The features are Index(['season', 'yr', 'mnth', 'holiday', 'weathersit', 'temp', 'hum', 'windspeed'], dtype='object')
The target variable is cnt
# Split into train and test sets using the same random_state value as the other models (default test_size=0.25)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
print('Training dataset: X_train=', X_train.shape, ', y_train', y_train.shape)
print('Testing dataset: X_test=', X_test.shape, ', y_test', y_test.shape)
Training dataset: X_train= (548, 8) , y_train (548,)
Testing dataset: X_test= (183, 8) , y_test (183,)
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
# Setup the pipeline
pipe_polyr = Pipeline([('scaler', StandardScaler()),
                       ('poly', PolynomialFeatures()),
                       ('linear', LinearRegression())])
# Fit the model
pipe_polyr.fit(X_train, y_train)
# predict using model
y_train_polyr = pipe_polyr.predict(X_train)
y_pred_polyr = pipe_polyr.predict(X_test)
# Get scores
MSE_polyr_train = mean_squared_error(y_train, y_train_polyr)
MSE_polyr_test = mean_squared_error(y_test, y_pred_polyr)
RMSE_polyr_train = np.sqrt(MSE_polyr_train)
RMSE_polyr_test = np.sqrt(MSE_polyr_test)
print('Polynomial Regression MSE train:', round(MSE_polyr_train, 2))
print('Polynomial Regression MSE test:', round(MSE_polyr_test, 2))
print('Polynomial Regression RMSE train:', round(RMSE_polyr_train, 2))
print('Polynomial Regression RMSE test:', round(RMSE_polyr_test, 2))
Polynomial Regression MSE train: 448634.32
Polynomial Regression MSE test: 479039.91
Polynomial Regression RMSE train: 669.8
Polynomial Regression RMSE test: 692.13
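For context, these errors are later compared against the naive and dummy baselines from the earlier baseline section. As a self-contained reminder, a minimal sketch of a mean-predicting dummy baseline (an assumption; the notebook's actual baseline code may differ):
from sklearn.dummy import DummyRegressor
# Baseline that always predicts the training-set mean of cnt
dummy = DummyRegressor(strategy='mean').fit(X_train, y_train)
RMSE_dummy_test = np.sqrt(mean_squared_error(y_test, dummy.predict(X_test)))
print('Dummy (mean) RMSE test:', round(RMSE_dummy_test, 2))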
# Note: PolynomialFeatures(degree=2) expands the 8 inputs into many terms (including a
# bias column), so zip truncates to the first 8 coefficients and the pairing with the
# original feature names is misleading; a corrected pairing is sketched below.
coeff = pipe_polyr.named_steps['linear'].coef_
print(list(zip(coeff, features_name)))
[(-24115144773.315323, 'season'), (169.34231277360865, 'yr'), (-854260142813.9939, 'mnth'), (383.68708888068795, 'holiday'), (1876942697980.039, 'weathersit'), (-142.00546350708385, 'temp'), (849.6071533818828, 'hum'), (-365.84366104206487, 'windspeed')]
pipe_polyr.named_steps['poly'].degree
2
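A more faithful pairing maps each coefficient to its expanded polynomial term. A sketch, assuming scikit-learn >= 1.0 for get_feature_names_out:
poly_names = pipe_polyr.named_steps['poly'].get_feature_names_out(features_name)
coef_by_term = pd.Series(pipe_polyr.named_steps['linear'].coef_, index=poly_names)
# Show the largest-magnitude terms first
print(coef_by_term.sort_values(key=abs, ascending=False).head(10))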
# Setup the pipeline
pipe_polyr = Pipeline([('scaler', StandardScaler()),
                       ('pca', PCA()),
                       ('poly', PolynomialFeatures()),
                       ('linear', LinearRegression())])
# Setup hyperparameters to search
params = {'poly__degree': [1, 2, 3, 4, 5],
          'poly__interaction_only': [False, True],
          'scaler': [StandardScaler(), RobustScaler(), None],
          'pca': [PCA(2), PCA(3), PCA(4), PCA(5), PCA(6), None]}
# Perform grid search for best parameters
polyr = GridSearchCV(pipe_polyr, param_grid=params, cv=5,
                     scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)
# Fit the model
polyr.fit(X_train, y_train)
# Get scores (with scoring='neg_root_mean_squared_error', score() returns the negative RMSE, hence the *-1)
print("Best Cross-Validation RMSE: {:.2f}".format(polyr.best_score_*-1))
print("Training RMSE: {:.2f}".format(polyr.score(X_train, y_train)*-1))
print("Testing RMSE: {:.2f}".format(polyr.score(X_test, y_test)*-1))
print("Training MSE: {:.2f}".format((polyr.score(X_train, y_train)*-1)**2))
print("Testing MSE: {:.2f}".format((polyr.score(X_test, y_test)*-1)**2))
print("Best Hyperparameters: {}".format(polyr.best_params_), "\n")
Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best Cross-Validation RMSE: 681.90
Training RMSE: 572.56
Testing RMSE: 668.46
Training MSE: 327830.24
Testing MSE: 446838.94
Best Hyperparameters: {'pca': PCA(n_components=5), 'poly__degree': 3, 'poly__interaction_only': False, 'scaler': RobustScaler()}
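Because refit=True by default, GridSearchCV refits the best pipeline on the full training set, so its steps can be inspected directly. For example, a quick sketch of the total variance retained by the chosen PCA(5):
best_pipe = polyr.best_estimator_
# Share of variance captured by the 5 retained principal components
print(best_pipe.named_steps['pca'].explained_variance_ratio_.sum())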
pd.DataFrame(polyr.cv_results_).sort_values('rank_test_score').head(10)
 | mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_pca | param_poly__degree | param_poly__interaction_only | param_scaler | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
103 | 0.023021 | 0.018423 | 0.002803 | 0.000749 | PCA(n_components=5) | 3 | False | RobustScaler() | {'pca': PCA(n_components=5), 'poly__degree': 3... | -699.422417 | -668.468640 | -650.357674 | -692.395412 | -698.856760 | -681.900181 | 19.390927 | 1 |
136 | 0.008007 | 0.000633 | 0.018216 | 0.019546 | PCA(n_components=6) | 3 | True | RobustScaler() | {'pca': PCA(n_components=6), 'poly__degree': 3... | -823.873378 | -672.741833 | -818.249687 | -793.157706 | -765.798005 | -774.764122 | 55.009047 | 2 |
106 | 0.016015 | 0.016027 | 0.002202 | 0.000400 | PCA(n_components=5) | 3 | True | RobustScaler() | {'pca': PCA(n_components=5), 'poly__degree': 3... | -847.425420 | -709.427975 | -811.614241 | -829.333999 | -754.314036 | -790.423134 | 51.139929 | 3 |
142 | 0.024222 | 0.019526 | 0.004404 | 0.002335 | PCA(n_components=6) | 4 | True | RobustScaler() | {'pca': PCA(n_components=6), 'poly__degree': 4... | -867.414479 | -683.230718 | -847.189281 | -879.867974 | -725.875431 | -800.715577 | 80.345990 | 4 |
112 | 0.020419 | 0.014007 | 0.015414 | 0.022377 | PCA(n_components=5) | 4 | True | RobustScaler() | {'pca': PCA(n_components=5), 'poly__degree': 4... | -854.849890 | -732.886398 | -800.242036 | -846.899536 | -771.127344 | -801.201041 | 45.910323 | 5 |
127 | 0.027024 | 0.023482 | 0.002603 | 0.000801 | PCA(n_components=6) | 2 | False | RobustScaler() | {'pca': PCA(n_components=6), 'poly__degree': 2... | -787.527862 | -675.306735 | -796.957585 | -756.668067 | -1004.140846 | -804.120219 | 108.792905 | 6 |
97 | 0.018817 | 0.015211 | 0.002602 | 0.000801 | PCA(n_components=5) | 2 | False | RobustScaler() | {'pca': PCA(n_components=5), 'poly__degree': 2... | -824.427024 | -716.005051 | -833.077474 | -805.084285 | -848.385208 | -805.395808 | 46.834506 | 7 |
130 | 0.009208 | 0.002484 | 0.024222 | 0.030434 | PCA(n_components=6) | 2 | True | RobustScaler() | {'pca': PCA(n_components=6), 'poly__degree': 2... | -826.358997 | -686.357752 | -868.215475 | -836.073438 | -820.847504 | -807.570633 | 62.788609 | 8 |
100 | 0.015614 | 0.015732 | 0.002802 | 0.001601 | PCA(n_components=5) | 2 | True | RobustScaler() | {'pca': PCA(n_components=5), 'poly__degree': 2... | -861.152297 | -739.166501 | -876.496525 | -849.084464 | -816.045648 | -828.389087 | 48.844821 | 9 |
126 | 0.019417 | 0.024333 | 0.002402 | 0.000490 | PCA(n_components=6) | 2 | False | StandardScaler() | {'pca': PCA(n_components=6), 'poly__degree': 2... | -850.578841 | -784.054450 | -863.420044 | -791.731181 | -870.073131 | -831.971529 | 36.612308 | 10 |
# Predict Values
y_pred_polyr = polyr.predict(X_test)
# Plot Predicted vs Actual
plt.figure(figsize=(12,10))
sns.scatterplot(x=y_test, y=y_pred_polyr, color = "darkorange")
p1 = max(max(y_pred_polyr), max(y_test))
p2 = min(min(y_pred_polyr), min(y_test))
plt.plot([p1, p2], [p1, p2], '-', color = "blueviolet")
plt.xlim(0,10000)
plt.xlabel('Actual')
plt.ylabel('Predictions')
plt.title('Predicted vs Actual (Polynomial Regression)', fontsize=16)
plt.show()
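A residual plot complements the predicted-vs-actual view by making systematic over- or under-prediction easier to spot. A minimal sketch using the same variables:
residuals = y_test - y_pred_polyr
plt.figure(figsize=(12, 6))
sns.scatterplot(x=y_pred_polyr, y=residuals, color="darkorange")
plt.axhline(0, color="blueviolet")
plt.xlabel('Predicted')
plt.ylabel('Residual')
plt.title('Residuals vs Predicted (Polynomial Regression)', fontsize=16)
plt.show()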
Both polynomial regressions performed better than the Naive and Dummy baselines, and the hyperparameter-tuned model performed best overall. As with the linear regression, the sign and magnitude of each coefficient indicate its relationship to the target variable; note, however, that in the tuned pipeline the coefficients apply to polynomial terms of the principal components rather than to the original features.