import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
bike_df_clean = pd.read_csv("bike_rental_clean.csv")
bike_df_clean.head(10)
 | season | yr | mnth | holiday | weathersit | temp | hum | windspeed | cnt
---|---|---|---|---|---|---|---|---|---
0 | 1 | 0 | 1 | 0 | 2 | 0.344167 | 0.805833 | 0.160446 | 985 |
1 | 1 | 0 | 1 | 0 | 2 | 0.363478 | 0.696087 | 0.248539 | 801 |
2 | 1 | 0 | 1 | 0 | 1 | 0.196364 | 0.437273 | 0.248309 | 1349 |
3 | 1 | 0 | 1 | 0 | 1 | 0.200000 | 0.590435 | 0.160296 | 1562 |
4 | 1 | 0 | 1 | 0 | 1 | 0.226957 | 0.436957 | 0.186900 | 1600 |
5 | 1 | 0 | 1 | 0 | 1 | 0.204348 | 0.518261 | 0.089565 | 1606 |
6 | 1 | 0 | 1 | 0 | 2 | 0.196522 | 0.498696 | 0.168726 | 1510 |
7 | 1 | 0 | 1 | 0 | 2 | 0.165000 | 0.535833 | 0.266804 | 959 |
8 | 1 | 0 | 1 | 0 | 1 | 0.138333 | 0.434167 | 0.361950 | 822 |
9 | 1 | 0 | 1 | 0 | 1 | 0.150833 | 0.482917 | 0.223267 | 1321 |
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   season      731 non-null    int64
 1   yr          731 non-null    int64
 2   mnth        731 non-null    int64
 3   holiday     731 non-null    int64
 4   weathersit  731 non-null    int64
 5   temp        731 non-null    float64
 6   hum         731 non-null    float64
 7   windspeed   731 non-null    float64
 8   cnt         731 non-null    int64
dtypes: float64(3), int64(6)
memory usage: 51.5 KB
# Convert season, yr, mnth, holiday and weathersit to categorical variables
bike_df_clean['season'] = bike_df_clean['season'].astype('category')
bike_df_clean['mnth'] = bike_df_clean['mnth'].astype('category')
bike_df_clean['yr'] = bike_df_clean['yr'].astype('category')
bike_df_clean['holiday'] = bike_df_clean['holiday'].astype('category')
bike_df_clean['weathersit'] = bike_df_clean['weathersit'].astype('category')
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   season      731 non-null    category
 1   yr          731 non-null    category
 2   mnth        731 non-null    category
 3   holiday     731 non-null    category
 4   weathersit  731 non-null    category
 5   temp        731 non-null    float64
 6   hum         731 non-null    float64
 7   windspeed   731 non-null    float64
 8   cnt         731 non-null    int64
dtypes: category(5), float64(3), int64(1)
memory usage: 27.5 KB
*Note: a PCA step is included in the model evaluation section below, as part of the tuned pipeline.*
# Separate target
features_name = bike_df_clean.columns[:8]
X = bike_df_clean[features_name]
y = bike_df_clean["cnt"]
print("The features are", X.columns)
print("The target variable is", y.name)
The features are Index(['season', 'yr', 'mnth', 'holiday', 'weathersit', 'temp', 'hum', 'windspeed'], dtype='object')
The target variable is cnt
# Split into train and test sets using the same random_state value as the other models (default test_size=0.25)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
print('Training dataset: X_train=', X_train.shape, ', y_train', y_train.shape)
print('Testing dataset: X_test=', X_test.shape, ', y_test', y_test.shape)
Training dataset: X_train= (548, 8) , y_train (548,)
Testing dataset: X_test= (183, 8) , y_test (183,)
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
# Setup the pipeline
pipe_polyr = Pipeline([('scaler', StandardScaler()),
                       ('poly', PolynomialFeatures()),
                       ('linear', LinearRegression())])
# Fit the model
pipe_polyr.fit(X_train, y_train)
# predict using model
y_train_polyr = pipe_polyr.predict(X_train)
y_pred_polyr = pipe_polyr.predict(X_test)
# Get scores
MSE_polyr_train = mean_squared_error(y_train, y_train_polyr)
MSE_polyr_test = mean_squared_error(y_test, y_pred_polyr)
RMSE_polyr_train = np.sqrt(MSE_polyr_train)
RMSE_polyr_test = np.sqrt(MSE_polyr_test)
print('Polynomial Regression MSE train:', round(MSE_polyr_train, 2))
print('Polynomial Regression MSE test:', round(MSE_polyr_test, 2))
print('Polynomial Regression RMSE train:', round(RMSE_polyr_train, 2))
print('Polynomial Regression RMSE test:', round(RMSE_polyr_test, 2))
Polynomial Regression MSE train: 448634.32
Polynomial Regression MSE test: 479039.91
Polynomial Regression RMSE train: 669.8
Polynomial Regression RMSE test: 692.13
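For context, these errors are later compared against the naive and dummy baselines from the earlier baseline section. As a self-contained reminder, a minimal sketch of a mean-predicting dummy baseline (an assumption; the notebook's actual baseline code may differ):
from sklearn.dummy import DummyRegressor
# Baseline that always predicts the training-set mean of cnt
dummy = DummyRegressor(strategy='mean').fit(X_train, y_train)
RMSE_dummy_test = np.sqrt(mean_squared_error(y_test, dummy.predict(X_test)))
print('Dummy (mean) RMSE test:', round(RMSE_dummy_test, 2))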
# Note: PolynomialFeatures(degree=2) expands the 8 inputs into many terms (including a
# bias column), so zip truncates to the first 8 coefficients and the pairing with the
# original feature names is misleading; a corrected pairing is sketched below.
coeff = pipe_polyr.named_steps['linear'].coef_
print(list(zip(coeff, features_name)))
[(-24115144773.315323, 'season'), (169.34231277360865, 'yr'), (-854260142813.9939, 'mnth'), (383.68708888068795, 'holiday'), (1876942697980.039, 'weathersit'), (-142.00546350708385, 'temp'), (849.6071533818828, 'hum'), (-365.84366104206487, 'windspeed')]
pipe_polyr.named_steps['poly'].degree
2
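A more faithful pairing maps each coefficient to its expanded polynomial term. A sketch, assuming scikit-learn >= 1.0 for get_feature_names_out:
poly_names = pipe_polyr.named_steps['poly'].get_feature_names_out(features_name)
coef_by_term = pd.Series(pipe_polyr.named_steps['linear'].coef_, index=poly_names)
# Show the largest-magnitude terms first
print(coef_by_term.sort_values(key=abs, ascending=False).head(10))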
# Setup the pipeline
pipe_polyr = Pipeline([('scaler', StandardScaler()),
                       ('pca', PCA()),
                       ('poly', PolynomialFeatures()),
                       ('linear', LinearRegression())])
# Setup hyperparameters to search
params = {'poly__degree': [1, 2, 3, 4, 5],
          'poly__interaction_only': [False, True],
          'scaler': [StandardScaler(), RobustScaler(), None],
          'pca': [PCA(2), PCA(3), PCA(4), PCA(5), PCA(6), None]}
# Perform grid search for best parameters
polyr = GridSearchCV(pipe_polyr, param_grid=params, cv=5,
                     scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)
# Fit the model
polyr.fit(X_train, y_train)
# Get scores (with scoring='neg_root_mean_squared_error', score() returns the negative RMSE, hence the *-1)
print("Best Cross-Validation RMSE: {:.2f}".format(polyr.best_score_*-1))
print("Training RMSE: {:.2f}".format(polyr.score(X_train, y_train)*-1))
print("Testing RMSE: {:.2f}".format(polyr.score(X_test, y_test)*-1))
print("Training MSE: {:.2f}".format((polyr.score(X_train, y_train)*-1)**2))
print("Testing MSE: {:.2f}".format((polyr.score(X_test, y_test)*-1)**2))
print("Best Hyperparameters: {}".format(polyr.best_params_), "\n")
Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best Cross-Validation RMSE: 681.90
Training RMSE: 572.56
Testing RMSE: 668.46
Training MSE: 327830.24
Testing MSE: 446838.94
Best Hyperparameters: {'pca': PCA(n_components=5), 'poly__degree': 3, 'poly__interaction_only': False, 'scaler': RobustScaler()}
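Because refit=True by default, GridSearchCV refits the best pipeline on the full training set, so its steps can be inspected directly. For example, a quick sketch of the total variance retained by the chosen PCA(5):
best_pipe = polyr.best_estimator_
# Share of variance captured by the 5 retained principal components
print(best_pipe.named_steps['pca'].explained_variance_ratio_.sum())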
pd.DataFrame(polyr.cv_results_).sort_values('rank_test_score').head(10)
 | mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_pca | param_poly__degree | param_poly__interaction_only | param_scaler | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
103 | 0.023021 | 0.018423 | 0.002803 | 0.000749 | PCA(n_components=5) | 3 | False | RobustScaler() | {'pca': PCA(n_components=5), 'poly__degree': 3... | -699.422417 | -668.468640 | -650.357674 | -692.395412 | -698.856760 | -681.900181 | 19.390927 | 1 |
136 | 0.008007 | 0.000633 | 0.018216 | 0.019546 | PCA(n_components=6) | 3 | True | RobustScaler() | {'pca': PCA(n_components=6), 'poly__degree': 3... | -823.873378 | -672.741833 | -818.249687 | -793.157706 | -765.798005 | -774.764122 | 55.009047 | 2 |
106 | 0.016015 | 0.016027 | 0.002202 | 0.000400 | PCA(n_components=5) | 3 | True | RobustScaler() | {'pca': PCA(n_components=5), 'poly__degree': 3... | -847.425420 | -709.427975 | -811.614241 | -829.333999 | -754.314036 | -790.423134 | 51.139929 | 3 |
142 | 0.024222 | 0.019526 | 0.004404 | 0.002335 | PCA(n_components=6) | 4 | True | RobustScaler() | {'pca': PCA(n_components=6), 'poly__degree': 4... | -867.414479 | -683.230718 | -847.189281 | -879.867974 | -725.875431 | -800.715577 | 80.345990 | 4 |
112 | 0.020419 | 0.014007 | 0.015414 | 0.022377 | PCA(n_components=5) | 4 | True | RobustScaler() | {'pca': PCA(n_components=5), 'poly__degree': 4... | -854.849890 | -732.886398 | -800.242036 | -846.899536 | -771.127344 | -801.201041 | 45.910323 | 5 |
127 | 0.027024 | 0.023482 | 0.002603 | 0.000801 | PCA(n_components=6) | 2 | False | RobustScaler() | {'pca': PCA(n_components=6), 'poly__degree': 2... | -787.527862 | -675.306735 | -796.957585 | -756.668067 | -1004.140846 | -804.120219 | 108.792905 | 6 |
97 | 0.018817 | 0.015211 | 0.002602 | 0.000801 | PCA(n_components=5) | 2 | False | RobustScaler() | {'pca': PCA(n_components=5), 'poly__degree': 2... | -824.427024 | -716.005051 | -833.077474 | -805.084285 | -848.385208 | -805.395808 | 46.834506 | 7 |
130 | 0.009208 | 0.002484 | 0.024222 | 0.030434 | PCA(n_components=6) | 2 | True | RobustScaler() | {'pca': PCA(n_components=6), 'poly__degree': 2... | -826.358997 | -686.357752 | -868.215475 | -836.073438 | -820.847504 | -807.570633 | 62.788609 | 8 |
100 | 0.015614 | 0.015732 | 0.002802 | 0.001601 | PCA(n_components=5) | 2 | True | RobustScaler() | {'pca': PCA(n_components=5), 'poly__degree': 2... | -861.152297 | -739.166501 | -876.496525 | -849.084464 | -816.045648 | -828.389087 | 48.844821 | 9 |
126 | 0.019417 | 0.024333 | 0.002402 | 0.000490 | PCA(n_components=6) | 2 | False | StandardScaler() | {'pca': PCA(n_components=6), 'poly__degree': 2... | -850.578841 | -784.054450 | -863.420044 | -791.731181 | -870.073131 | -831.971529 | 36.612308 | 10 |
# Predict Values
y_pred_polyr = polyr.predict(X_test)
# Plot Predicted vs Actual
plt.figure(figsize=(12,10))
sns.scatterplot(x=y_test, y=y_pred_polyr, color = "darkorange")
p1 = max(max(y_pred_polyr), max(y_test))
p2 = min(min(y_pred_polyr), min(y_test))
plt.plot([p1, p2], [p1, p2], '-', color = "blueviolet")
plt.xlim(0,10000)
plt.xlabel('Actual')
plt.ylabel('Predictions')
plt.title('Predicted vs Actual (Polynomial Regression)', fontsize=16)
plt.show()
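A residual plot complements the predicted-vs-actual view by making systematic over- or under-prediction easier to spot. A minimal sketch using the same variables:
residuals = y_test - y_pred_polyr
plt.figure(figsize=(12, 6))
sns.scatterplot(x=y_pred_polyr, y=residuals, color="darkorange")
plt.axhline(0, color="blueviolet")
plt.xlabel('Predicted')
plt.ylabel('Residual')
plt.title('Residuals vs Predicted (Polynomial Regression)', fontsize=16)
plt.show()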
Both polynomial regressions performed better than the Naive and Dummy baselines, and the hyperparameter-tuned model performed best overall. As with the linear regression, the sign and magnitude of each coefficient indicate its relationship to the target variable; note, however, that in the tuned pipeline the coefficients apply to polynomial terms of the principal components rather than to the original features.