import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the cleaned bike-rental dataset from the working directory.
bike_df_clean = pd.read_csv(filepath_or_buffer="bike_rental_clean.csv")

# Preview the first ten rows.
bike_df_clean.head(n=10)
| | season | yr | mnth | holiday | weathersit | temp | hum | windspeed | cnt |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 1 | 0 | 2 | 0.344167 | 0.805833 | 0.160446 | 985 |
1 | 1 | 0 | 1 | 0 | 2 | 0.363478 | 0.696087 | 0.248539 | 801 |
2 | 1 | 0 | 1 | 0 | 1 | 0.196364 | 0.437273 | 0.248309 | 1349 |
3 | 1 | 0 | 1 | 0 | 1 | 0.200000 | 0.590435 | 0.160296 | 1562 |
4 | 1 | 0 | 1 | 0 | 1 | 0.226957 | 0.436957 | 0.186900 | 1600 |
5 | 1 | 0 | 1 | 0 | 1 | 0.204348 | 0.518261 | 0.089565 | 1606 |
6 | 1 | 0 | 1 | 0 | 2 | 0.196522 | 0.498696 | 0.168726 | 1510 |
7 | 1 | 0 | 1 | 0 | 2 | 0.165000 | 0.535833 | 0.266804 | 959 |
8 | 1 | 0 | 1 | 0 | 1 | 0.138333 | 0.434167 | 0.361950 | 822 |
9 | 1 | 0 | 1 | 0 | 1 | 0.150833 | 0.482917 | 0.223267 | 1321 |
# Print a summary of the DataFrame (column names, non-null counts, dtypes,
# memory usage) before any dtype conversion is applied.
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 731 entries, 0 to 730 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 season 731 non-null int64 1 yr 731 non-null int64 2 mnth 731 non-null int64 3 holiday 731 non-null int64 4 weathersit 731 non-null int64 5 temp 731 non-null float64 6 hum 731 non-null float64 7 windspeed 731 non-null float64 8 cnt 731 non-null int64 dtypes: float64(3), int64(6) memory usage: 51.5 KB
# Cast the discrete-valued columns (season, mnth, yr, holiday, weathersit)
# to pandas' categorical dtype; every other column is left untouched.
categorical_columns = ['season', 'mnth', 'yr', 'holiday', 'weathersit']
for column in categorical_columns:
    bike_df_clean[column] = bike_df_clean[column].astype('category')

# Print the summary again to confirm the new dtypes.
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 731 entries, 0 to 730 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 season 731 non-null category 1 yr 731 non-null category 2 mnth 731 non-null category 3 holiday 731 non-null category 4 weathersit 731 non-null category 5 temp 731 non-null float64 6 hum 731 non-null float64 7 windspeed 731 non-null float64 8 cnt 731 non-null int64 dtypes: category(5), float64(3), int64(1) memory usage: 27.5 KB
*Note: PCA is applied later, in the model evaluation section, as a step of the modelling pipeline.*
# Separate the predictors (X) from the target (y).
# The feature columns are selected by excluding the target *by name* rather
# than by slicing the first N positions, so the split does not silently pick
# the wrong columns if the column order or count ever changes. Index.drop
# raises KeyError if the target column is missing.
target_name = "cnt"
features_name = bike_df_clean.columns.drop(target_name)
X = bike_df_clean[features_name]
y = bike_df_clean[target_name]

# Echo the selection so the notebook output documents what was used.
print("The features are", X.columns)
print("The target variable is", y.name)
The features are Index(['season', 'yr', 'mnth', 'holiday', 'weathersit', 'temp', 'hum', 'windspeed'], dtype='object') The target variable is cnt
# Split the data into training and test partitions. random_state is pinned
# so the split is reproducible (presumably the same value is used by the
# other model notebooks so results are comparable -- confirm). test_size is
# not passed, so scikit-learn's default applies.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, random_state=123)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each partition.
print('Training dataset: X_train=', X_train.shape, ', y_train', y_train.shape)
print('Testing dataset: X_test=', X_test.shape, ', y_test', y_test.shape)
Training dataset: X_train= (548, 8) , y_train (548,) Testing dataset: X_test= (183, 8) , y_test (183,)
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
# Baseline model: standardise the features, then fit a linear-kernel SVR
# with fixed hyperparameters.
# NOTE: gamma is a coefficient of the 'rbf', 'poly' and 'sigmoid' kernels,
# so gamma='auto' has no effect on this linear-kernel model.
baseline_steps = [
    ('scaler', StandardScaler()),
    ('svr', SVR(kernel='linear', C=100, gamma='auto')),
]
pipe_svr = Pipeline(baseline_steps)

# Fit on the training partition only.
pipe_svr.fit(X_train, y_train)

# Predict on both partitions so train and test error can be compared.
y_train_svr = pipe_svr.predict(X_train)
y_pred_svr = pipe_svr.predict(X_test)

# Mean squared error per partition; RMSE is its square root.
MSE_svr_train = mean_squared_error(y_train, y_train_svr)
MSE_svr_test = mean_squared_error(y_test, y_pred_svr)
RMSE_svr_train = np.sqrt(MSE_svr_train)
RMSE_svr_test = np.sqrt(MSE_svr_test)

# Print every metric rounded to two decimals.
for label, value in (
    ('SVR Regression RMSE train:', RMSE_svr_train),
    ('SVR Regression RMSE test:', RMSE_svr_test),
    ('SVR Regression MSE train:', MSE_svr_train),
    ('SVR Regression MSE test:', MSE_svr_test),
):
    print(label, round(value, 2))
SVR Regression RMSE train: 877.17 SVR Regression RMSE test: 932.4 SVR Regression MSE train: 769432.66 SVR Regression MSE test: 869377.36
# Set up the pipeline. The scaler and PCA estimators given here are only
# placeholders: the grid below substitutes alternatives for both steps,
# including None, which disables the step.
pipe_svr = Pipeline([('scaler', RobustScaler()),
                     ('pca', PCA()),
                     ('svr', SVR())
                     ])

# Hyperparameters to search: the scaler and PCA steps themselves, plus the
# SVR kernel, regularisation strength C, epsilon and gamma.
params = {'svr__kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
          'scaler': [StandardScaler(), RobustScaler(), None],
          'svr__C': [10, 100, 1000, 2000],
          'svr__epsilon': [0.01, 0.1, 1.0],
          'svr__gamma': ['scale', 'auto'],
          'pca': [PCA(2), PCA(3), PCA(4), PCA(5), None]
          }

# Exhaustive grid search with 3-fold cross-validation on all available cores.
# The scorer is *negative* RMSE (scikit-learn maximises scores), so every
# score is negated before it is reported.
svr = GridSearchCV(pipe_svr, param_grid=params, cv=3,
                   scoring='neg_root_mean_squared_error', n_jobs=-1)

# Fit the search on the training partition.
svr.fit(X_train, y_train)

# Score each partition once and reuse the value for both RMSE and MSE.
train_rmse = -svr.score(X_train, y_train)
test_rmse = -svr.score(X_test, y_test)

# The cross-validation figure is an RMSE, not an accuracy, and is labelled so.
print("Best Cross-Validation RMSE: {:.2f}".format(-svr.best_score_))
print("Training RMSE: {:.2f}".format(train_rmse))
print("Test RMSE: {:.2f}".format(test_rmse))
print("Training MSE: {:.2f}".format(train_rmse ** 2))
print("Test MSE: {:.2f}".format(test_rmse ** 2))
print("Best Hyperparameters: {}".format(svr.best_params_), "\n")
Best Cross-Validation Accuracy: 704.44 Training RMSE: 543.05 Test RMSE: 657.99 Training MSE: 294901.50 Test MSE: 432945.99 Best Hyperparameters: {'pca': None, 'scaler': RobustScaler(), 'svr__C': 2000, 'svr__epsilon': 1.0, 'svr__gamma': 'scale', 'svr__kernel': 'rbf'}
# Predict the target for the held-out test partition with the tuned model.
y_pred_svr = svr.predict(X_test)

# Scatter the predictions against the actual values on a single Axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(x=y_test, y=y_pred_svr, color="darkorange", ax=ax)

# Diagonal reference line (prediction == actual), spanning the overall
# maximum and minimum across predicted and actual values.
p1 = max(y_pred_svr.max(), y_test.max())
p2 = min(y_pred_svr.min(), y_test.min())
ax.plot([p1, p2], [p1, p2], '-', color="blueviolet")

# Axis range, labels and title.
ax.set_xlim(0, 10000)
ax.set_xlabel('Actual')
ax.set_ylabel('Predictions')
ax.set_title('Predicted vs Actual (Support Vector Machine Regression)', fontsize=16)
plt.show()
Both Support Vector Machine Regression models performed better than the Naive and Dummy baselines. The model with hyperparameter tuning had the best performance of all the models, which means it is the model that predicted the number of bike rentals most reliably.
# Save the fitted grid-search object (which wraps the tuned model) to disk.
import pickle

filename = 'SupportVectorRegression_tuned.sav'

# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the bare open() call previously left it open.
# NOTE: unpickling can execute arbitrary code, so only load this file from a
# trusted source.
with open(filename, 'wb') as model_file:
    pickle.dump(svr, model_file)