import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
bike_df_clean = pd.read_csv("bike_rental_clean.csv")
bike_df_clean.head(10)
| | season | yr | mnth | holiday | weathersit | temp | hum | windspeed | cnt |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 1 | 0 | 2 | 0.344167 | 0.805833 | 0.160446 | 985 |
| 1 | 1 | 0 | 1 | 0 | 2 | 0.363478 | 0.696087 | 0.248539 | 801 |
| 2 | 1 | 0 | 1 | 0 | 1 | 0.196364 | 0.437273 | 0.248309 | 1349 |
| 3 | 1 | 0 | 1 | 0 | 1 | 0.200000 | 0.590435 | 0.160296 | 1562 |
| 4 | 1 | 0 | 1 | 0 | 1 | 0.226957 | 0.436957 | 0.186900 | 1600 |
| 5 | 1 | 0 | 1 | 0 | 1 | 0.204348 | 0.518261 | 0.089565 | 1606 |
| 6 | 1 | 0 | 1 | 0 | 2 | 0.196522 | 0.498696 | 0.168726 | 1510 |
| 7 | 1 | 0 | 1 | 0 | 2 | 0.165000 | 0.535833 | 0.266804 | 959 |
| 8 | 1 | 0 | 1 | 0 | 1 | 0.138333 | 0.434167 | 0.361950 | 822 |
| 9 | 1 | 0 | 1 | 0 | 1 | 0.150833 | 0.482917 | 0.223267 | 1321 |
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   season      731 non-null    int64
 1   yr          731 non-null    int64
 2   mnth        731 non-null    int64
 3   holiday     731 non-null    int64
 4   weathersit  731 non-null    int64
 5   temp        731 non-null    float64
 6   hum         731 non-null    float64
 7   windspeed   731 non-null    float64
 8   cnt         731 non-null    int64
dtypes: float64(3), int64(6)
memory usage: 51.5 KB
# Convert season, yr, mnth, holiday and weathersit to categorical variables
bike_df_clean['season'] = bike_df_clean['season'].astype('category')
bike_df_clean['mnth'] = bike_df_clean['mnth'].astype('category')
bike_df_clean['yr'] = bike_df_clean['yr'].astype('category')
bike_df_clean['holiday'] = bike_df_clean['holiday'].astype('category')
bike_df_clean['weathersit'] = bike_df_clean['weathersit'].astype('category')
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   season      731 non-null    category
 1   yr          731 non-null    category
 2   mnth        731 non-null    category
 3   holiday     731 non-null    category
 4   weathersit  731 non-null    category
 5   temp        731 non-null    float64
 6   hum         731 non-null    float64
 7   windspeed   731 non-null    float64
 8   cnt         731 non-null    int64
dtypes: category(5), float64(3), int64(1)
memory usage: 27.5 KB
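Casting to category changes the pandas dtype, but scikit-learn still receives the underlying numeric values, so the forest reads these columns as ordered numbers. A minimal sketch of an alternative (not used in this notebook) that one-hot encodes them instead, assuming the column names above:
# Sketch only: one-hot encode the categorical columns so the model
# does not read them as ordered values; numeric columns pass through.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
cat_cols = ['season', 'yr', 'mnth', 'holiday', 'weathersit']
encode = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
    remainder='passthrough'  # temp, hum, windspeed unchanged
)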
# Separate target
feature_names = bike_df_clean.columns[:8]
X = bike_df_clean[feature_names]
y = bike_df_clean["cnt"]
print("The features are", X.columns)
print("The target variable is", y.name)
The features are Index(['season', 'yr', 'mnth', 'holiday', 'weathersit', 'temp', 'hum', 'windspeed'], dtype='object')
The target variable is cnt
# Split into training and test sets, fixing random_state for reproducibility
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
print('Training dataset: X_train=', X_train.shape, ', y_train', y_train.shape)
print('Testing dataset: X_test=', X_test.shape, ', y_test', y_test.shape)
Training dataset: X_train= (548, 8) , y_train (548,)
Testing dataset: X_test= (183, 8) , y_test (183,)
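The conclusion below compares against naive and dummy baselines computed elsewhere in the notebook. For reference, a minimal sketch of a mean-predicting baseline (illustrative only, not part of the original run):
# Sketch only: a mean-predicting baseline for later comparison
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
dummy = DummyRegressor(strategy='mean').fit(X_train, y_train)
rmse_dummy = np.sqrt(mean_squared_error(y_test, dummy.predict(X_test)))
print('Dummy (mean) RMSE testing set:', round(rmse_dummy, 2))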
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
# Set up the pipeline (note: tree ensembles are insensitive to feature
# scaling, so the scaler here mainly illustrates the pipeline pattern)
pipe_rfr = Pipeline([('scaler', StandardScaler()),
                     ('rfc', RandomForestRegressor(n_estimators=1000, random_state=0))
                     ])
# Fit the model
pipe_rfr.fit(X_train, y_train)
# Predict on the training and test sets
y_pred_rfr_train = pipe_rfr.predict(X_train)
y_pred_rfr_test = pipe_rfr.predict(X_test)
# Get scores
RMSE_rfr_train = np.sqrt(mean_squared_error(y_train, y_pred_rfr_train))
RMSE_rfr_test = np.sqrt(mean_squared_error(y_test, y_pred_rfr_test))
MSE_rfr_train = mean_squared_error(y_train, y_pred_rfr_train)
MSE_rfr_test = mean_squared_error(y_test, y_pred_rfr_test)
print('Random Forest RMSE training set:', round(RMSE_rfr_train, 2))
print('Random Forest RMSE testing set:', round(RMSE_rfr_test, 2))
print('Random Forest MSE training set:', round(MSE_rfr_train, 2))
print('Random Forest MSE testing set:', round(MSE_rfr_test, 2))
Random Forest RMSE training set: 250.16
Random Forest RMSE testing set: 754.92
Random Forest MSE training set: 62581.65
Random Forest MSE testing set: 569902.11
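The large gap between training and testing RMSE suggests overfitting, which motivates the hyperparameter search below. To see which inputs the forest leans on, the fitted regressor inside the pipeline exposes impurity-based importances. A minimal sketch, assuming the pipeline and step name ('rfc') defined above:
# Sketch only: impurity-based feature importances from the fitted forest
importances = pd.Series(
    pipe_rfr.named_steps['rfc'].feature_importances_,
    index=X_train.columns
).sort_values(ascending=False)
print(importances)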
# Set up a second pipeline; the scaler choice is tuned as a hyperparameter below
pipe_rfr = Pipeline([('scaler', RobustScaler()),
('rfc', RandomForestRegressor())
])
# Define the hyperparameter grid ('auto' for max_features was removed in
# scikit-learn 1.3; 1.0, i.e. all features, is the equivalent here)
params = {
    'scaler': [RobustScaler(), StandardScaler()],
    'rfc__n_estimators': [200, 250, 300, 350, 400, 450, 500, 550, 600],
    'rfc__max_features': [1.0, 'sqrt', 'log2'],
    'rfc__max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
}
# Perform grid search for best parameters
rfr_hyper = GridSearchCV(pipe_rfr, param_grid=params, cv=5,
scoring='neg_root_mean_squared_error', n_jobs=-1, return_train_score = True)
# Fit the model and print the best hyperparameters
rfr_hyper.fit(X_train, y_train)
print("Best Cross-Validation Accuracy: {:.2f}".format(rfr_hyper.best_score_*-1))
print("Training RMSE: {:.2f}".format(rfr_hyper.score(X_train, y_train)*-1))
print("Testing RMSE: {:.2f}".format(rfr_hyper.score(X_test, y_test)*-1))
print("Training MSE: {:.2f}".format((rfr_hyper.score(X_train, y_train)*-1)**2))
print("Testing MSE: {:.2f}".format((rfr_hyper.score(X_test, y_test)*-1)**2))
print("Best Hyperparameters: {}".format(rfr_hyper.best_params_), "\n")
Best Cross-Validation RMSE: 676.59
Training RMSE: 255.63
Testing RMSE: 702.04
Training MSE: 65345.75
Testing MSE: 492858.47
Best Hyperparameters: {'rfc__max_depth': 14, 'rfc__max_features': 'log2', 'rfc__n_estimators': 200, 'scaler': StandardScaler()}
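GridSearchCV also records the full sweep in cv_results_, which can be ranked to see how close the runner-up candidates were. A minimal sketch (mean_test_score is negative RMSE, so higher is better):
# Sketch only: rank the grid-search candidates by cross-validated score
cv_results = pd.DataFrame(rfr_hyper.cv_results_)
cols = ['param_rfc__n_estimators', 'param_rfc__max_depth',
        'param_rfc__max_features', 'mean_test_score']
print(cv_results[cols].sort_values('mean_test_score', ascending=False).head())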
# Predict Values
y_pred_rfr = rfr_hyper.predict(X_test)
# Plot predicted vs actual
plt.figure(figsize=(12,10))
sns.scatterplot(x=y_test, y=y_pred_rfr, color="darkorange")
p1 = max(max(y_pred_rfr), max(y_test))
p2 = min(min(y_pred_rfr), min(y_test))
plt.plot([p1, p2], [p1, p2], '-', color="blueviolet")  # identity line (y = x)
plt.xlim(0,10000)
plt.xlabel('Actual')
plt.ylabel('Predictions')
plt.title('Predicted vs Actual (Random Forest)', fontsize=16)
plt.show()
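A residual plot is a useful companion to the scatter above, since any structure left in the residuals points to signal the model missed. A minimal sketch using the same predictions:
# Sketch only: residuals vs predictions for the tuned forest
residuals = y_test - y_pred_rfr
plt.figure(figsize=(12,6))
sns.scatterplot(x=y_pred_rfr, y=residuals, color="darkorange")
plt.axhline(0, color="blueviolet")
plt.xlabel('Predicted')
plt.ylabel('Residual (actual - predicted)')
plt.title('Residuals vs Predicted (Random Forest)', fontsize=16)
plt.show()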
Both models performed better than the naive and dummy baselines. The hyperparameter-tuned model achieved the lowest RMSE and MSE on the test set.