import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the cleaned bike-rental data from disk and preview its first ten rows.
bike_df_clean = pd.read_csv(filepath_or_buffer="bike_rental_clean.csv")
bike_df_clean.head(n=10)
season | yr | mnth | holiday | weathersit | temp | hum | windspeed | cnt | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 1 | 0 | 2 | 0.344167 | 0.805833 | 0.160446 | 985 |
1 | 1 | 0 | 1 | 0 | 2 | 0.363478 | 0.696087 | 0.248539 | 801 |
2 | 1 | 0 | 1 | 0 | 1 | 0.196364 | 0.437273 | 0.248309 | 1349 |
3 | 1 | 0 | 1 | 0 | 1 | 0.200000 | 0.590435 | 0.160296 | 1562 |
4 | 1 | 0 | 1 | 0 | 1 | 0.226957 | 0.436957 | 0.186900 | 1600 |
5 | 1 | 0 | 1 | 0 | 1 | 0.204348 | 0.518261 | 0.089565 | 1606 |
6 | 1 | 0 | 1 | 0 | 2 | 0.196522 | 0.498696 | 0.168726 | 1510 |
7 | 1 | 0 | 1 | 0 | 2 | 0.165000 | 0.535833 | 0.266804 | 959 |
8 | 1 | 0 | 1 | 0 | 1 | 0.138333 | 0.434167 | 0.361950 | 822 |
9 | 1 | 0 | 1 | 0 | 1 | 0.150833 | 0.482917 | 0.223267 | 1321 |
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 731 entries, 0 to 730 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 season 731 non-null int64 1 yr 731 non-null int64 2 mnth 731 non-null int64 3 holiday 731 non-null int64 4 weathersit 731 non-null int64 5 temp 731 non-null float64 6 hum 731 non-null float64 7 windspeed 731 non-null float64 8 cnt 731 non-null int64 dtypes: float64(3), int64(6) memory usage: 51.5 KB
# Convert season, mnth, yr, holiday and weathersit to categorical variables.
# The remaining columns (temp, hum, windspeed, cnt) are left untouched.
for column_name in ('season', 'mnth', 'yr', 'holiday', 'weathersit'):
    bike_df_clean[column_name] = bike_df_clean[column_name].astype('category')
# Print the summary again to check the converted dtypes.
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 731 entries, 0 to 730 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 season 731 non-null category 1 yr 731 non-null category 2 mnth 731 non-null category 3 holiday 731 non-null category 4 weathersit 731 non-null category 5 temp 731 non-null float64 6 hum 731 non-null float64 7 windspeed 731 non-null float64 8 cnt 731 non-null int64 dtypes: category(5), float64(3), int64(1) memory usage: 27.5 KB
# Separate target
# Select the features by excluding the target column by name, so the split
# does not depend on "cnt" being the ninth column of the frame.
features_name = bike_df_clean.columns.drop("cnt")
X = bike_df_clean[features_name]
y = bike_df_clean["cnt"]
print("The features are", X.columns)
print("The target variable is", y.name)
The features are Index(['season', 'yr', 'mnth', 'holiday', 'weathersit', 'temp', 'hum', 'windspeed'], dtype='object') The target variable is cnt
# Split features and target into train and test sets. random_state is fixed
# so the same split is obtained on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
)
print(
    'Training dataset: X_train=', X_train.shape,
    ', y_train', y_train.shape,
)
print(
    'Testing dataset: X_test=', X_test.shape,
    ', y_test', y_test.shape,
)
Training dataset: X_train= (548, 8) , y_train (548,) Testing dataset: X_test= (183, 8) , y_test (183,)
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
# Set up the pipeline: standardize every feature, then fit a linear regression.
# NOTE(review): the category-typed columns are passed through StandardScaler as
# plain numbers rather than being one-hot encoded -- confirm this is intended.
pipe_linear = Pipeline([
    ('scaler', StandardScaler()),
    ('linear', LinearRegression()),
])
# Fit the model on the training split
pipe_linear.fit(X_train, y_train)
# Predict on both splits so training and test error can be compared
y_train_linear = pipe_linear.predict(X_train)
y_pred_linear = pipe_linear.predict(X_test)
# Get scores. RMSE is the square root of the MSE computed just above, so it is
# derived from that value instead of calling mean_squared_error a second time.
MSE_linear_train = mean_squared_error(y_train, y_train_linear)
MSE_linear_test = mean_squared_error(y_test, y_pred_linear)
RMSE_linear_train = np.sqrt(MSE_linear_train)
RMSE_linear_test = np.sqrt(MSE_linear_test)
print('Linear Regression MSE train:', round(MSE_linear_train, 2))
print('Linear Regression MSE test:', round(MSE_linear_test, 2))
print('Linear Regression RMSE train:', round(RMSE_linear_train, 2))
print('Linear Regression RMSE test:', round(RMSE_linear_test, 2))
Linear Regression MSE train: 756295.89 Linear Regression MSE test: 863452.73 Linear Regression RMSE train: 869.65 Linear Regression RMSE test: 929.22
# Pull the fitted coefficients out of the pipeline's 'linear' step and print
# them as (coefficient, feature name) pairs.
linear_step = pipe_linear.named_steps['linear']
coeff = linear_step.coef_
print([*zip(coeff, features_name)])
[(546.068033106878, 'season'), (962.6662724752064, 'yr'), (-131.19972257198287, 'mnth'), (-95.79665194008305, 'holiday'), (-297.5109707542201, 'weathersit'), (1026.3173911635515, 'temp'), (-202.62993051131315, 'hum'), (-243.681425812743, 'windspeed')]
# Predict Values (same call as in the fitting cell; repeated in this cell)
y_pred_linear = pipe_linear.predict(X_test)
# Plot Predicted vs Actual
plt.figure(figsize=(12, 10))
sns.scatterplot(x=y_test, y=y_pred_linear, color="darkorange")
# Endpoints of the y = x reference line: the largest and smallest value seen in
# either the predictions or the actual targets. The vectorized .max()/.min()
# reductions replace element-by-element iteration with the Python builtins.
p1 = max(y_pred_linear.max(), y_test.max())
p2 = min(y_pred_linear.min(), y_test.min())
# Points on this line are perfect predictions (predicted == actual).
plt.plot([p1, p2], [p1, p2], '-', color="blueviolet")
plt.xlim(0, 10000)
plt.xlabel('Actual')
plt.ylabel('Predictions')
plt.title('Predicted vs Actual (Linear Regression)', fontsize=16)
plt.show()
The linear regression performed better than the Naive and Dummy baselines. The coefficients of the linear regression indicate the effect of each variable on the target variable. For example, the variable humidity (hum) had a coefficient of about -203, which means that humidity is negatively associated with the number of bike rentals: holding the other features fixed, the higher the humidity, the lower the predicted bike rentals. Because the features were standardized before fitting, each coefficient is the change in predicted rentals for a one-standard-deviation increase in that feature.