import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
bike_df_clean = pd.read_csv("bike_rental_clean.csv")
bike_df_clean.head(10)
| | season | yr | mnth | holiday | weathersit | temp | hum | windspeed | cnt |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 1 | 0 | 2 | 0.344167 | 0.805833 | 0.160446 | 985 |
| 1 | 1 | 0 | 1 | 0 | 2 | 0.363478 | 0.696087 | 0.248539 | 801 |
| 2 | 1 | 0 | 1 | 0 | 1 | 0.196364 | 0.437273 | 0.248309 | 1349 |
| 3 | 1 | 0 | 1 | 0 | 1 | 0.200000 | 0.590435 | 0.160296 | 1562 |
| 4 | 1 | 0 | 1 | 0 | 1 | 0.226957 | 0.436957 | 0.186900 | 1600 |
| 5 | 1 | 0 | 1 | 0 | 1 | 0.204348 | 0.518261 | 0.089565 | 1606 |
| 6 | 1 | 0 | 1 | 0 | 2 | 0.196522 | 0.498696 | 0.168726 | 1510 |
| 7 | 1 | 0 | 1 | 0 | 2 | 0.165000 | 0.535833 | 0.266804 | 959 |
| 8 | 1 | 0 | 1 | 0 | 1 | 0.138333 | 0.434167 | 0.361950 | 822 |
| 9 | 1 | 0 | 1 | 0 | 1 | 0.150833 | 0.482917 | 0.223267 | 1321 |
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   season      731 non-null    int64
 1   yr          731 non-null    int64
 2   mnth        731 non-null    int64
 3   holiday     731 non-null    int64
 4   weathersit  731 non-null    int64
 5   temp        731 non-null    float64
 6   hum         731 non-null    float64
 7   windspeed   731 non-null    float64
 8   cnt         731 non-null    int64
dtypes: float64(3), int64(6)
memory usage: 51.5 KB
# Convert season, yr, mnth, holiday and weathersit to categorical variables
bike_df_clean['season'] = bike_df_clean['season'].astype('category')
bike_df_clean['mnth'] = bike_df_clean['mnth'].astype('category')
bike_df_clean['yr'] = bike_df_clean['yr'].astype('category')
bike_df_clean['holiday'] = bike_df_clean['holiday'].astype('category')
bike_df_clean['weathersit'] = bike_df_clean['weathersit'].astype('category')
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   season      731 non-null    category
 1   yr          731 non-null    category
 2   mnth        731 non-null    category
 3   holiday     731 non-null    category
 4   weathersit  731 non-null    category
 5   temp        731 non-null    float64
 6   hum         731 non-null    float64
 7   windspeed   731 non-null    float64
 8   cnt         731 non-null    int64
dtypes: category(5), float64(3), int64(1)
memory usage: 27.5 KB
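The same conversion can be written more compactly as a loop over the categorical column names; a minimal sketch, equivalent to the cell above (it assumes bike_df_clean is already loaded):
# Loop-based alternative to the column-by-column conversion
categorical_cols = ['season', 'yr', 'mnth', 'holiday', 'weathersit']
for col in categorical_cols:
    bike_df_clean[col] = bike_df_clean[col].astype('category')
Note the memory usage drops from 51.5 KB to 27.5 KB after the conversion, since category columns store integer codes plus a small lookup table.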
# Separate target
features_name = bike_df_clean.columns[:8]
X = bike_df_clean[features_name]
y = bike_df_clean["cnt"]
print("The features are", X.columns)
print("The target variable is", y.name)
The features are Index(['season', 'yr', 'mnth', 'holiday', 'weathersit', 'temp', 'hum',
'windspeed'],
dtype='object')
The target variable is cnt
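An equivalent way to separate features and target, without relying on the target being the last column, is to drop it by name; a small sketch under that assumption:
# Alternative feature/target split that does not depend on column order
X = bike_df_clean.drop(columns='cnt')
y = bike_df_clean['cnt']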
# Get train and test data using the same random_state value for reproducibility
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
print('Training dataset: X_train=', X_train.shape, ', y_train', y_train.shape)
print('Testing dataset: X_test=', X_test.shape, ', y_test', y_test.shape)
Training dataset: X_train= (548, 8) , y_train (548,)
Testing dataset: X_test= (183, 8) , y_test (183,)
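With no test_size argument, train_test_split defaults to a 75/25 split, which is where the 548/183 row counts come from; spelling the default out explicitly (same seed assumed) would look like this:
# Explicit equivalent of the default split: 731 rows -> 548 train / 183 test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123)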
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
# Setup the pipeline
pipe_linear = Pipeline([('scaler', StandardScaler()),
('linear', LinearRegression()),
])
# Fit the model
pipe_linear.fit(X_train, y_train)
# predict using model
y_train_linear = pipe_linear.predict(X_train)
y_pred_linear = pipe_linear.predict(X_test)
# Get scores
MSE_linear_train = mean_squared_error(y_train, y_train_linear)
MSE_linear_test = mean_squared_error(y_test, y_pred_linear)
RMSE_linear_train = np.sqrt(MSE_linear_train)
RMSE_linear_test = np.sqrt(MSE_linear_test)
print('Linear Regression MSE train:', round(MSE_linear_train, 2))
print('Linear Regression MSE test:', round(MSE_linear_test, 2))
print('Linear Regression RMSE train:', round(RMSE_linear_train, 2))
print('Linear Regression RMSE test:', round(RMSE_linear_test, 2))
Linear Regression MSE train: 756295.89
Linear Regression MSE test: 863452.73
Linear Regression RMSE train: 869.65
Linear Regression RMSE test: 929.22
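The gap between train RMSE (~870) and test RMSE (~929) can also be checked with cross-validation on the training set; a minimal sketch, reusing the same pipeline object:
from sklearn.model_selection import cross_val_score
# 5-fold CV RMSE on the training data; sklearn returns negated scores by convention
cv_rmse = -cross_val_score(pipe_linear, X_train, y_train,
                           scoring='neg_root_mean_squared_error', cv=5)
print('CV RMSE mean:', round(cv_rmse.mean(), 2), '+/-', round(cv_rmse.std(), 2))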
# Coefficients are on the standardized (z-scored) features because of the StandardScaler step
coeff = pipe_linear.named_steps['linear'].coef_
print(list(zip(coeff, features_name)))
[(546.068033106878, 'season'), (962.6662724752064, 'yr'), (-131.19972257198287, 'mnth'), (-95.79665194008305, 'holiday'), (-297.5109707542201, 'weathersit'), (1026.3173911635515, 'temp'), (-202.62993051131315, 'hum'), (-243.681425812743, 'windspeed')]
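For readability, the same coefficients can be paired with the feature names in a pandas Series and sorted by absolute magnitude; a small sketch (the key argument to sort_values requires a reasonably recent pandas):
# View coefficients as a named, magnitude-sorted Series
coef_series = pd.Series(coeff, index=features_name).sort_values(key=abs, ascending=False)
print(coef_series)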
# Predict Values
y_pred_linear = pipe_linear.predict(X_test)
# Plot Predicted vs Actual
plt.figure(figsize=(12,10))
sns.scatterplot(x=y_test, y=y_pred_linear, color = "darkorange")
p1 = max(max(y_pred_linear), max(y_test))
p2 = min(min(y_pred_linear), min(y_test))
plt.plot([p1, p2], [p1, p2], '-', color = "blueviolet")
plt.xlim(0,10000)
plt.xlabel('Actual')
plt.ylabel('Predictions')
plt.title('Predicted vs Actual (Linear Regression)', fontsize=16)
plt.show()
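A residual plot is a useful companion to the predicted-vs-actual view for spotting systematic under- or over-prediction; a minimal sketch using the same test predictions:
# Residuals vs predictions for the linear model
residuals = y_test - y_pred_linear
plt.figure(figsize=(12, 6))
sns.scatterplot(x=y_pred_linear, y=residuals, color="darkorange")
plt.axhline(0, color="blueviolet")
plt.xlabel('Predictions')
plt.ylabel('Residuals (Actual - Predicted)')
plt.title('Residuals vs Predictions (Linear Regression)', fontsize=16)
plt.show()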
The linear regression outperformed the naive and dummy baselines. Because the pipeline standardizes the features before fitting, each coefficient reflects the change in predicted rentals per one-standard-deviation change in that feature, holding the others fixed. For example, humidity (hum) had a coefficient of about -202, meaning that higher humidity is associated with fewer bike rentals, while temperature (temp, about +1026) and year (yr, about +963) are the strongest positive drivers.
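For reference, a dummy baseline like the one mentioned above can be reproduced with scikit-learn's DummyRegressor, which simply predicts the training mean; a minimal sketch (the exact baseline setup used earlier may have differed):
from sklearn.dummy import DummyRegressor
# Mean-prediction baseline for comparison against the linear model's test RMSE
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)
print('Dummy RMSE test:', round(np.sqrt(mean_squared_error(y_test, y_pred_dummy)), 2))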