import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load the cleaned bike-rental dataset from the current working directory.
bike_df_clean = pd.read_csv("bike_rental_clean.csv")
# Preview the first ten rows.
# NOTE(review): as a plain script this bare expression has no visible effect;
# presumably it relies on notebook cell display -- confirm.
bike_df_clean.head(10)


# Summarise column names, non-null counts and dtypes.
bike_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      731 non-null    int64  
 1   yr          731 non-null    int64  
 2   mnth        731 non-null    int64  
 3   holiday     731 non-null    int64  
 4   weathersit  731 non-null    int64  
 5   temp        731 non-null    float64
 6   hum         731 non-null    float64
 7   windspeed   731 non-null    float64
 8   cnt         731 non-null    int64  
dtypes: float64(3), int64(6)
memory usage: 51.5 KB


# Convert the discrete columns (season, yr, mnth, holiday, weathersit) to
# pandas categoricals. The dataset loaded above has no weekday/workingday
# columns, so only these five are converted.
categorical_cols = ['season', 'yr', 'mnth', 'holiday', 'weathersit']
for col in categorical_cols:
    bike_df_clean[col] = bike_df_clean[col].astype('category')


# Confirm the dtype changes.
bike_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      731 non-null    category
 1   yr          731 non-null    category
 2   mnth        731 non-null    category
 3   holiday     731 non-null    category
 4   weathersit  731 non-null    category
 5   temp        731 non-null    float64 
 6   hum         731 non-null    float64 
 7   windspeed   731 non-null    float64 
 8   cnt         731 non-null    int64   
dtypes: category(5), float64(3), int64(1)
memory usage: 27.5 KB


# Separate the feature matrix from the target.
# Features are selected by dropping the target column by name instead of
# slicing the first eight columns positionally, so the split stays correct if
# columns are added to or reordered in the CSV.
target_name = "cnt"
features_name = bike_df_clean.columns.drop(target_name)
X = bike_df_clean[features_name]
y = bike_df_clean[target_name]
print("The features are", X.columns)
print("The target variable is", y.name)

The features are Index(['season', 'yr', 'mnth', 'holiday', 'weathersit', 'temp', 'hum',
       'windspeed'],
      dtype='object')
The target variable is cnt


# Get train and test data using the same random_state value
from sklearn.model_selection import train_test_split


# Hold out part of the data for testing. No test_size/train_size is passed, so
# scikit-learn's default split proportion applies; random_state=123 fixes the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=123)
# Report the shape of each partition as a sanity check.
print('Training dataset: X_train=', X_train.shape, ', y_train', y_train.shape)
print('Testing dataset: X_test=', X_test.shape, ', y_test', y_test.shape)

Training dataset: X_train= (548, 8) , y_train (548,)
Testing dataset: X_test= (183, 8) , y_test (183,)


from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error


# Set up the pipeline: standardise every feature, then fit ordinary least squares.
# NOTE(review): the categorical columns (season, yr, mnth, holiday, weathersit)
# go through StandardScaler like the continuous ones, so the model appears to
# treat their integer codes as ordinal values -- confirm this is intended, or
# consider one-hot encoding them instead.
pipe_linear = Pipeline([
    ('scaler', StandardScaler()),
    ('linear', LinearRegression()),
])

# Fit the scaler and the regression on the training split only.
pipe_linear.fit(X_train, y_train)

# Predict on both splits so train and test error can be compared.
y_train_linear = pipe_linear.predict(X_train)
y_pred_linear = pipe_linear.predict(X_test)

# Get scores. RMSE is derived from the MSE already computed rather than
# recomputing the mean squared error a second time.
MSE_linear_train = mean_squared_error(y_train, y_train_linear)
MSE_linear_test = mean_squared_error(y_test, y_pred_linear)
RMSE_linear_train = np.sqrt(MSE_linear_train)
RMSE_linear_test = np.sqrt(MSE_linear_test)
print('Linear Regression MSE train:', round(MSE_linear_train, 2))
print('Linear Regression MSE test:', round(MSE_linear_test, 2))
print('Linear Regression RMSE train:', round(RMSE_linear_train, 2))
print('Linear Regression RMSE test:', round(RMSE_linear_test, 2))

Linear Regression MSE train: 756295.89
Linear Regression MSE test: 863452.73
Linear Regression RMSE train: 869.65
Linear Regression RMSE test: 929.22


# Pull the fitted coefficients out of the pipeline's regression step and pair
# each one with its feature name. The regression step sits after the scaler
# step, so the coefficients are expressed on the scaled features.
linear_step = pipe_linear.named_steps['linear']
coeff = linear_step.coef_
print(list(zip(coeff, features_name)))

[(546.068033106878, 'season'), (962.6662724752064, 'yr'), (-131.19972257198287, 'mnth'), (-95.79665194008305, 'holiday'), (-297.5109707542201, 'weathersit'), (1026.3173911635515, 'temp'), (-202.62993051131315, 'hum'), (-243.681425812743, 'windspeed')]


# Predict values for the held-out test set.
y_pred_linear = pipe_linear.predict(X_test)

# Plot predicted vs actual on an explicit Axes object.
fig, ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(x=y_test, y=y_pred_linear, color="darkorange", ax=ax)

# Reference diagonal (prediction == actual) spanning the full range of values
# seen in either the predictions or the true targets.
p1 = max(y_pred_linear.max(), y_test.max())
p2 = min(y_pred_linear.min(), y_test.min())
ax.plot([p1, p2], [p1, p2], '-', color="blueviolet")

ax.set_xlim(0, 10000)
ax.set_xlabel('Actual')
ax.set_ylabel('Predictions')
ax.set_title('Predicted vs Actual (Linear Regression)', fontsize=16)
plt.show()

	season	mnth	weathersit	temp	hum	windspeed	cnt
0	1	1	2	0.344167	0.805833	0.160446	985
1	1	1	2	0.363478	0.696087	0.248539	801
2	1	1	1	0.196364	0.437273	0.248309	1349
3	1	1	1	0.200000	0.590435	0.160296	1562
4	1	1	1	0.226957	0.436957	0.186900	1600
5	1	1	1	0.204348	0.518261	0.089565	1606
6	1	1	2	0.196522	0.498696	0.168726	1510
7	1	1	2	0.165000	0.535833	0.266804	959
8	1	1	1	0.138333	0.434167	0.361950	822
9	1	1	1	0.150833	0.482917	0.223267	1321

Linear Regression

Load the data

Data Preparation

Data Segregation

Model Evaluation

Observations