In this case study, Cyclistic is a fictional company that offers a bike-share program featuring more than 5,800 bicycles and 600 docking stations across Chicago. The director of marketing, Lily Moreno, wants to convert casual riders into annual members in order to generate more revenue for the company. To support this goal, I have been asked to analyze how annual members and casual riders use Cyclistic bikes differently, using a dataset spanning one year to identify trends and generate insights. Casual riders are those who opted for single-ride or full-day passes, whereas members are those who purchased an annual subscription.
The data for this analysis is collected by Cyclistic itself and spans from April 2020 to April 2021. For this case study, the dataset is obtained directly from Cyclistic's own servers as CSV files inside zip folders. (For the purpose of this exercise, this is the appropriate assumption.) The real origin of this dataset is Motivate International Inc., which provides it under this license. As this is public data, rider privacy is protected: nothing in it would allow me to trace a trip back to the original rider.
As this is internal, first-party data, it is reasonable to assume it is unbiased and credible. Now, let's proceed to the process phase.
For the entirety of this analysis, Python is used, as the dataset has more than 3 million rows and would be unwieldy in spreadsheet software.
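Since the monthly extracts are plain CSV files inside zip folders, pandas can read and concatenate them into a single frame. A minimal sketch of the idea, using in-memory `StringIO` objects as stand-ins for two monthly files (the rows and file contents here are hypothetical; the real extracts have more columns):

```python
import io

import pandas as pd

# Stand-ins for two monthly CSV extracts (hypothetical rows, not real trips).
april = io.StringIO("ride_id,start_station_id\nA1,86\nA2,TA1305000030\n")
may = io.StringIO("ride_id,start_station_id\nB1,503\n")

# Reading the station id columns as str avoids mixed-dtype warnings when one
# month has numeric ids and another has alphanumeric ids.
frames = [pd.read_csv(f, dtype={"start_station_id": str}) for f in (april, may)]
combined = pd.concat(frames, ignore_index=True)
print(len(combined))  # 3
```

With the real data, the same pattern applies with `pd.read_csv` pointed at each extracted file path.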
# Importing the necessary packages
import os
# missingno is a useful package for visualizing missing values
import missingno as msno
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
%matplotlib inline
#Mixed dtypes in start_station_id and end_station_id across months; the quick fix is to read both as strings.
os.chdir("Cyclistic_CSV_Datasets")
tripDf = pd.read_csv('First_Tidy_trip_data.csv', dtype = {'start_station_id': str, 'end_station_id': str})
tripDf.head()
ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A847FADBBC638E45 | docked_bike | 2020-04-26 17:45:14 | 2020-04-26 18:12:03 | Eckhart Park | 86 | Lincoln Ave & Diversey Pkwy | 152.0 | 41.8964 | -87.6610 | 41.9322 | -87.6586 | member |
1 | 5405B80E996FF60D | docked_bike | 2020-04-17 17:08:54 | 2020-04-17 17:17:03 | Drake Ave & Fullerton Ave | 503 | Kosciuszko Park | 499.0 | 41.9244 | -87.7154 | 41.9306 | -87.7238 | member |
2 | 5DD24A79A4E006F4 | docked_bike | 2020-04-01 17:54:13 | 2020-04-01 18:08:36 | McClurg Ct & Erie St | 142 | Indiana Ave & Roosevelt Rd | 255.0 | 41.8945 | -87.6179 | 41.8679 | -87.6230 | member |
3 | 2A59BBDF5CDBA725 | docked_bike | 2020-04-07 12:50:19 | 2020-04-07 13:02:31 | California Ave & Division St | 216 | Wood St & Augusta Blvd | 657.0 | 41.9030 | -87.6975 | 41.8992 | -87.6722 | member |
4 | 27AD306C119C6158 | docked_bike | 2020-04-18 10:22:59 | 2020-04-18 11:15:54 | Rush St & Hubbard St | 125 | Sheridan Rd & Lawrence Ave | 323.0 | 41.8902 | -87.6262 | 41.9695 | -87.6547 | casual |
#Visualize missing values
msno.matrix(tripDf)
plt.show()
Ensuring that the columns holding datetime information are properly stored in datetime format:
#Convert to datetime
tripDf['started_at'] = pd.to_datetime(tripDf['started_at'], format="%Y-%m-%d %H:%M:%S")
tripDf['ended_at'] = pd.to_datetime(tripDf['ended_at'], format="%Y-%m-%d %H:%M:%S")
tripDf['ride_duration'] = tripDf['ended_at'] - tripDf['started_at']
tripDf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3826978 entries, 0 to 3826977
Data columns (total 14 columns):
 #   Column              Dtype
---  ------              -----
 0   ride_id             object
 1   rideable_type       object
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object
 5   start_station_id    object
 6   end_station_name    object
 7   end_station_id      object
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object
 13  ride_duration       timedelta64[ns]
dtypes: datetime64[ns](2), float64(4), object(7), timedelta64[ns](1)
memory usage: 408.8+ MB
missing_end = tripDf[tripDf['end_station_name'].isna()]
missing_end.head(15)
ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | ride_duration | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1001 | 5E2BD03BCA180FBA | docked_bike | 2020-04-07 11:53:08 | 2020-04-07 12:28:35 | Wells St & Concord Ln | 289 | NaN | NaN | 41.9121 | -87.6347 | NaN | NaN | member | 0 days 00:35:27 |
1864 | BD5813A6101E9BF4 | docked_bike | 2020-04-20 12:24:48 | 2020-04-20 12:29:46 | Racine Ave & Wrightwood Ave | 343 | NaN | NaN | 41.9289 | -87.6590 | NaN | NaN | member | 0 days 00:04:58 |
2167 | 228691849C2081EE | docked_bike | 2020-04-16 08:41:56 | 2020-04-16 11:33:48 | Racine Ave & 18th St | 15 | NaN | NaN | 41.8582 | -87.6565 | NaN | NaN | member | 0 days 02:51:52 |
2458 | ED7750BCEEE87174 | docked_bike | 2020-04-09 15:33:45 | 2020-04-09 16:34:54 | Morgan Ave & 14th Pl | 137 | NaN | NaN | 41.8624 | -87.6511 | NaN | NaN | casual | 0 days 01:01:09 |
3834 | 1E00C457DCDA0835 | docked_bike | 2020-04-25 06:52:02 | 2020-04-25 07:17:54 | Lake Shore Dr & Wellington Ave | 157 | NaN | NaN | 41.9367 | -87.6368 | NaN | NaN | member | 0 days 00:25:52 |
5100 | 0F7201752882C449 | docked_bike | 2020-04-30 20:47:40 | 2020-04-30 21:19:00 | Ashland Ave & Chicago Ave | 350 | NaN | NaN | 41.8960 | -87.6677 | NaN | NaN | member | 0 days 00:31:20 |
5794 | 97C00C77F12AF5AE | docked_bike | 2020-04-10 11:54:44 | 2020-04-10 12:02:39 | Wells St & Huron St | 53 | NaN | NaN | 41.8947 | -87.6344 | NaN | NaN | member | 0 days 00:07:55 |
7251 | E0AFDC4D2358B027 | docked_bike | 2020-04-20 13:37:30 | 2020-04-20 14:27:48 | State St & Pearson St | 106 | NaN | NaN | 41.8974 | -87.6287 | NaN | NaN | casual | 0 days 00:50:18 |
7404 | 8BAB90CEE039C959 | docked_bike | 2020-04-28 16:21:26 | 2020-04-28 17:03:01 | Bissell St & Armitage Ave | 113 | NaN | NaN | 41.9184 | -87.6522 | NaN | NaN | casual | 0 days 00:41:35 |
9034 | 8BC76B362DA665B7 | docked_bike | 2020-04-19 13:23:39 | 2020-04-19 13:58:52 | Central Park Ave & North Ave | 508 | NaN | NaN | 41.9097 | -87.7166 | NaN | NaN | member | 0 days 00:35:13 |
9551 | CA182D7FDACC427D | docked_bike | 2020-04-08 12:44:18 | 2020-04-09 13:44:09 | Phillips Ave & 79th St | 579 | NaN | NaN | 41.7518 | -87.5652 | NaN | NaN | casual | 1 days 00:59:51 |
9578 | 74D33D6F25179262 | docked_bike | 2020-04-18 13:01:33 | 2020-04-18 14:20:45 | Wells St & Evergreen Ave | 291 | NaN | NaN | 41.9067 | -87.6348 | NaN | NaN | casual | 0 days 01:19:12 |
9779 | 72F15ADC62F9A5E6 | docked_bike | 2020-04-12 14:24:21 | 2020-04-12 14:51:00 | Lincoln Ave & Waveland Ave | 257 | NaN | NaN | 41.9488 | -87.6753 | NaN | NaN | member | 0 days 00:26:39 |
10125 | 2DE1FC1F7CC6AD71 | docked_bike | 2020-04-26 11:09:20 | 2020-04-26 11:20:18 | Ashland Ave & Wrightwood Ave | 166 | NaN | NaN | 41.9288 | -87.6685 | NaN | NaN | member | 0 days 00:10:58 |
10614 | A5A3A8807AFE36AC | docked_bike | 2020-04-18 15:21:28 | 2020-04-18 15:49:09 | Lincoln Ave & Belmont Ave | 131 | NaN | NaN | 41.9394 | -87.6684 | NaN | NaN | member | 0 days 00:27:41 |
missing_start = tripDf[tripDf['start_station_name'].isna()]
missing_start.head(15)
ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | ride_duration | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
630367 | 60742256DFFFCA29 | electric_bike | 2020-07-31 08:30:34 | 2020-07-31 08:57:54 | NaN | NaN | Racine Ave & 35th St | 367.0 | 41.90 | -87.69 | 41.830704 | -87.656085 | member | 0 days 00:27:20 |
642029 | EBBD4FE9C8A95116 | electric_bike | 2020-07-29 19:02:25 | 2020-07-29 19:22:40 | NaN | NaN | Western Ave & Walton St | 374.0 | 41.90 | -87.69 | 41.898404 | -87.686592 | member | 0 days 00:20:15 |
653657 | 976336C6499A7189 | electric_bike | 2020-07-30 22:02:45 | 2020-07-30 22:17:54 | NaN | NaN | Clarendon Ave & Leland Ave | 251.0 | 41.94 | -87.65 | 41.967841 | -87.649991 | member | 0 days 00:15:09 |
670853 | EC549CDABDE45F98 | electric_bike | 2020-07-31 15:54:41 | 2020-07-31 16:00:35 | NaN | NaN | NaN | NaN | 41.92 | -87.70 | 41.910000 | -87.680000 | member | 0 days 00:05:54 |
670856 | 70F53BFE303499AC | electric_bike | 2020-07-31 16:08:26 | 2020-07-31 16:15:29 | NaN | NaN | NaN | NaN | 41.91 | -87.68 | 41.920000 | -87.700000 | member | 0 days 00:07:03 |
670859 | 11814668BAB6FCA3 | electric_bike | 2020-07-31 12:56:53 | 2020-07-31 13:13:27 | NaN | NaN | Aberdeen St & Randolph St | 621.0 | 41.86 | -87.63 | 41.884041 | -87.654301 | member | 0 days 00:16:34 |
686820 | D66AC7ACF66F8FF6 | electric_bike | 2020-07-30 08:32:14 | 2020-07-30 08:40:04 | NaN | NaN | Canal St & Madison St | 174.0 | 41.87 | -87.65 | 41.881760 | -87.640163 | member | 0 days 00:07:50 |
709525 | 5196A992F09481DB | electric_bike | 2020-07-30 17:25:18 | 2020-07-30 17:25:35 | NaN | NaN | NaN | NaN | 41.90 | -87.62 | 41.900000 | -87.620000 | member | 0 days 00:00:17 |
709526 | 2DB97E499E497D3D | electric_bike | 2020-07-30 17:28:20 | 2020-07-30 17:44:40 | NaN | NaN | Financial Pl & Ida B Wells Dr | 89.0 | 41.90 | -87.62 | 41.874970 | -87.633277 | member | 0 days 00:16:20 |
709527 | 3C5018A592AB147C | electric_bike | 2020-07-31 18:40:31 | 2020-07-31 18:48:56 | NaN | NaN | Keystone Ave & Montrose Ave | 495.0 | 41.97 | -87.73 | 41.961081 | -87.729016 | member | 0 days 00:08:25 |
709528 | 836BE807C2FFC1DB | electric_bike | 2020-07-31 18:39:02 | 2020-07-31 18:39:13 | NaN | NaN | NaN | NaN | 41.97 | -87.73 | 41.970000 | -87.730000 | member | 0 days 00:00:11 |
718527 | D53BD878D7CF3DDA | electric_bike | 2020-07-30 17:17:45 | 2020-07-30 17:35:15 | NaN | NaN | NaN | NaN | 41.86 | -87.61 | 41.880000 | -87.620000 | member | 0 days 00:17:30 |
756172 | 821A0CD56410BE85 | electric_bike | 2020-07-31 01:10:40 | 2020-07-31 01:21:08 | NaN | NaN | Leavitt St & Belmont Ave | 664.0 | 41.91 | -87.68 | 41.939366 | -87.683256 | casual | 0 days 00:10:28 |
756190 | 60A3D6C3238E5D1B | electric_bike | 2020-07-29 15:38:31 | 2020-07-29 15:55:27 | NaN | NaN | Prairie Ave & 43rd St | 410.0 | 41.80 | -87.59 | 41.816691 | -87.619329 | casual | 0 days 00:16:56 |
756201 | FA92AAEFE87BD7EA | electric_bike | 2020-07-15 10:20:31 | 2020-07-15 10:20:57 | NaN | NaN | NaN | NaN | 41.89 | -87.68 | 41.890000 | -87.680000 | casual | 0 days 00:00:26 |
#Rows where all four station columns are missing at once
na_list = tripDf[
    tripDf['start_station_name'].isna() &
    tripDf['start_station_id'].isna() &
    tripDf['end_station_name'].isna() &
    tripDf['end_station_id'].isna()
]
na_list.shape
(86439, 14)
tripDf['day_borrowed'] = tripDf['started_at'].dt.day_name()
tripDf.head()
ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | ride_duration | day_borrowed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A847FADBBC638E45 | docked_bike | 2020-04-26 17:45:14 | 2020-04-26 18:12:03 | Eckhart Park | 86 | Lincoln Ave & Diversey Pkwy | 152.0 | 41.8964 | -87.6610 | 41.9322 | -87.6586 | member | 0 days 00:26:49 | Sunday |
1 | 5405B80E996FF60D | docked_bike | 2020-04-17 17:08:54 | 2020-04-17 17:17:03 | Drake Ave & Fullerton Ave | 503 | Kosciuszko Park | 499.0 | 41.9244 | -87.7154 | 41.9306 | -87.7238 | member | 0 days 00:08:09 | Friday |
2 | 5DD24A79A4E006F4 | docked_bike | 2020-04-01 17:54:13 | 2020-04-01 18:08:36 | McClurg Ct & Erie St | 142 | Indiana Ave & Roosevelt Rd | 255.0 | 41.8945 | -87.6179 | 41.8679 | -87.6230 | member | 0 days 00:14:23 | Wednesday |
3 | 2A59BBDF5CDBA725 | docked_bike | 2020-04-07 12:50:19 | 2020-04-07 13:02:31 | California Ave & Division St | 216 | Wood St & Augusta Blvd | 657.0 | 41.9030 | -87.6975 | 41.8992 | -87.6722 | member | 0 days 00:12:12 | Tuesday |
4 | 27AD306C119C6158 | docked_bike | 2020-04-18 10:22:59 | 2020-04-18 11:15:54 | Rush St & Hubbard St | 125 | Sheridan Rd & Lawrence Ave | 323.0 | 41.8902 | -87.6262 | 41.9695 | -87.6547 | casual | 0 days 00:52:55 | Saturday |
#Convert to Weekday or Weekend
tripDf['weekend_weekday'] = np.where(tripDf['started_at'].dt.dayofweek > 4, 'Weekend', 'Weekday')
tripDf.head()
ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | ride_duration | day_borrowed | weekend_weekday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A847FADBBC638E45 | docked_bike | 2020-04-26 17:45:14 | 2020-04-26 18:12:03 | Eckhart Park | 86 | Lincoln Ave & Diversey Pkwy | 152.0 | 41.8964 | -87.6610 | 41.9322 | -87.6586 | member | 0 days 00:26:49 | Sunday | Weekend |
1 | 5405B80E996FF60D | docked_bike | 2020-04-17 17:08:54 | 2020-04-17 17:17:03 | Drake Ave & Fullerton Ave | 503 | Kosciuszko Park | 499.0 | 41.9244 | -87.7154 | 41.9306 | -87.7238 | member | 0 days 00:08:09 | Friday | Weekday |
2 | 5DD24A79A4E006F4 | docked_bike | 2020-04-01 17:54:13 | 2020-04-01 18:08:36 | McClurg Ct & Erie St | 142 | Indiana Ave & Roosevelt Rd | 255.0 | 41.8945 | -87.6179 | 41.8679 | -87.6230 | member | 0 days 00:14:23 | Wednesday | Weekday |
3 | 2A59BBDF5CDBA725 | docked_bike | 2020-04-07 12:50:19 | 2020-04-07 13:02:31 | California Ave & Division St | 216 | Wood St & Augusta Blvd | 657.0 | 41.9030 | -87.6975 | 41.8992 | -87.6722 | member | 0 days 00:12:12 | Tuesday | Weekday |
4 | 27AD306C119C6158 | docked_bike | 2020-04-18 10:22:59 | 2020-04-18 11:15:54 | Rush St & Hubbard St | 125 | Sheridan Rd & Lawrence Ave | 323.0 | 41.8902 | -87.6262 | 41.9695 | -87.6547 | casual | 0 days 00:52:55 | Saturday | Weekend |
tripDf['rideable_type'].unique() # 'docked_bike', 'electric_bike', 'classic_bike'
#Title-case the values and drop the underscores for nicer plot labels
tripDf['rideable_type'] = tripDf['rideable_type'].str.title().str.replace('_', ' ')
tripDf.head(10)
ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | ride_duration | day_borrowed | weekend_weekday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A847FADBBC638E45 | Docked Bike | 2020-04-26 17:45:14 | 2020-04-26 18:12:03 | Eckhart Park | 86 | Lincoln Ave & Diversey Pkwy | 152.0 | 41.8964 | -87.6610 | 41.9322 | -87.6586 | member | 0 days 00:26:49 | Sunday | Weekend |
1 | 5405B80E996FF60D | Docked Bike | 2020-04-17 17:08:54 | 2020-04-17 17:17:03 | Drake Ave & Fullerton Ave | 503 | Kosciuszko Park | 499.0 | 41.9244 | -87.7154 | 41.9306 | -87.7238 | member | 0 days 00:08:09 | Friday | Weekday |
2 | 5DD24A79A4E006F4 | Docked Bike | 2020-04-01 17:54:13 | 2020-04-01 18:08:36 | McClurg Ct & Erie St | 142 | Indiana Ave & Roosevelt Rd | 255.0 | 41.8945 | -87.6179 | 41.8679 | -87.6230 | member | 0 days 00:14:23 | Wednesday | Weekday |
3 | 2A59BBDF5CDBA725 | Docked Bike | 2020-04-07 12:50:19 | 2020-04-07 13:02:31 | California Ave & Division St | 216 | Wood St & Augusta Blvd | 657.0 | 41.9030 | -87.6975 | 41.8992 | -87.6722 | member | 0 days 00:12:12 | Tuesday | Weekday |
4 | 27AD306C119C6158 | Docked Bike | 2020-04-18 10:22:59 | 2020-04-18 11:15:54 | Rush St & Hubbard St | 125 | Sheridan Rd & Lawrence Ave | 323.0 | 41.8902 | -87.6262 | 41.9695 | -87.6547 | casual | 0 days 00:52:55 | Saturday | Weekend |
5 | 356216E875132F61 | Docked Bike | 2020-04-30 17:55:47 | 2020-04-30 18:01:11 | Mies van der Rohe Way & Chicago Ave | 173 | Streeter Dr & Grand Ave | 35.0 | 41.8969 | -87.6217 | 41.8923 | -87.6120 | member | 0 days 00:05:24 | Thursday | Weekday |
6 | A2759CB06A81F2BC | Docked Bike | 2020-04-02 14:47:19 | 2020-04-02 14:52:32 | Streeter Dr & Grand Ave | 35 | Fairbanks St & Superior St | 635.0 | 41.8923 | -87.6120 | 41.8957 | -87.6201 | member | 0 days 00:05:13 | Thursday | Weekday |
7 | FC8BC2E2D54F35ED | Docked Bike | 2020-04-07 12:22:20 | 2020-04-07 13:38:09 | Ogden Ave & Roosevelt Rd | 434 | Western Ave & Congress Pkwy | 382.0 | 41.8665 | -87.6847 | 41.8747 | -87.6864 | casual | 0 days 01:15:49 | Tuesday | Weekday |
8 | 9EC5648678DE06E6 | Docked Bike | 2020-04-15 10:30:11 | 2020-04-15 10:35:55 | LaSalle Dr & Huron St | 627 | Larrabee St & Division St | 359.0 | 41.8949 | -87.6323 | 41.9035 | -87.6434 | casual | 0 days 00:05:44 | Wednesday | Weekday |
9 | A8FFF89140C33017 | Docked Bike | 2020-04-04 15:02:28 | 2020-04-04 15:19:47 | Kedzie Ave & Lake St | 377 | Central Park Ave & North Ave | 508.0 | 41.8846 | -87.7063 | 41.9097 | -87.7166 | member | 0 days 00:17:19 | Saturday | Weekend |
Let's look at the unique counts of members and casuals.
member_casual_count = tripDf['member_casual'].value_counts()
# member_casual_count.plot(kind = 'pie', title = "Member vs Casual")
fig, ax = plt.subplots()
ax.pie(
member_casual_count,
autopct='%1.1f%%',
labels = ['Member', 'Casual'],
startangle=90,
)
plt.title("Member vs Casual")
fig.set_size_inches([8, 8])
plt.show()
#There are 2,260,001 members and 1,566,977 casuals
member_casual_proportion = tripDf['member_casual'].value_counts(normalize=True)
print(member_casual_proportion * 100)
member    59.054455
casual    40.945545
Name: member_casual, dtype: float64
Let's take a look at the summary statistics:
tripDf['ride_duration'].describe()
count                      3826978
mean     0 days 00:24:42.689237304
std      0 days 06:17:58.646501483
min            -21 days +19:50:02
25%                0 days 00:07:48
50%                0 days 00:14:21
75%                0 days 00:26:23
max               40 days 18:40:02
Name: ride_duration, dtype: object
The minimum value is actually negative. Let's check how many ride durations fall below zero.
less_0_sec = tripDf.loc[tripDf['ride_duration'] <= datetime.timedelta(seconds = 0), 'ride_duration'].shape # 10,976 rows at or below zero seconds
excluded = tripDf[tripDf['ride_duration'] >= datetime.timedelta(seconds = 0)] #keep only the non-negative durations
# tripDf.shape #3,826,978
excluded.shape #3,816,421; about 10,000 negative-duration rows are dropped
excluded['ride_duration'].describe()
count                      3816421
mean     0 days 00:27:36.851399780
std      0 days 04:09:19.799293299
min                0 days 00:00:00
25%                0 days 00:07:51
50%                0 days 00:14:24
75%                0 days 00:26:26
max               40 days 18:40:02
Name: ride_duration, dtype: object
member_only = excluded[excluded['member_casual'] == 'member']
casual_only = excluded[excluded['member_casual'] == 'casual']
#Double check if I filtered it right
print(member_only['member_casual'].unique())
print(casual_only['member_casual'].unique())
print('\n')
print('This statistical summary excludes casual riders')
print(member_only['ride_duration'].describe())
print('\n')
print('This statistical summary excludes member riders')
print(casual_only['ride_duration'].describe())
['member']
['casual']

This statistical summary excludes casual riders
count                      2252699
mean     0 days 00:15:59.287310466
std      0 days 01:28:36.864600156
min                0 days 00:00:00
25%                0 days 00:06:28
50%                0 days 00:11:23
75%                0 days 00:19:55
max               40 days 18:40:02
Name: ride_duration, dtype: object

This statistical summary excludes member riders
count                      1563722
mean     0 days 00:44:21.762711658
std      0 days 06:14:04.612557996
min                0 days 00:00:00
25%                0 days 00:11:24
50%                0 days 00:20:56
75%                0 days 00:39:43
max               38 days 16:03:53
Name: ride_duration, dtype: object
#adding a new column containing the month and year of each trip
tripDf['month_borrowed'] = tripDf['started_at'].dt.strftime('%B %Y')
#rebuild the filtered frame so it picks up the new column, again excluding negative durations
excluded = tripDf[tripDf['ride_duration'] >= datetime.timedelta(seconds = 0)]
excluded['ride_duration'].describe()
count                      3816421
mean     0 days 00:27:36.851399780
std      0 days 04:09:19.799293299
min                0 days 00:00:00
25%                0 days 00:07:51
50%                0 days 00:14:24
75%                0 days 00:26:26
max               40 days 18:40:02
Name: ride_duration, dtype: object
summary_ride_duration = excluded.groupby('month_borrowed')['ride_duration'].describe()
# print(summary_ride_duration)
grouped = excluded.groupby(['month_borrowed', 'member_casual'], as_index = False)['ride_duration']
df = grouped.mean()
#convert the timedelta averages to minutes
df['ride_duration'] = df['ride_duration'].dt.total_seconds() / 60
#sort by month
df['month_borrowed'] = pd.to_datetime(df['month_borrowed'])
df = df.sort_values(by = 'month_borrowed')
#convert back to (Month_Name Year) format
df['month_borrowed'] = df['month_borrowed'].dt.strftime('%B %Y')
sns.set_style("darkgrid")
#catplot creates its own figure, so a prior plt.figure() would only leave an empty one;
#the size is controlled through the height parameter instead
ride_dur = sns.catplot(
    x = 'ride_duration',
    y = 'member_casual',
    data = df,
    col = 'month_borrowed',
    col_wrap = 4,
    height = 3.5,
    kind = 'bar')
ride_dur.set(
ylabel = "Average Ride Duration (Minutes)"
)
ride_dur.fig.suptitle('Average Ride Duration (Minutes) Per Month by Rider Type', y = 1.03)
ride_dur.set_titles("Date: {col_name}")
plt.subplots_adjust(hspace=0.4, wspace=0.1)
fig_dims = (15, 9)
fig, ax = plt.subplots(figsize=fig_dims)
day_borrowed = sns.countplot(
data = excluded,
x = 'day_borrowed',
ax = ax
)
day_borrowed.set_title('Number of people renting bikes on each day of the week')
plt.xticks(rotation = 25)
day_borrowed.set(xlabel = 'Day of the Week Borrowed', ylabel = 'Total Number of Rides')
fig_dims = (15, 9)
fig, ax = plt.subplots(figsize=fig_dims)
day_borrowed = sns.countplot(
data = member_only,
x = 'day_borrowed',
ax = ax
)
day_borrowed.set_title('Number of member rides on each day of the week')
plt.xticks(rotation = 25)
day_borrowed.set(xlabel = 'Day of the Week Borrowed', ylabel = 'Number of Members that Rented Bikes')
fig_dims = (15, 9)
fig, ax = plt.subplots(figsize=fig_dims)
day_borrowed = sns.countplot(
data = casual_only,
x = 'day_borrowed',
ax = ax
)
day_borrowed.set_title('Number of casual rides on each day of the week')
plt.xticks(rotation = 25)
day_borrowed.set(xlabel = 'Day of the Week Borrowed', ylabel = 'Number of Casual Riders that Rented Bikes')
Interesting! If the company wants to run an advertising campaign to convince casual riders to convert to an annual membership, I can suggest emphasizing Saturdays. Both members and casual riders prefer to ride on Saturdays, so advertising on a Saturday not only lets the company reach casual riders who might subscribe, but also helps remind existing members to renew their memberships.
This coincidentally answers the fourth question I wanted to address: are there particular days of the week on which most rides take place? I will now move on to the next question, and after that I will tackle the fifth and last question of my business objectives.
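The day-of-week countplots above can also be collapsed into a single per-group statistic. A sketch on a toy frame (illustrative rows, not the real data) of extracting the busiest day for each rider type:

```python
import pandas as pd

# Toy stand-in for the cleaned trip data; values are illustrative only.
toy = pd.DataFrame({
    "member_casual": ["member", "member", "member", "casual", "casual", "casual"],
    "day_borrowed": ["Saturday", "Saturday", "Monday", "Saturday", "Saturday", "Sunday"],
})

# For each rider type, the day with the highest trip count.
busiest_day = toy.groupby("member_casual")["day_borrowed"].agg(
    lambda s: s.value_counts().idxmax()
)
print(busiest_day.to_dict())  # {'casual': 'Saturday', 'member': 'Saturday'}
```

Run against `excluded` with the real columns, the same expression confirms the Saturday pattern numerically instead of by eye.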
To identify this, let's take a look at countplots of the most frequented stations.
#Top 5 start stations
top_5_start = excluded["start_station_name"].value_counts().head(5).plot(kind = 'bar', rot = 55, title = 'Top 5 Start Stations', xlabel = 'Start Stations')
plt.show()
top_5_end = excluded["end_station_name"].value_counts().head(5).plot(kind = 'bar', rot = 55, title = 'Top 5 End Stations', xlabel = 'End Stations')
plt.show()
Let's isolate casual riders, as we are more interested in how to convert them into members.
top_5_start = casual_only["start_station_name"].value_counts().head(5).plot(kind = 'bar', rot = 55, title = "Start Station (Casual Only)", xlabel = 'Start Stations')
plt.show()
top_5_end = casual_only["end_station_name"].value_counts().head(5).plot(kind = 'bar', rot = 55, title = "End Station (Casual Only)", xlabel = 'End Stations')
plt.show()
ride_type = excluded['rideable_type'].value_counts(normalize = True)
fig_dims = (15, 10)
fig, ax = plt.subplots(figsize=fig_dims)
plt.rcParams["axes.labelsize"] = 20
ride_p = sns.countplot(
data = excluded,
x = 'rideable_type',
hue = 'member_casual',
ax = ax
)
ride_p.set_title('Preference of rideable types from April 2020 until April 2021', y = 1.03)
ride_p.set(xlabel = 'Type of Bikes', ylabel = 'Number of Bikes Rented')
plt.show()
summary = excluded.groupby('month_borrowed')['rideable_type'].describe()
# equivalent to summary = excluded.groupby('month_borrowed')['rideable_type'].aggregate(['count', pd.Series.mode])
sorted_summary = summary.sort_values(by = "count", ascending = False)
print(sorted_summary)
                 count unique           top    freq
month_borrowed
August 2020     619592      2   Docked Bike  553403
July 2020       549735      2   Docked Bike  547801
September 2020  530826      2   Docked Bike  402479
October 2020    386742      2   Docked Bike  234574
June 2020       342536      1   Docked Bike  342536
April 2021      337225      3  Classic Bike  214616
November 2020   258851      2   Docked Bike  150734
March 2021      228494      3  Classic Bike  152544
May 2020        200102      1   Docked Bike  200102
December 2020   131139      3  Classic Bike   70616
January 2021     96832      3  Classic Bike   61699
April 2020       84725      1   Docked Bike   84725
February 2021    49622      3  Classic Bike   35012
fig_dims = (10, 7)
fig, ax = plt.subplots(figsize = fig_dims)
ride_p = sns.countplot(
data = excluded,
y = 'month_borrowed',
hue = 'rideable_type',
ax = ax
)
# ax.set_xticklabels(ax.get_xticklabels(),rotation = 30)
ride_p.set(xlabel = 'Number of Monthly Trips', ylabel = 'Month Borrowed')
ride_p.set_title('Number of Monthly Trips by Rideable Types', y = 1.03)
It seems that August 2020 was the busiest month in terms of rentals. Docked bikes were the most used from April 2020 to November 2020, but lost popularity after that. The summary statistics confirm that from April to June 2020 the data only contains docked bikes. Electric bikes slowly gained popularity starting in August 2020, and classic bikes, for some reason, have been growing in popularity since December 2020.
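The monthly bike-type counts read off the plots can also be computed directly with `pd.crosstab`. A sketch on a toy frame (illustrative rows only, one row per trip as in the real frame):

```python
import pandas as pd

# Toy stand-in: one row per trip, values are illustrative.
toy = pd.DataFrame({
    "month_borrowed": ["April 2020", "April 2020", "April 2021", "April 2021"],
    "rideable_type": ["Docked Bike", "Docked Bike", "Classic Bike", "Electric Bike"],
})

# Trip counts per month and bike type in one table; absent combinations get 0.
counts = pd.crosstab(toy["month_borrowed"], toy["rideable_type"])
print(counts.loc["April 2020", "Docked Bike"])  # 2
```

The same call on `excluded` yields the exact counts behind each bar in the chart.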
To get a better picture, let's visualize popular rideable type bikes per month and separate them by whether they are members or casuals.
ride_grouped = sns.catplot(
data = excluded,
y = 'month_borrowed',
kind = 'count',
hue = 'rideable_type',
col = 'member_casual'
)
ride_grouped.set(
ylabel = "Month Borrowed",
xlabel = 'Number of Monthly Trips'
)
ride_grouped.fig.suptitle('Number of Monthly trips Separated By Rider Type', y = 1.03)
ride_grouped.set_titles("Rider Type: {col_name}")
plt.subplots_adjust(hspace=0.4, wspace=0.1)
It seems that both casual and member riders suddenly shifted toward classic bikes starting in December 2020, with their popularity growing steadily through April 2021.
summary_rider_type = excluded.groupby('month_borrowed')['member_casual'].describe()
# print(summary_rider_type) <- This is to double check
fig_dims = (10, 7)
fig, ax = plt.subplots(figsize = fig_dims)
ride_p = sns.countplot(
data = excluded,
x = 'month_borrowed',
hue = 'member_casual',
ax = ax
)
ride_p.set(xlabel = 'Month Borrowed', ylabel = 'Number of Monthly Trips')
ride_p.set_title('Number of Monthly Trips by Rider Types', y = 1.03)
plt.xticks(rotation = 90)
plt.show();
From the data I have examined, I offer these recommendations:
As casual riders actually ride longer on average than members, the potential to convince them to subscribe to the annual membership is quite high. A special promotional offer on the annual subscription targeting casual riders is therefore likely to be enticing. If the company also focuses on encouraging casual riders to use the service beyond the weekends, that would further increase the chances of converting a casual rider into a member.
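The comparison behind this recommendation is just a grouped mean. A sketch on toy durations in minutes (the numbers are made up, but chosen to mirror the roughly 44-minute casual versus 16-minute member averages in the summary statistics above):

```python
import pandas as pd

# Hypothetical ride durations in minutes, not real trips.
toy = pd.DataFrame({
    "member_casual": ["member", "member", "casual", "casual"],
    "duration_min": [12.0, 20.0, 30.0, 58.0],
})

# Average duration per rider type; casuals come out far longer.
avg = toy.groupby("member_casual")["duration_min"].mean()
print(avg.to_dict())  # {'casual': 44.0, 'member': 16.0}
```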
It is recommended that the company focus on the top 5 most frequented start and end stations, which would increase the company's advertising exposure. If the main focus is only the single most commonly used start and end stations, then Streeter Dr & Grand Ave and Clark St & Elm St are the ones to target most heavily, as their ride volume is the highest among the top 5.
As both members and casual riders ride most frequently on Saturdays, it is recommended that the company concentrate its advertising on those days, not only to reach casual riders but also to encourage existing members to renew their annual subscriptions.
The lower average ride duration for members may be because they rent bikes for commuting. My hypothesis is that people who commute with docked bikes (the most rented of the three bike types in this data) typically do so only when the distance between home and workplace is short, which keeps their average ride duration below that of casual riders. By the same token, casual riders appear to ride mainly for leisure. The company should shape its ad campaigns with this in mind to make them more appealing to casual riders.
At first glance, the classic bike appears to be the least popular bike among both members and casual riders when the data is viewed as a whole. On closer inspection, however, classic bikes have been growing in popularity since December 2020 and peak in April 2021, the last month of available data. Docked bikes have lost popularity for some reason, perhaps the climate or the easing of pandemic restrictions starting in April. It is recommended that the company focus on promoting electric and classic bikes going forward, as the data indicates both are gaining popularity at the start of 2021.
The number of trips taken by both member and casual riders starts to decline around September, then jumps back to high volumes starting in February. This may be of interest to the company, as it indicates that riders ride more from March through August and less from September through February.
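One caveat when grouping by `month_borrowed` is that a `'%B %Y'` string sorts alphabetically, which is why the notebook converts back and forth to order the months. Grouping on a monthly `Period` keeps chronological order automatically; a sketch on toy timestamps (illustrative dates only):

```python
import pandas as pd

# Illustrative timestamps; the real data spans April 2020 to April 2021.
toy = pd.DataFrame({
    "started_at": pd.to_datetime(
        ["2020-08-01", "2020-08-15", "2020-08-20", "2021-01-10"]
    )
})

# Trips per calendar month, sorted chronologically rather than alphabetically.
monthly = toy["started_at"].dt.to_period("M").value_counts().sort_index()
print(monthly)
```

Applied to `excluded['started_at']`, this gives the seasonal trend above as a single chronologically ordered series.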