1
+ #####
2
##### Sondos Aabed Explores the Bikeshare Dataset
3
+ #####
4
+
5
+ ### Importing the necessary libraries
6
+ import time
7
+ import pandas as pd
8
+ import numpy as np
9
+
10
+ #### this is the csv files dictionary
11
# Lower-case city name -> CSV file that holds that city's trips.
CITY_DATA = {
    'chicago': 'chicago.csv',
    'new york city': 'new_york_city.csv',
    'washington': 'washington.csv',
}
14
+
15
+ #### in this method get the filters inputted by the user
16
def get_filters(read_input=None):
    """
    Asks user to specify a city, month, and day to analyze.

    Each answer is stripped of surrounding whitespace and lower-cased before it is
    checked. An answer that is not one of the accepted options prints a retry message
    and the same question is asked again.

    Note that the month question only advertises january..june, but any of the twelve
    month names is accepted.

    Args:
        (callable) read_input - optional function taking the prompt text and returning
                                the user's answer; the builtin input() is used when omitted

    Returns:
        (str) city - name of the city to analyze, or "all" for every city
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    """
    # Resolve the builtin at call time so the default follows the current input().
    reader = input if read_input is None else read_input

    print('\n Hello! Let\' s explore some US bikeshare data!')

    city = _ask_until_valid(
        "\n Which City would like to explore? All, Chicago, New york city, Or Washington?\n ",
        ('all', 'new york city', 'chicago', 'washington'),
        "Try to enter another city that is either: Chicago, New york city, Or Washington ",
        reader,
    )

    month = _ask_until_valid(
        "\n In which of the months you want to explore? is it (all, january, february, ... , june)\n ",
        ('all', 'january', 'february', 'march', 'april', 'may', 'june', 'july',
         'august', 'september', 'october', 'november', 'december'),
        "Try to enter the month again, it wasn't a valid month!",
        reader,
    )

    day = _ask_until_valid(
        "\n What about the day you are looking for? is it (all, monday, tuesday, ... sunday)?\n ",
        ('sunday', 'monday', 'all', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday'),
        "You entered a not valid day, try again",
        reader,
    )

    print('-' * 40)
    return city, month, day


def _ask_until_valid(question, accepted, retry_message, reader):
    """Ask *question* via *reader* until the normalised answer is one of *accepted*."""
    while True:
        answer = reader(question).strip().lower()
        if answer in accepted:
            return answer
        print(retry_message)
61
+
62
+ # in this method load the dataset based on which city the user inputs
63
def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    When city is "all", every file listed in CITY_DATA is read and the rows are
    stacked into one frame with a fresh index. The month and day filters are matched
    against the English month / weekday name of each row's 'Start Time'; the
    'Start Time' column itself is returned unchanged.

    Args:
        (str) city - name of the city to analyze, or "all" to combine every city
        (str) month - lower-case name of the month to filter by, or "all" to apply no month filter
        (str) day - lower-case name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day
             (it can be empty when no trip matches the filters)
    Raises:
        KeyError - if city is neither "all" nor a key of CITY_DATA
    """
    if city == 'all':
        # Combining all cities gives a broader view of the data.
        frames = [pd.read_csv(path) for path in CITY_DATA.values()]
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.read_csv(CITY_DATA[city])

    if month != 'all' or day != 'all':
        # Parse into a temporary series so the raw column is left untouched for later steps.
        # NOTE(review): assumes every file has a parseable 'Start Time' column - confirm.
        started = pd.to_datetime(df['Start Time'])
        keep = pd.Series(True, index=df.index)
        if month != 'all':
            keep &= started.dt.month_name().str.lower() == month
        if day != 'all':
            keep &= started.dt.day_name().str.lower() == day
        df = df[keep].reset_index(drop=True)

    return df
88
+
89
## this method I created to clean the data
## cleaning the data includes handling missing data
# and also handling the high cardinality of the dates
92
def clean_data(df):
    """Run the date handling step, then the missing-data step, and return the result."""
    # Dates go first: handle_dates derives its columns from the raw time columns.
    return handle_missing(handle_dates(df))
96
+
97
+ # this method I created to handle the missing data
98
def handle_missing(df):
    """
    Report the number of missing cells and fill the textual ones with 'Unknown'.

    Only non-numeric, non-datetime columns are filled. Numeric columns keep their NaN
    values: writing the string 'Unknown' into them would turn them into mixed-type
    columns on which min / max / sum no longer work, whereas pandas aggregations skip
    NaN by default.

    Args:
        df - Pandas DataFrame to clean; it is modified in place
    Returns:
        df - the same DataFrame object
    """
    print('We have {} missing enteries'.format(df.isnull().sum().sum()))

    text_columns = [
        name for name, dtype in df.dtypes.items()
        if not pd.api.types.is_numeric_dtype(dtype)
        and not pd.api.types.is_datetime64_any_dtype(dtype)
    ]
    if text_columns:
        df[text_columns] = df[text_columns].fillna('Unknown')

    print('Missing text values were filled by (Unknown); numeric gaps were left empty')
    return df
105
+
106
## this method I created to handle the dates
107
def handle_dates(df):
    """
    Replace the raw time columns with low-cardinality, filterable columns.

    'Start Time' and 'End Time' are parsed with pandas.to_datetime and each one is
    expanded into four columns (prefix 'start' / 'end'):
        <prefix>_month - lower-case month name
        <prefix>_day   - lower-case weekday name
        <prefix>_year  - year as text
        <prefix>_time  - time of day formatted with strftime('%X')
    The two raw columns are then dropped.

    'Birth Year', when the column exists, is converted to numbers (values that cannot
    be parsed become NaN) so that min / max work on it. It is deliberately not passed
    through to_datetime: a bare year is a number, not a timestamp.

    Args:
        df - Pandas DataFrame with 'Start Time' and 'End Time' columns; modified in place
    Returns:
        df - the same DataFrame object
    """
    # The raw timestamps have high cardinality, so derive columns that can be filtered on.
    for source, prefix in (('Start Time', 'start'), ('End Time', 'end')):
        stamps = pd.to_datetime(df[source])
        df[prefix + '_month'] = stamps.dt.strftime('%B').str.lower()
        df[prefix + '_day'] = stamps.dt.strftime('%A').str.lower()
        df[prefix + '_year'] = stamps.dt.strftime('%Y')
        df[prefix + '_time'] = stamps.dt.strftime('%X')

    # Guarded so a frame that has no birth-year column does not raise KeyError.
    if 'Birth Year' in df.columns:
        df['Birth Year'] = pd.to_numeric(df['Birth Year'], errors='coerce', downcast='integer')

    # The raw columns are no longer needed once the derived ones exist.
    df.drop(['Start Time', 'End Time'], axis=1, inplace=True)

    return df
138
+
139
+ # this method get the time travel frequent times
140
+ # to get that I used the mode built-in method
141
def time_stats(df):
    """
    Displays statistics on the most frequent times of travel.

    Prints the most common value of 'start_month' and 'start_day', and the most common
    start hour taken from 'start_time'. A statistic with nothing to report (for
    example an empty selection) prints 'No data available' instead of raising.

    Args:
        df - Pandas DataFrame with 'start_month', 'start_day' and 'start_time' columns
    """

    def most_common(values):
        # mode() is empty when there is no value at all, so guard the lookup.
        top = values.mode()
        return top.iloc[0] if not top.empty else 'No data available'

    print('\n Calculating The Most Frequent Times of Travel...\n ')
    start_time = time.time()

    # the most common month
    print('The most frequent month is: ', most_common(df['start_month']))

    # the most common day of week
    print('The most frequent day is: ', most_common(df['start_day']))

    # the most common start hour: reduce each clock time to its hour first, otherwise
    # the mode is taken over full HH:MM:SS strings rather than hours.
    # NOTE(review): assumes 'start_time' is formatted HH:MM:SS; anything else
    # (such as an 'Unknown' placeholder) is ignored.
    clock = pd.to_datetime(df['start_time'], format='%H:%M:%S', errors='coerce')
    hours = clock.dt.hour.dropna().astype(int)
    print('The most commoon start hour is: ', most_common(hours))

    print("\n This took %s seconds." % (time.time() - start_time))
    print('-' * 40)
158
+
159
# in this method I get some statistics about the stations of the trip
# using mode and groupby
161
def station_stats(df):
    """
    Displays statistics on the most popular stations and trip.

    Prints the most common 'Start Station', the most common 'End Station' and the most
    frequent (start, end) pair. When there are no rows to analyse, each line prints
    'No data available' instead of raising.

    Args:
        df - Pandas DataFrame with 'Start Station' and 'End Station' columns
    """
    print('\n Calculating The Most Popular Stations and Trip...\n ')
    start_time = time.time()

    no_data = 'No data available'
    top_starts = df['Start Station'].mode()
    top_ends = df['End Station'].mode()
    # Number of trips for every (start, end) pair that occurs in the data.
    trips_per_route = df.groupby(['Start Station', 'End Station']).size()

    # most commonly used start station
    print('The most commonly used start station is: ',
          top_starts.iloc[0] if not top_starts.empty else no_data)

    # most commonly used end station
    print('The most commonly used end station is: ',
          top_ends.iloc[0] if not top_ends.empty else no_data)

    # most frequent combination of start station and end station trip
    # (idxmax raises on an empty series, hence the guard)
    print('The most frequent combination of start station and end station trip is: ',
          trips_per_route.idxmax() if not trips_per_route.empty else no_data)

    print("\n This took %s seconds." % (time.time() - start_time))
    print('-' * 40)
179
+
180
# In this method I get some statistics about the trip duration
# using the sum and mean aggregation functions
182
def trip_duration_stats(df):
    """
    Displays statistics on the total and average trip duration.

    'Trip Duration' is treated as seconds (as the original author's comment states).
    The total is printed in days and the average in minutes. Values that are not
    numbers, such as an 'Unknown' placeholder, are ignored.

    Args:
        df - Pandas DataFrame with a 'Trip Duration' column
    """
    seconds_per_day = 86400
    seconds_per_minute = 60

    print('\n Calculating Trip Duration...\n ')
    start_time = time.time()

    # Coerce first so placeholder strings in the column cannot break sum() / mean().
    durations = pd.to_numeric(df['Trip Duration'], errors='coerce').dropna()

    if durations.empty:
        print('No trip durations available.')
    else:
        # total travel time: seconds -> days to keep the number readable
        print('The total travel time in days is: ', durations.sum() / seconds_per_day)

        # mean travel time: seconds -> minutes
        print('The average travel time in minutes is: ', durations.mean() / seconds_per_minute)

    print("\n This took %s seconds." % (time.time() - start_time))
    print('-' * 40)
198
+
199
# In this method I get some statistics about the users
# using value_counts, min, max and mode
201
def user_stats(df):
    """
    Displays statistics on bikeshare users.

    Prints the count of each 'User Type', the counts of the 'Gender' values
    "Female", "Male" and "Unknown", and the earliest, most recent and most common
    'Birth Year'.

    Birth years are converted to numbers before aggregating, so placeholder text such
    as 'Unknown' and missing values are left out instead of breaking min / max.
    A missing 'Gender' or 'Birth Year' column is reported rather than raising.

    Args:
        df - Pandas DataFrame with a 'User Type' column and, optionally,
             'Gender' and 'Birth Year' columns
    """
    print('\n Calculating User Stats...\n ')
    start_time = time.time()

    # counts of user types
    print('In this city, we have diffrent types of users as follows: ')
    print(df['User Type'].value_counts())

    # counts users based on gender
    if 'Gender' in df.columns:
        gender_counts = df['Gender'].value_counts()  # computed once, read three times
        print('The total count of each gender is as follow: ')
        print('Females:', gender_counts.get("Female", 0))
        print('Males:', gender_counts.get("Male", 0))
        print('Unknown:', gender_counts.get("Unknown", 0))
    else:
        print('Gender data is not available for this selection.')

    # Keep only real years: text placeholders and NaN are dropped by the coercion.
    if 'Birth Year' in df.columns:
        birth_years = pd.to_numeric(df['Birth Year'], errors='coerce').dropna().astype(int)
    else:
        birth_years = pd.Series(dtype=int)

    if birth_years.empty:
        print('Birth year data is not available for this selection.')
    else:
        # earliest year of birth
        print('The earliest year of birth is: ', birth_years.min())

        # most recent of birth
        print('The most recent year of birth is: ', birth_years.max())

        # most common year of birth
        print('The most common year of birth is: ', birth_years.mode().iloc[0])

    print("\n This took %s seconds." % (time.time() - start_time))
    print('-' * 40)
234
+
235
def main():
    """Run the interactive exploration loop until the user answers anything but 'yes'."""
    keep_going = True
    while keep_going:
        # Ask for the filters, then load and clean the matching dataset.
        city, month, day = get_filters()
        df = clean_data(load_data(city, month, day))

        # Display the different statistics of the dataset, in a fixed order.
        for report in (time_stats, station_stats, trip_duration_stats, user_stats):
            report(df)

        restart = input('\n Would you like to restart? Enter yes or no.\n ')
        keep_going = restart.lower() == 'yes'
256
+
257
+ ############################
258
+
259
# In this project the bikeshare datasets of different cities are explored
# interactively by the user
261
+
262
+ ############################
263
# Start the interactive session only when this file is executed as a script.
if __name__ == "__main__":
    main()
0 commit comments