Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 7103427

Browse files
Submit the BikeShare Dataset Project
1 parent a2463ad commit 7103427

File tree

1 file changed

+264
-0
lines changed

1 file changed

+264
-0
lines changed
Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
#####
2+
##### Sondos Aabed Explores the Bikeshare Dataset
3+
#####
4+
5+
### Importing the necessary libraries
6+
import time
7+
import pandas as pd
8+
import numpy as np
9+
10+
# Lookup table: lower-case city name -> CSV file holding that city's trips.
CITY_DATA = {
    'chicago': 'chicago.csv',
    'new york city': 'new_york_city.csv',
    'washington': 'washington.csv',
}
14+
15+
#### in this method get the filters inputted by the user
16+
def get_filters():
    """
    Asks user to specify a city, month, and day to analyze.

    Each answer is normalized (surrounding whitespace removed, inner runs of
    whitespace collapsed, lower-cased) and the question is repeated until the
    answer is one of the offered options.

    Returns:
        (str) city - name of the city to analyze, or "all" for every city
        (str) month - name of the month to filter by (january to june), or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    """
    cities = ('all', 'chicago', 'new york city', 'washington')
    # Only the months offered in the prompt are accepted; the original check
    # also let july..december through although the prompt stops at june.
    months = ('all', 'january', 'february', 'march', 'april', 'may', 'june')
    days = ('all', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
            'saturday', 'sunday')

    def ask(prompt, options, retry_message):
        """Keep asking `prompt` until the normalized answer is in `options`."""
        while True:
            answer = ' '.join(input(prompt).split()).lower()
            if answer in options:
                return answer
            print(retry_message)

    print('\nHello! Let\'s explore some US bikeshare data!')

    # get user input for city (all, chicago, new york city, washington).
    city = ask(
        "\n Which City would like to explore? All, Chicago, New york city, Or Washington?\n",
        cities,
        "Try to enter another city that is either: All, Chicago, New york city, Or Washington ",
    )

    # get user input for month (all, january, february, ... , june)
    month = ask(
        "\n In which of the months you want to explore? is it (all, january, february, ... , june)\n",
        months,
        "Try to enter the month again, it wasn't a valid month!",
    )

    # get user input for day of week (all, monday, tuesday, ... sunday)
    day = ask(
        "\n What about the day you are looking for? is it (all, monday, tuesday, ... sunday)?\n",
        days,
        "You entered a not valid day, try again",
    )

    print('-'*40)
    return city, month, day
61+
62+
# in this method load the dataset based on which city the user inputs
63+
def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze (a key of CITY_DATA), or "all"
              to load every file listed in CITY_DATA and stack them together
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day.
             The columns are exactly those read from the CSV file(s); the
             frame can be empty when no trip matches the filters.
    Raises:
        KeyError - if city is neither "all" nor a key of CITY_DATA
    """
    if city != 'all':
        df = pd.read_csv(CITY_DATA[city])
    else:
        # "all" gives a broader view: read every city file and stack the rows.
        frames = [pd.read_csv(path) for path in CITY_DATA.values()]
        df = pd.concat(frames, ignore_index=True)

    if month != 'all' or day != 'all':
        # Parse into a separate Series so the 'Start Time' column itself is
        # returned untouched for the later cleaning step.
        started = pd.to_datetime(df['Start Time'])
        keep = pd.Series(True, index=df.index)
        if month != 'all':
            keep &= started.dt.month_name().str.lower() == month
        if day != 'all':
            keep &= started.dt.day_name().str.lower() == day
        df = df[keep].reset_index(drop=True)

    return df
88+
89+
## This function cleans the data
90+
## cleaning the data included handling missing data
91+
# also handle the high cardinality of dates
92+
def clean_data(df):
    """
    Prepare a raw trips DataFrame for the statistics functions.

    First splits the timestamp columns into filterable parts (handle_dates),
    then fills the remaining gaps in text columns (handle_missing).

    Args:
        df - DataFrame with at least 'Start Time' and 'End Time' columns
    Returns:
        df - the same DataFrame object, modified in place
    """
    df = handle_dates(df)
    df = handle_missing(df)
    return df


def handle_missing(df):
    """
    Report the number of missing entries and fill text columns with 'Unknown'.

    Numeric and datetime columns are deliberately left as NaN/NaT: writing the
    string 'Unknown' into them would produce mixed-type columns on which
    min(), max() and mode() no longer work, while pandas already skips NaN in
    those aggregations.

    Args:
        df - DataFrame to fill
    Returns:
        df - the same DataFrame object, modified in place
    """
    print('We have {} missing enteries'.format(df.isnull().sum().sum()))

    text_columns = [
        column for column in df.columns
        if not (pd.api.types.is_numeric_dtype(df[column])
                or pd.api.types.is_datetime64_any_dtype(df[column]))
    ]
    if text_columns:
        df[text_columns] = df[text_columns].fillna('Unknown')

    print('These were filled by (Unknown) ')
    return df


def handle_dates(df):
    """
    Replace the high-cardinality timestamp columns with filterable parts.

    'Start Time' and 'End Time' are parsed with pandas to_datetime and each is
    expanded into four string columns: <prefix>_month and <prefix>_day
    (lower-case names), <prefix>_year and <prefix>_time, where prefix is
    'start' or 'end'. The two original columns are then dropped.

    'Birth Year', when present, is converted to a numeric column; values that
    cannot be parsed become NaN.

    Args:
        df - DataFrame with 'Start Time' and 'End Time' columns
    Returns:
        df - the same DataFrame object, modified in place
    """
    parsed = (
        ('start', pd.to_datetime(df['Start Time'])),
        ('end', pd.to_datetime(df['End Time'])),
    )
    for prefix, stamps in parsed:
        df[prefix + '_month'] = stamps.dt.strftime('%B').str.lower()
        df[prefix + '_day'] = stamps.dt.strftime('%A').str.lower()
        df[prefix + '_year'] = stamps.dt.strftime('%Y')
        df[prefix + '_time'] = stamps.dt.strftime('%X')

    # Birth year is a plain number, not a timestamp: routing it through
    # to_datetime first (as before) mangled the missing values.
    # NOTE(review): the column is treated as optional here on the assumption
    # that not every city file provides it - confirm against the CSV files.
    if 'Birth Year' in df.columns:
        df['Birth Year'] = pd.to_numeric(df['Birth Year'], errors='coerce',
                                         downcast='integer')

    # The raw timestamps are no longer needed once their parts exist.
    df.drop(['Start Time', 'End Time'], axis=1, inplace=True)

    return df
138+
139+
# this method get the time travel frequent times
140+
# to get that I used the mode built-in method
141+
def time_stats(df):
    """
    Displays statistics on the most frequent times of travel.

    Prints the most common start month, start day of week and start hour.
    When a column has no usable values, 'no data' is printed instead of
    raising.

    Args:
        df - DataFrame with 'start_month', 'start_day' and 'start_time' columns
    """

    def most_common(values):
        """Return the first mode of `values`, or 'no data' when there is none."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else 'no data'

    print('\nCalculating The Most Frequent Times of Travel...\n')
    start_time = time.time()

    # the most common month
    print('The most frequent month is: ', most_common(df['start_month']))

    # the most common day of week
    print('The most frequent day is: ', most_common(df['start_day']))

    # the most common start hour
    # 'start_time' holds a full clock time, so taking its mode directly gives
    # the most common exact second rather than the hour; extract the hour first.
    # NOTE(review): assumes the HH:MM:SS layout that strftime('%X') yields in
    # the default C locale - confirm if the process changes its locale.
    clock = pd.to_datetime(df['start_time'], format='%H:%M:%S', errors='coerce')
    hours = clock.dt.hour.dropna().astype(int)
    print('The most commoon start hour is: ', most_common(hours))

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
158+
159+
# This function reports some statistics about the stations of the trips
160+
# used mode and groupby
161+
def station_stats(df):
    """
    Displays statistics on the most popular stations and trip.

    Prints the most common start station, the most common end station and the
    most frequent (start station, end station) pair. When there are no trips
    to analyze, 'no data' is printed instead of raising.

    Args:
        df - DataFrame with 'Start Station' and 'End Station' columns
    """

    def most_common(values):
        """Return the first mode of `values`, or 'no data' when there is none."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else 'no data'

    print('\nCalculating The Most Popular Stations and Trip...\n')
    start_time = time.time()

    # most commonly used start station
    print('The most commonly used start station is: ', most_common(df['Start Station']))

    # most commonly used end station
    print('The most commonly used end station is: ', most_common(df['End Station']))

    # most frequent combination of start station and end station trip
    # idxmax() raises on an empty result, so guard it explicitly.
    pair_counts = df.groupby(['Start Station', 'End Station']).size()
    top_pair = pair_counts.idxmax() if not pair_counts.empty else 'no data'
    print('The most frequent combination of start station and end station trip is: ',
          top_pair)

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
179+
180+
# This function reports some statistics about the trip duration
181+
# used the sum, mean aggregation functions
182+
def trip_duration_stats(df):
    """
    Displays statistics on the total and average trip duration.

    Args:
        df - DataFrame with a 'Trip Duration' column; the conversions below
             treat its values as seconds
    """

    print('\nCalculating Trip Duration...\n')
    start_time = time.time()

    durations = df['Trip Duration']

    # total travel time
    # Dividing seconds by 86400 yields days, so the label says days (it used
    # to claim hours while printing days).
    print('The total travel time in days is: ', durations.sum() / 86400)

    # mean travel time, converted from seconds to minutes
    print('The average travel time in minutes is: ', durations.mean() / 60)

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
198+
199+
# This function reports some statistics about the users
200+
# Using value_counts, min, max and mode
201+
def user_stats(df):
    """
    Displays statistics on bikeshare users.

    Prints the count per user type, the count per gender and the earliest,
    most recent and most common year of birth. The gender and birth-year
    sections print a notice instead of raising when their column is absent or
    holds no usable value.

    Args:
        df - DataFrame with a 'User Type' column and, optionally, 'Gender'
             and 'Birth Year' columns
    """

    print('\nCalculating User Stats...\n')
    start_time = time.time()

    # counts of user types
    print('In this city, we have diffrent types of users as follows: ')
    print(df['User Type'].value_counts())

    # counts users based on gender
    # NOTE(review): the column is treated as optional on the assumption that
    # not every city file provides it - confirm against the CSV files.
    if 'Gender' in df.columns:
        gender_counts = df['Gender'].value_counts()  # computed once, read three times
        print('The total count of each gender is as follow: ')
        print('Females:', gender_counts.get("Female", 0))
        print('Males:', gender_counts.get("Male", 0))
        print('Unknown:', gender_counts.get("Unknown", 0))
    else:
        print('Gender data is not available for this selection.')

    # Missing entries may have been replaced by the text 'Unknown' during
    # cleaning; coerce those to NaN and drop them so that only real years take
    # part in min/max/mode.
    if 'Birth Year' in df.columns:
        birth_years = pd.to_numeric(df['Birth Year'], errors='coerce').dropna()
    else:
        birth_years = pd.Series(dtype='float64')

    if birth_years.empty:
        print('Birth year data is not available for this selection.')
    else:
        # earliest year of birth
        print('The earliest year of birth is: ', int(birth_years.min()))

        # most recent of birth
        print('The most recent year of birth is: ', int(birth_years.max()))

        # most common year of birth
        print('The most common year of birth is: ', int(birth_years.mode()[0]))

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
234+
235+
def main():
    """
    Run the interactive exploration loop.

    Each round asks for the filters, loads the matching data, cleans it and
    prints the four groups of statistics, then asks whether to restart. The
    loop ends on any answer other than "yes".
    """
    while True:
        # gets the filters
        city, month, day = get_filters()

        # load the dataset
        df = load_data(city, month, day)

        if df.empty:
            # Nothing to summarize; the statistics index into mode results
            # and would fail on a frame without rows.
            print('\nNo trips match the selected filters.')
        else:
            # clean the dataset
            df = clean_data(df)

            # Display the different statistics of the dataset
            time_stats(df)
            station_stats(df)
            trip_duration_stats(df)
            user_stats(df)

        restart = input('\nWould you like to restart? Enter yes or no.\n')
        if restart.strip().lower() != 'yes':
            break
256+
257+
############################
258+
259+
# In this project the bikeshare datasets of different cities are explored
260+
# interactively by the user
261+
262+
############################
263+
# Start the interactive explorer only when this file is run as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /