Commit 7103427

authored

Submit the BikeShare Dataset Project

1 parent a2463ad commit 7103427Copy full SHA for 7103427

File tree

1 file changed

+264

-0

lines changed

Project Explore BikeShare dataset
- Explore BikeShare.py

1 file changed

+264

-0

lines changed

`‎Project Explore BikeShare dataset/Explore BikeShare.py‎`

Lines changed: 264 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,264 @@`
	`1`	`+#####`
	`2`	`+##### Sondos Aabed Explores the Bikeshare Dataset`
	`3`	`+#####`
	`4`	`+`
	`5`	`+### Importing the necessary libraries`
	`6`	`+import time`
	`7`	`+import pandas as pd`
	`8`	`+import numpy as np`
	`9`	`+`
	`10`	`+#### this is the csv files dictionary`
	`11`	`+CITY_DATA = { 'chicago': 'chicago.csv',`
	`12`	`+ 'new york city': 'new_york_city.csv',`
	`13`	`+ 'washington': 'washington.csv' }`
	`14`	`+`
	`15`	`+#### in this method get the filters inputted by the user`
	`16`	`+def get_filters():`
	`17`	`+ """`
	`18`	`+ Asks user to specify a city, month, and day to analyze.`
	`19`	`+`
	`20`	`+ Returns:`
	`21`	`+ (str) city - name of the city to analyze`
	`22`	`+ (str) month - name of the month to filter by, or "all" to apply no month filter`
	`23`	`+ (str) day - name of the day of week to filter by, or "all" to apply no day filter`
	`24`	`+ """`
	`25`	`+ print('\nHello! Let\'s explore some US bikeshare data!')`
	`26`	`+ #####`
	`27`	`+ # In those cases an invalid input is handled by asking the user to try again until it's true input`
	`28`	`+ ####`
	`29`	`+ # get user input for city (chicago, new york city, washington).`
	`30`	`+ while True:`
	`31`	`+ city= input("\n Which City would like to explore? All, Chicago, New york city, Or Washington?\n")`
	`32`	`+ city=city.lower()`
	`33`	`+ if city not in ('all', 'new york city', 'chicago','washington'):`
	`34`	`+ print("Try to enter another city that is either: Chicago, New york city, Or Washington ")`
	`35`	`+ continue`
	`36`	`+ else:`
	`37`	`+ break`
	`38`	`+`
	`39`	`+ # get user input for month (all, january, february, ... , june)`
	`40`	`+ while True:`
	`41`	`+ month = input("\n In which of the months you want to explore? is it (all, january, february, ... , june)\n")`
	`42`	`+ month = month.lower()`
	`43`	`+ if month not in ('all','january','february','march','april','may','june','july','august','september','october','november','december'):`
	`44`	`+ print("Try to enter the month again, it wasn't a valid month!")`
	`45`	`+ continue`
	`46`	`+ else:`
	`47`	`+ break`
	`48`	`+`
	`49`	`+ # get user input for day of week (all, monday, tuesday, ... sunday)`
	`50`	`+ while True:`
	`51`	`+ day = input("\n What about the day you are looking for? is it (all, monday, tuesday, ... sunday)?\n")`
	`52`	`+ day = day.lower()`
	`53`	`+ if day not in ('sunday','monday','all','tuesday','wednesday','thursday','friday','saturday'):`
	`54`	`+ print("You entered a not valid day, try again")`
	`55`	`+ continue`
	`56`	`+ else:`
	`57`	`+ break`
	`58`	`+`
	`59`	`+ print('-'*40)`
	`60`	`+ return city, month, day`
	`61`	`+`
	`62`	`+# in this method load the dataset based on which city the user inputs`
	`63`	`+def load_data(city, month, day):`
	`64`	`+ """`
	`65`	`+ Loads data for the specified city and filters by month and day if applicable.`
	`66`	`+`
	`67`	`+ Args:`
	`68`	`+ (str) city - name of the city to analyze`
	`69`	`+ (str) month - name of the month to filter by, or "all" to apply no month filter`
	`70`	`+ (str) day - name of the day of week to filter by, or "all" to apply no day filter`
	`71`	`+ Returns:`
	`72`	`+ df - Pandas DataFrame containing city data filtered by month and day`
	`73`	`+ """`
	`74`	`+ # read the csv file using read_csv pandas based on the user input of cit`
	`75`	`+ # I have decided to add the option all because why not exploring all of them together giving a broader view`
	`76`	`+ if city not in ('all'):`
	`77`	`+ df = pd.read_csv(CITY_DATA[city])`
	`78`	`+ else:`
	`79`	`+ # for all dataframes if the user choses all combine them`
	`80`	`+ dfs = []`
	`81`	`+ for city, path in CITY_DATA.items(all):`
	`82`	`+ dfC = pd.read_csv(path)`
	`83`	`+ dfs.append(dfC)`
	`84`	`+`
	`85`	`+ df = pd.concat(dfs, ignore_index=True)`
	`86`	`+ ## print(df)`
	`87`	`+ return df`
	`88`	`+`
	`89`	`+## this method I created to clean the data`
	`90`	`+## cleaning the data included handling missing data`
	`91`	`+# also handle the high cardinality of dates`
	`92`	`+def clean_data(df):`
	`93`	`+ df = handle_dates(df)`
	`94`	`+ df = handle_missing(df)`
	`95`	`+ return df`
	`96`	`+`
	`97`	`+# this method I created to handle the missing data`
	`98`	`+def handle_missing(df):`
	`99`	`+ # I chose to fill them with Unknown`
	`100`	`+ print('We have {} missing enteries'.format(df.isnull().sum().sum()) )`
	`101`	`+ # fill Nan values using fillna method`
	`102`	`+ df.fillna('Unknown', inplace=True)`
	`103`	`+ print('These were filled by (Unknown) ')`
	`104`	`+ return df`
	`105`	`+`
	`106`	`+## this method I created to handle the dates`
	`107`	`+def handle_dates(df):`
	`108`	`+ """`
	`109`	`+ Handle the dates as their datatypes using to_datetime pandas`
	`110`	`+ """`
	`111`	`+ df['Start Time'] = pd.to_datetime(df['Start Time'])`
	`112`	`+ df['End Time'] = pd.to_datetime(df['End Time'])`
	`113`	`+ df['Birth Year'] = pd.to_datetime(df['Birth Year'])`
	`114`	`+`
	`115`	`+ ## this coulmn has high cardinality so I better create new coulmns that I can filter by`
	`116`	`+ # Like the day of the week and the month and the year and the time`
	`117`	`+ df['start_month'] = df['Start Time'].dt.strftime('%B').str.lower()`
	`118`	`+ df['start_day'] = df['Start Time'].dt.strftime('%A').str.lower()`
	`119`	`+ df['start_year'] = df['Start Time'].dt.strftime('%Y')`
	`120`	`+ df['start_time'] = df['Start Time'].dt.strftime('%X')`
	`121`	`+`
	`122`	`+ df['end_month'] = df['End Time'].dt.strftime('%B').str.lower()`
	`123`	`+ df['end_day'] = df['End Time'].dt.strftime('%A').str.lower()`
	`124`	`+ df['end_year'] = df['End Time'].dt.strftime('%Y')`
	`125`	`+ df['end_time'] = df['End Time'].dt.strftime('%X')`
	`126`	`+`
	`127`	`+ # we have also the coulmn of Birth year`
	`128`	`+ # df['Birth Year'] = pd.to_datetime(df['Birth Year'], format='%Y')`
	`129`	`+ # this is not working for users stats`
	`130`	`+ # I have decided to handle this one as integer to get the min and max values`
	`131`	`+ df['Birth Year'] = pd.to_numeric(df['Birth Year'],errors='coerce' , downcast='integer')`
	`132`	`+`
	`133`	`+ # dropped them after I handeld them`
	`134`	`+ df.drop('Start Time', axis=1, inplace=True)`
	`135`	`+ df.drop('End Time', axis=1, inplace=True)`
	`136`	`+`
	`137`	`+ return df`
	`138`	`+`
	`139`	`+# this method get the time travel frequent times`
	`140`	`+# to get that I used the mode built-in method`
	`141`	`+def time_stats(df):`
	`142`	`+ """Displays statistics on the most frequent times of travel."""`
	`143`	`+`
	`144`	`+ print('\nCalculating The Most Frequent Times of Travel...\n')`
	`145`	`+ start_time = time.time()`
	`146`	`+`
	`147`	`+ # the most common month`
	`148`	`+ print('The most frequent month is: ', df['start_month'].mode()[0])`
	`149`	`+`
	`150`	`+ # the most common day of week`
	`151`	`+ print('The most frequent day is: ', df['start_day'].mode()[0])`
	`152`	`+`
	`153`	`+ # the most common start hour`
	`154`	`+ print('The most commoon start hour is: ', df['start_time'].mode()[0])`
	`155`	`+`
	`156`	`+ print("\nThis took %s seconds." % (time.time() - start_time))`
	`157`	`+ print('-'*40)`
	`158`	`+`
	`159`	`+# in this method I get some statistics about the stations of the trip`
	`160`	`+# used mode and groupby`
	`161`	`+def station_stats(df):`
	`162`	`+ """Displays statistics on the most popular stations and trip."""`
	`163`	`+`
	`164`	`+ print('\nCalculating The Most Popular Stations and Trip...\n')`
	`165`	`+ start_time = time.time()`
	`166`	`+`
	`167`	`+ # most commonly used start station`
	`168`	`+ print('The most commonly used start station is: ', df['Start Station'].mode()[0] )`
	`169`	`+`
	`170`	`+ # most commonly used end station`
	`171`	`+ print('The most commonly used end station is: ', df['End Station'].mode()[0] )`
	`172`	`+`
	`173`	`+ # most frequent combination of start station and end station trip`
	`174`	`+ print('The most frequent combination of start station and end station trip is: ',`
	`175`	`+ df.groupby(['Start Station','End Station']).size().idxmax())`
	`176`	`+`
	`177`	`+ print("\nThis took %s seconds." % (time.time() - start_time))`
	`178`	`+ print('-'*40)`
	`179`	`+`
	`180`	`+# In this method I get some statistics about the trip duration`
	`181`	`+# used the sum, mean aggregation functions`
	`182`	`+def trip_duration_stats(df):`
	`183`	`+ """Displays statistics on the total and average trip duration."""`
	`184`	`+`
	`185`	`+ print('\nCalculating Trip Duration...\n')`
	`186`	`+ start_time = time.time()`
	`187`	`+`
	`188`	`+ # total travel time`
	`189`	`+ # the trip duration coulmn is in seconds`
	`190`	`+ # to make it more readable I convert it to days by dividing it on 86400`
	`191`	`+ print('The total travel time in hours is: ', df['Trip Duration'].sum()/86400)`
	`192`	`+`
	`193`	`+ # mean travel time`
	`194`	`+ print('The average travel time in minutes is: ', df['Trip Duration'].mean()/60)`
	`195`	`+`
	`196`	`+ print("\nThis took %s seconds." % (time.time() - start_time))`
	`197`	`+ print('-'*40)`
	`198`	`+`
	`199`	`+# In this method I get some statistics about the users`
	`200`	`+# Using value_counts, min, max and mode`
	`201`	`+def user_stats(df):`
	`202`	`+ """Displays statistics on bikeshare users."""`
	`203`	`+`
	`204`	`+ print('\nCalculating User Stats...\n')`
	`205`	`+ start_time = time.time()`
	`206`	`+`
	`207`	`+ # counts of user types`
	`208`	`+ print('In this city, we have diffrent types of users as follows: ')`
	`209`	`+ print(df['User Type'].value_counts())`
	`210`	`+`
	`211`	`+ # counts users based on gender`
	`212`	`+ print('The total count of each gender is as follow: ')`
	`213`	`+ print('Females:', df['Gender'].value_counts().get("Female", 0))`
	`214`	`+ print('Males:', df['Gender'].value_counts().get("Male", 0))`
	`215`	`+ print('Unknown:', df['Gender'].value_counts().get("Unknown", 0))`
	`216`	`+`
	`217`	`+ # So because I don't want to include the unknown value of these I will use a filter on the dataset`
	`218`	`+ # earliest year of birth`
	`219`	`+ print('The earliest year of birth is: ', df['Birth Year'].min())`
	`220`	`+`
	`221`	`+ # Something doesn't add up here because it first displays to me the (unknown) so because I used it to fill the missing data`
	`222`	`+ # I am thinking to impute the missing birth year with the mode of it`
	`223`	`+ # but this will effect the time since I already imputed why impute twice`
	`224`	`+ # so what can I do ?`
	`225`	`+`
	`226`	`+ # most recent of birth`
	`227`	`+ print('The most recent year of birth is: ', df['Birth Year'].max())`
	`228`	`+`
	`229`	`+ # most common year of birth`
	`230`	`+ print('The most common year of birth is: ', df['Birth Year'].mode()[0])`
	`231`	`+`
	`232`	`+ print("\nThis took %s seconds." % (time.time() - start_time))`
	`233`	`+ print('-'*40)`
	`234`	`+`
	`235`	`+def main():`
	`236`	`+ # start the program until the user hits no`
	`237`	`+ while True:`
	`238`	`+ # gets the filters`
	`239`	`+ city, month, day = get_filters()`
	`240`	`+`
	`241`	`+ # load the dataset`
	`242`	`+ df = load_data(city, month, day)`
	`243`	`+`
	`244`	`+ # clean the dataset`
	`245`	`+ df= clean_data(df)`
	`246`	`+`
	`247`	`+ # Display diffrent statics of the dataset`
	`248`	`+ time_stats(df)`
	`249`	`+ station_stats(df)`
	`250`	`+ trip_duration_stats(df)`
	`251`	`+ user_stats(df)`
	`252`	`+`
	`253`	`+ restart = input('\nWould you like to restart? Enter yes or no.\n')`
	`254`	`+ if restart.lower() != 'yes':`
	`255`	`+ break`
	`256`	`+`
	`257`	`+############################`
	`258`	`+`
	`259`	`+# In this project the datasets of different cities are explored`
	`260`	`+# interactively by the user`
	`261`	`+`
	`262`	`+############################`
# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 7103427

File tree

1 file changed

1 file changed

`‎Project Explore BikeShare dataset/Explore BikeShare.py‎`

0 commit comments