#################################################
###      4. GET THE DETAILS OF EACH CARD      ###
#################################################

# NOTE: this script takes around 20 minutes to run
# due to the number of pages to scrape.

# Authors of Code: Noam Shmuel & Lasha Gochiashvili

# Load main packages and libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Webdriver settings
gecko_path = 'C:/Users/Lasha/anaconda3/geckodriver.exe'

options = webdriver.firefox.options.Options()
options.headless = True

driver = webdriver.Firefox(options=options, executable_path=gecko_path)
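
# NOTE (assumption): the setup above targets Selenium 3.x, where
# executable_path and options.headless are valid. On Selenium 4.x the
# rough equivalent would be:
#   from selenium.webdriver.firefox.service import Service
#   options.add_argument('-headless')
#   driver = webdriver.Firefox(options=options, service=Service(gecko_path))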

'''
This function builds a one-row DataFrame holding the full details
scraped from a single card page. It has four parts.
'''
def getCardDetails(country, url):
    ### PART I
    # Declaring variables to hold the results of scraping
    driver.get(url)
    local_df = pd.DataFrame(columns=['country', 'card_url', 'general', 'country_link', 'city', 'PM2.5', 'date', 'hour'])
    pm = None
    date = None
    hour = None
    general = None
    city = None
    country_link = None

    try:
        # wait = WebDriverWait(driver, 3)
        # wait.until(EC.presence_of_element_located((By.ID, 'location-fold-stats')))
        time.sleep(2)

        ### PART II
        # Using XPath, get the full text of the sibling element that comes
        # after the "PM2.5" label, then split it to produce the "pm",
        # "date" and "hour" variables for our DataFrame.
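        # For illustration (hypothetical sample text; the live page may
        # differ): a value such as "7.0μg/m3 at 2021-05-09 14:00" would
        # split into pm = 7.0, date = "2021-05-09", hour = "14:00".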
        try:
            pm_date = driver.find_element(By.XPATH, '//dt[text() = "PM2.5"]/following-sibling::dd[1]').text
            # Scrape the pollution reading from the location page
            # and split it into the relevant variables
            text = pm_date.split('μg/m3 at ')
            pm = float(text[0])
            full_date = text[1].split(' ')
            date = full_date[0]
            hour = full_date[1]
        except Exception:
            pm = None
            date = None
            hour = None

        ### PART III
        # Using a class name, get the page title text and split it to
        # produce the "general", "city" and "country_link" variables.
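        # For illustration (hypothetical title; the live page may differ):
        # a two-line title such as "Station Name\n... in Tbilisi Georgia"
        # would yield general = "Station Name", city = "Tbilisi" and
        # country_link = "Georgia".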
        try:
            # find_element(By.CLASS_NAME, ...) replaces the deprecated
            # find_element_by_class_name, for consistency with the call above
            titles = driver.find_element(By.CLASS_NAME, 'inpage__title').text
            # Scraping the location details and creating the variables
            titles_split = titles.split('\n')
            general = titles_split[0]
            titles_split = titles_split[1].split('in ')[1].split(' ')
            city = titles_split[0]
            country_link = titles_split[1]
        except Exception:
            general = None
            city = None
            country_link = None
    except Exception:
        print("Something went wrong with WAIT")

    ### PART IV
    # Saving the variables we created into the DataFrame.
    # DataFrame.append was removed in pandas 2.0, so pd.concat is used here.
    d = {'country': country, 'card_url': url, 'general': general, 'country_link': country_link, 'city': city, 'PM2.5': pm, 'date': date, 'hour': hour}
    local_df = pd.concat([local_df, pd.DataFrame([d])], ignore_index=True)
    return local_df
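
# Minimal usage sketch (the URL below is hypothetical; real card URLs come
# from 3Links_Of_Cards.csv):
#   sample = getCardDetails('Georgia', 'https://example.com/location/1')
#   print(sample)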

time.sleep(2)

# Open the .csv file from the previous step and use its links to fill
# our new DataFrame with all the necessary information
df = pd.read_csv('3Links_Of_Cards.csv')
df = df.dropna()  # Remove NAs
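# The input file is expected to provide at least the 'country' and
# 'cardURL' columns used in the loop below.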
# print(df)
time.sleep(2)

# Creating the results DataFrame and setting column names
df2 = pd.DataFrame(columns=['country', 'card_url', 'general', 'country_link', 'city', 'PM2.5', 'date', 'hour'])

# For each link, scrape the card details and add them to the DataFrame
for index, row in df.iterrows():
    card_url = row['cardURL']
    country = row['country']
    time.sleep(1)
    myDf = getCardDetails(country, card_url)
    df2 = pd.concat([df2, myDf], ignore_index=True)
    if index == 99:  # Stop after the row labelled 99 (at most 100 cards)
        break

# Printing our new DataFrame
print("\n")
print(df2)

# Exporting our new DataFrame with full details as a .csv file
df2.to_csv('4Full_Details.csv', index=False, header=True)
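
# Optional sanity check (commented out): reload the export and confirm
# its shape before moving to the next step.
#   check = pd.read_csv('4Full_Details.csv')
#   print(check.shape)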

# Closing web browser
time.sleep(2)
driver.quit()