Commit 382602a

Create Get4Details.py
1 parent 49fa1fd commit 382602a

1 file changed: +126 −0 lines changed

Code/Get4Details.py

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
#################################################
###      4. GET THE DETAILS OF EACH CARD      ###
#################################################

# NOTE: this code takes around 20 mins runtime
# due to the number of pages to scrape.

# Authors of Code: Noam Shmuel & Lasha Gochiashvili

# Load main packages and libraries
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import time
import numpy as np
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Webdriver settings
gecko_path = 'C:/Users/Lasha/anaconda3/geckodriver.exe'

options = webdriver.firefox.options.Options()
options.headless = True

driver = webdriver.Firefox(options = options, executable_path = gecko_path)

'''
With this function we create a Data Frame that saves the full details
of the scraping, step by step. The function has four parts.
'''
def getCardDetails(country, url):
    ### PART I
    # Declaring variables to save the results of scraping
    driver.get(url)
    local_df = pd.DataFrame(columns=['country','card_url','general','country_link','city', 'PM2.5','date','hour'])
    pm = None
    date = None
    hour = None
    general = None
    city = None
    country_link = None

    try:
        #wait = WebDriverWait(driver, 3)
        #wait.until(EC.presence_of_element_located((By.ID, 'location-fold-stats')))
        time.sleep(2)

        ### PART II
        # Using XPath we get the full text of the sibling element that comes
        # after the element containing "PM2.5". We split the full text to
        # generate variables for our Data Frame such as "pm", "date" & "hour".
        try:
            pm_date = driver.find_element(By.XPATH, '//dt[text() = "PM2.5"]/following-sibling::dd[1]').text
            # Scraping pollution details from each location page
            # and splitting them to save in the relevant variables
            text = pm_date.split('μg/m3 at ')
            pm = float(text[0])
            full_date = text[1].split(' ')
            date = full_date[0]
            hour = full_date[1]
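            # Hypothetical example (page format assumed from the splits above):
            # a scraped string such as "10.0 μg/m3 at 2021/03/14 16:00" would
            # give pm = 10.0, date = "2021/03/14" and hour = "16:00".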
        except:
            pm = None
            date = None
            hour = None

        ### PART III
        # Using the class name we get the full title text to generate variables
        # for our Data Frame such as "country", "card_url", "general", "city"
        # & "country_link".
        try:
            titles = driver.find_element_by_class_name('inpage__title').text
            # Scraping location details and creating variables
            titles_split = titles.split('\n')
            general = titles_split[0]
            titles_split = titles_split[1].split('in ')[1].split(' ')
            city = titles_split[0]
            country_link = titles_split[1]
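            # Hypothetical example (title format assumed): a title text such as
            # "Some Station\nin Madrid Spain" would give general = "Some Station",
            # city = "Madrid" and country_link = "Spain".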
        except:
            general = None
            city = None
            country_link = None
    except:
        print ("Something went wrong with WAIT")

    ### PART IV
    # Saving each variable that we created into the Data Frame
    d = {'country':country,'card_url':url, 'general':general,'country_link':country_link,'city':city, 'PM2.5':pm,'date':date,'hour':hour }
    local_df = local_df.append(d, ignore_index=True)
    return (local_df)

time.sleep(2)

# Open the .csv file and use its links to fill our new Data Frame
# with all the necessary information
df = pd.read_csv('3Links_Of_Cards.csv')
df = df.dropna() # Remove NAs
#print(df)
time.sleep(2)

# Creating a Data Frame and setting the column names
df2 = pd.DataFrame(columns=['country','card_url','general','country_link','city', 'PM2.5','date','hour'])

# Adding the country and cardURL of each row to the Data Frame
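# The input file 3Links_Of_Cards.csv is assumed to contain at least the
# 'country' and 'cardURL' columns used below.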
for index, row in df.iterrows():
    myDf = pd.DataFrame(columns=['country','card_url','general','country_link','city', 'PM2.5','date','hour'])
    card_url = (row['cardURL'])
    country = (row['country'])
    time.sleep(1)
    myDf = getCardDetails(country, card_url)
    df2 = df2.append(myDf, ignore_index=True)
    if (index == 99): # We limit the iteration to 100 and not more
        break

# Printing our new Data Frame
print("\n")
print(df2)

# Exporting our new Data Frame with full details as a .csv file
df2.to_csv('4Full_Details.csv', index=False, header=True)

# Closing web browser
time.sleep(2)
driver.quit()

0 commit comments
