#################################################
###       2. GET THE LINKS OF EACH PAGE       ###
###              OF EACH COUNTRY              ###
#################################################

# Authors: Noam Shmuel & Lasha Gochiashvili
# Load the main packages and libraries
from selenium import webdriver
import pandas as pd
import csv
import time

# Webdriver settings
gecko_path = 'C:/Users/Lasha/anaconda3/geckodriver.exe'

options = webdriver.firefox.options.Options()
options.headless = False
driver = webdriver.Firefox(options=options, executable_path=gecko_path)
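
# Note: the driver setup above and the find_element_by_xpath call in the loop
# below use the Selenium 3 API. On newer Selenium (4.x) releases the
# executable_path argument and the find_element_by_* helpers have been removed;
# a rough equivalent sketch (not used in this script) would be:
#   from selenium.webdriver.common.by import By
#   from selenium.webdriver.firefox.service import Service
#   driver = webdriver.Firefox(options=options, service=Service(gecko_path))
#   driver.find_element(By.XPATH, path)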

# Open the OpenAQ locations page filtered to the PM2.5 parameter
url = 'https://openaq.org/#/locations?parameters=pm25&_k=bmrxjw'
driver.get(url)
time.sleep(2)  # give the page time to load

# Open the .csv file created at the first stage;
# it contains the names of the countries
with open('1Countries.csv', newline='') as f:
    reader = csv.reader(f)
    list_of_countries = list(reader)
    list_of_countries = list_of_countries[0]
    print(list_of_countries)  # print the list of countries
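
# For reference, 1Countries.csv is assumed to hold a single comma-separated
# row of country names (which is why only row [0] of the reader is kept),
# e.g.:
#   Afghanistan,Albania,Algeria,...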

# Create a DataFrame to hold each country and the URL of its page
df = pd.DataFrame(columns=['country', 'country_url'])

# Generate the URL of each country's page by clicking that country's
# filter on the website and reading the resulting address
for country in list_of_countries[:92]:
    try:
        # XPath that matches the <span> holding the country name,
        # used to select that country's filter on the page
        path = f'//span[contains(text(),"{country}")]'
        next_button = driver.find_element_by_xpath(path)
        next_button.click()  # apply the filter for this country
        time.sleep(2)        # wait for the filtered page to load
        country_url = driver.current_url  # URL of the current (filtered) page
        next_button.click()  # click again to clear the filter for the next country
    except Exception:
        country_url = None  # country could not be found or clicked

    # Save the country and its URL in the DataFrame created above
    d = [{'country': country, 'country_url': country_url}]
    df = df.append(d)

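# Note: DataFrame.append, used in the loop above, was removed in pandas 2.0.
# On newer pandas an equivalent pattern (a sketch, not what this script does)
# is to collect the rows in a plain list and build the frame once:
#   rows.append({'country': country, 'country_url': country_url})  # inside the loop
#   df = pd.DataFrame(rows, columns=['country', 'country_url'])    # after the loop
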
# Print the DataFrame
print(df)

# Save the DataFrame to a .csv file, which will be used at the third stage
# to get the links of the cards on each country page
df.to_csv('2Links_Of_Countries.csv', index=False, header=True)
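
# At the third stage the saved file can be read back with, for example:
#   links_df = pd.read_csv('2Links_Of_Countries.csv')  # 'links_df' is an illustrative name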

# Close the web browser
time.sleep(4)
driver.quit()