2
\$\begingroup\$

I made this Python program to scrape and save the daily maximum pollution values in Mexico City. The data will be used in machine learning, but I wonder what could be improved:

"""Scrape the daily maximum pollution values (CO and O3) for Mexico City
from aire.cdmx.gob.mx and save them to polution.csv for later ML use."""
from selenium import webdriver  # fix: was missing, needed for webdriver.Chrome below
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # NOTE(review): unused in this script
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
import bs4
import csv
import time

DRIVER_PATH = r"C:\Users\HP\Downloads\chromedriver_win32\chromedriver.exe"

driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.implicitly_wait(60)

# Retry the initial page load a few times; the connection/site is flaky.
for _attempt in range(3):
    try:
        driver.get("http://www.aire.cdmx.gob.mx/default.php?opc=%27aqBjnmU=%27")
        break
    except WebDriverException:  # fix: bare `except:` also swallowed KeyboardInterrupt
        # fix: driver.navigate().refresh() is the Java binding; Python uses refresh()
        driver.refresh()

# The query form lives inside an iframe; switch into it once it is available.
WebDriverWait(driver, 60).until(
    EC.frame_to_be_available_and_switch_to_it(
        (By.XPATH, '//*[@id="contenedorinformacion03"]/div/iframe')))

# Choose the "air quality" report type.
airquality = driver.find_element(
    By.XPATH, '//*[@id="sampleform"]/div[1]/div[1]/p[1]/input[2]')
airquality.click()

# All form controls share this XPath prefix inside the inner table.
_FORM = '//*[@id="indice_aire_div"]/div[1]/table/tbody/tr/td/table/tbody/tr[1]/td/table/tbody'


def _select_when_clickable(xpath: str, value: str) -> None:
    """Wait until the <select> at *xpath* is clickable, then pick *value*."""
    dropdown = WebDriverWait(driver, 60).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    Select(dropdown).select_by_value(value)


# Date range: 2010 through 31 Dec 2021.
_select_when_clickable(_FORM + '/tr[2]/td[4]/span/select', "2010")  # start year
_select_when_clickable(_FORM + '/tr[3]/td[2]/span/select', "31")    # end day
_select_when_clickable(_FORM + '/tr[3]/td[3]/span/select', "12")    # end month
_select_when_clickable(_FORM + '/tr[3]/td[4]/span/select', "2021")  # end year

# Tick contaminants, zone, and output type, then submit the form.
driver.find_element(By.XPATH, _FORM + '/tr[4]/td/span/input[2]').click()   # carbon monoxide
driver.find_element(By.XPATH, _FORM + '/tr[4]/td/span/input[4]').click()   # ozone
driver.find_element(By.XPATH, _FORM + '/tr[7]/td/span/input[6]').click()   # zone selector
driver.find_element(By.XPATH, _FORM + '/tr[10]/td/span/input[2]').click()  # data type (maxima)
driver.find_element(By.XPATH, _FORM + '/tr[12]/td/div/input').click()      # go to results page

time.sleep(30)  # bad internet makes it slow; TODO: replace with an explicit wait

# Parse only <table> elements to keep the soup small, then flatten the first
# table into a list of rows of cell texts.
soup = bs4.BeautifulSoup(driver.page_source, features="lxml",
                         parse_only=bs4.SoupStrainer("table"))
table = soup.find("table")
output_rows = [[cell.get_text() for cell in row.find_all("td")]
               for row in table.find_all("tr")]

# fix: newline="" is required by the csv module; without it every row is
# followed by a blank line on Windows. (Filename misspelling kept as-is.)
with open("polution.csv", "w", newline="", encoding="utf-8") as csvfile:
    csv.writer(csvfile).writerows(output_rows)

driver.quit()  # fix: release the browser and chromedriver process
asked Aug 24, 2022 at 20:22
\$\endgroup\$

1 Answer 1

1
\$\begingroup\$

Don't use Selenium. Observe that the website sends the following POST request to http://www.aire.cdmx.gob.mx/estadisticas-consultas/consultas/resultado_consulta.php with this request body:

-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="tipo_attach"
b
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="diai"
31
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="mesi"
1
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="anoi"
2010
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="diaf"
12
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="mesf"
1
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="anof"
2021
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="CO"
on
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="O3"
on
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="TZ"
on
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="Q"
maximos
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="inter"
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="consulta"
Consulta
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="trip-start"
2022年08月18日
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="trip-end"
2022年08月24日
-----------------------------40642155113769355341724901495--

Use the Requests package to construct such a request dynamically in a function with parameters for the date range, contaminants, zone and output type.

After that, I recommend that you use pandas.read_html to parse the output.

Suggested

from datetime import date
from requests import Session
def query(
    session: Session,
    start: date, end: date,
    contaminants: tuple[str, ...],
    zones: tuple[str, ...],
    criteria: str,
    # etc. - more params need to be reverse-engineered
) -> str:
    """POST the consultation form and return the result page as HTML.

    *contaminants* (e.g. 'CO', 'O3') and *zones* (e.g. 'TZ') are sent as
    ticked checkboxes; *criteria* (e.g. 'maximos') selects the statistic.
    Raises requests.HTTPError on a non-2xx response.
    """
    # Checkbox fields appear in the form body as name=on when ticked.
    checked = dict.fromkeys(contaminants + zones, 'on')
    form = {
        'tipo_attach': 'b',
        'diai': start.day, 'mesi': start.month, 'anoi': start.year,
        'diaf': end.day, 'mesf': end.month, 'anof': end.year,
        'Q': criteria, 'inter': '', 'consulta': 'Consulta',
    }
    form.update(checked)
    with session.post(
        url='http://www.aire.cdmx.gob.mx/estadisticas-consultas/consultas/resultado_consulta.php',
        data=form,
    ) as response:
        response.raise_for_status()
        return response.text
def main() -> None:
    """Example query: CO and O3 daily maxima for zone 'TZ', 2010-2021."""
    with Session() as session:
        page = query(
            session,
            start=date(2010, 1, 1),
            end=date(2021, 12, 31),
            contaminants=('CO', 'O3'),
            zones=('TZ',),
            criteria='maximos',
        )
        print(page)


if __name__ == '__main__':
    main()
answered Aug 26, 2022 at 0:58
\$\endgroup\$
3
  • \$\begingroup\$ How did you reverse-engineer the parameters? The URL doesn't change; that's why I used Selenium instead of Requests. \$\endgroup\$ Commented Aug 26, 2022 at 4:19
  • 1
    \$\begingroup\$ Google "Chrome dev tools" \$\endgroup\$ Commented Aug 26, 2022 at 13:18
  • \$\begingroup\$ The URL not changing is a poor reason to choose Selenium; that's actually even more incentive to use Requests. But if there are parameters, include them in the code. In the real world, things don't change until they do. \$\endgroup\$ Commented Aug 26, 2022 at 13:19

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.