2
\$\begingroup\$

I made this Python program to scrape and save the daily maximum pollution values in Mexico City. The data will be used in machine learning, but I wonder what could be improved:

"""Scrape the daily maximum pollution values (CO and O3) for Mexico City
from aire.cdmx.gob.mx and save them to polution.csv for later ML use."""
from selenium import webdriver  # fix: was missing, needed for webdriver.Chrome below
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # NOTE(review): unused in this script
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
import bs4
import csv
import time

DRIVER_PATH = r"C:\Users\HP\Downloads\chromedriver_win32\chromedriver.exe"

driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.implicitly_wait(60)

# Retry the initial page load a few times; the connection/site is flaky.
for _attempt in range(3):
    try:
        driver.get("http://www.aire.cdmx.gob.mx/default.php?opc=%27aqBjnmU=%27")
        break
    except WebDriverException:  # fix: bare `except:` also swallowed KeyboardInterrupt
        # fix: driver.navigate().refresh() is the Java binding; Python uses refresh()
        driver.refresh()

# The query form lives inside an iframe; switch into it once it is available.
WebDriverWait(driver, 60).until(
    EC.frame_to_be_available_and_switch_to_it(
        (By.XPATH, '//*[@id="contenedorinformacion03"]/div/iframe')))

# Choose the "air quality" report type.
airquality = driver.find_element(
    By.XPATH, '//*[@id="sampleform"]/div[1]/div[1]/p[1]/input[2]')
airquality.click()

# All form controls share this XPath prefix inside the inner table.
_FORM = '//*[@id="indice_aire_div"]/div[1]/table/tbody/tr/td/table/tbody/tr[1]/td/table/tbody'


def _select_when_clickable(xpath: str, value: str) -> None:
    """Wait until the <select> at *xpath* is clickable, then pick *value*."""
    dropdown = WebDriverWait(driver, 60).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    Select(dropdown).select_by_value(value)


# Date range: 2010 through 31 Dec 2021.
_select_when_clickable(_FORM + '/tr[2]/td[4]/span/select', "2010")  # start year
_select_when_clickable(_FORM + '/tr[3]/td[2]/span/select', "31")    # end day
_select_when_clickable(_FORM + '/tr[3]/td[3]/span/select', "12")    # end month
_select_when_clickable(_FORM + '/tr[3]/td[4]/span/select', "2021")  # end year

# Tick contaminants, zone, and output type, then submit the form.
driver.find_element(By.XPATH, _FORM + '/tr[4]/td/span/input[2]').click()   # carbon monoxide
driver.find_element(By.XPATH, _FORM + '/tr[4]/td/span/input[4]').click()   # ozone
driver.find_element(By.XPATH, _FORM + '/tr[7]/td/span/input[6]').click()   # zone selector
driver.find_element(By.XPATH, _FORM + '/tr[10]/td/span/input[2]').click()  # data type (maxima)
driver.find_element(By.XPATH, _FORM + '/tr[12]/td/div/input').click()      # go to results page

time.sleep(30)  # bad internet makes it slow; TODO: replace with an explicit wait

# Parse only <table> elements to keep the soup small, then flatten the first
# table into a list of rows of cell texts.
soup = bs4.BeautifulSoup(driver.page_source, features="lxml",
                         parse_only=bs4.SoupStrainer("table"))
table = soup.find("table")
output_rows = [[cell.get_text() for cell in row.find_all("td")]
               for row in table.find_all("tr")]

# fix: newline="" is required by the csv module; without it every row is
# followed by a blank line on Windows. (Filename misspelling kept as-is.)
with open("polution.csv", "w", newline="", encoding="utf-8") as csvfile:
    csv.writer(csvfile).writerows(output_rows)

driver.quit()  # fix: release the browser and chromedriver process
asked Aug 24, 2022 at 20:22
\$\endgroup\$

1 Answer 1

1
\$\begingroup\$

Don't use Selenium. Observe that the website sends the following POST request to http://www.aire.cdmx.gob.mx/estadisticas-consultas/consultas/resultado_consulta.php with this request body:

-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="tipo_attach"
b
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="diai"
31
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="mesi"
1
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="anoi"
2010
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="diaf"
12
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="mesf"
1
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="anof"
2021
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="CO"
on
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="O3"
on
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="TZ"
on
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="Q"
maximos
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="inter"
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="consulta"
Consulta
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="trip-start"
2022年08月18日
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="trip-end"
2022年08月24日
-----------------------------40642155113769355341724901495--

Use the Requests package to construct such a request dynamically in a function with parameters for the date range, contaminants, zone and output type.

After that, I recommend that you use pandas.read_html to parse the output.

Suggested

from datetime import date
from requests import Session
def query(
    session: Session,
    start: date, end: date,
    contaminants: tuple[str, ...],
    zones: tuple[str, ...],
    criteria: str,
    # etc. - more params need to be reverse-engineered
) -> str:
    """POST the consultation form and return the result page as HTML.

    *contaminants* (e.g. 'CO', 'O3') and *zones* (e.g. 'TZ') are sent as
    ticked checkboxes; *criteria* (e.g. 'maximos') selects the statistic.
    Raises requests.HTTPError on a non-2xx response.
    """
    # Checkbox fields appear in the form body as name=on when ticked.
    checked = dict.fromkeys(contaminants + zones, 'on')
    form = {
        'tipo_attach': 'b',
        'diai': start.day, 'mesi': start.month, 'anoi': start.year,
        'diaf': end.day, 'mesf': end.month, 'anof': end.year,
        'Q': criteria, 'inter': '', 'consulta': 'Consulta',
    }
    form.update(checked)
    with session.post(
        url='http://www.aire.cdmx.gob.mx/estadisticas-consultas/consultas/resultado_consulta.php',
        data=form,
    ) as response:
        response.raise_for_status()
        return response.text
def main() -> None:
    """Example query: CO and O3 daily maxima for zone 'TZ', 2010-2021."""
    with Session() as session:
        page = query(
            session,
            start=date(2010, 1, 1),
            end=date(2021, 12, 31),
            contaminants=('CO', 'O3'),
            zones=('TZ',),
            criteria='maximos',
        )
        print(page)


if __name__ == '__main__':
    main()
answered Aug 26, 2022 at 0:58
\$\endgroup\$
3
  • \$\begingroup\$ How did you reverse-engineer the parameters? The URL doesn't change; that's why I used Selenium instead of Requests. \$\endgroup\$ Commented Aug 26, 2022 at 4:19
  • 1
    \$\begingroup\$ Google "Chrome dev tools" \$\endgroup\$ Commented Aug 26, 2022 at 13:18
  • \$\begingroup\$ The URL not changing is a poor reason to choose Selenium; that's actually even more incentive to use Requests. But if there are parameters, include them in the code. In the real world, things don't change until they do. \$\endgroup\$ Commented Aug 26, 2022 at 13:19

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.