I made this Python program to scrape and save the daily maximum pollution values in Mexico City. The data will be used in machine learning, but I wonder what could be improved:
# Scrape the daily maximum air-pollution values (CO and O3, whole-city zone)
# for Mexico City from the aire.cdmx.gob.mx query form and save them to CSV.
from selenium import webdriver  # BUG FIX: original used webdriver.Chrome without importing it
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
import bs4
import time
import csv

DRIVER_PATH = r"C:\Users\HP\Downloads\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.implicitly_wait(60)

try:
    # Retry the initial load a few times -- the connection is flaky.
    for _attempt in range(3):
        try:
            driver.get("http://www.aire.cdmx.gob.mx/default.php?opc=%27aqBjnmU=%27")
            break
        except WebDriverException:
            # BUG FIX: driver.navigate().refresh() is the Java Selenium API;
            # the Python binding exposes driver.refresh() directly.
            driver.refresh()

    # The query form lives inside an iframe; switch into it before interacting.
    WebDriverWait(driver, 60).until(
        EC.frame_to_be_available_and_switch_to_it(
            (By.XPATH, '//*[@id="contenedorinformacion03"]/div/iframe')))

    # Select the "air quality" report type.
    driver.find_element(
        By.XPATH, '//*[@id="sampleform"]/div[1]/div[1]/p[1]/input[2]').click()

    # Date range: 2010-01-01 (start year) through 2021-12-31 (day, month, year).
    _FORM = '//*[@id="indice_aire_div"]/div[1]/table/tbody/tr/td/table/tbody'
    for xpath_suffix, value in (
        ('/tr[2]/td[4]/span/select', "2010"),  # start year
        ('/tr[3]/td[2]/span/select', "31"),    # end day
        ('/tr[3]/td[3]/span/select', "12"),    # end month
        ('/tr[3]/td[4]/span/select', "2021"),  # end year
    ):
        Select(WebDriverWait(driver, 60).until(
            EC.element_to_be_clickable((By.XPATH, _FORM + xpath_suffix))
        )).select_by_value(value)

    # Contaminants: carbon monoxide and ozone.
    driver.find_element(By.XPATH, _FORM + '/tr[4]/td/span/input[2]').click()
    driver.find_element(By.XPATH, _FORM + '/tr[4]/td/span/input[4]').click()
    # Zone selector, then "daily maxima" data type, then submit.
    driver.find_element(By.XPATH, _FORM + '/tr[7]/td/span/input[6]').click()
    driver.find_element(By.XPATH, _FORM + '/tr[10]/td/span/input[2]').click()
    driver.find_element(By.XPATH, _FORM + '/tr[12]/td/div/input').click()

    time.sleep(30)  # bad internet makes it slow -- TODO: replace with an explicit wait
    page_html = driver.page_source
finally:
    # BUG FIX: always release the browser process, even on failure.
    driver.quit()

# Parse only the result table to keep the soup small.
soup = bs4.BeautifulSoup(page_html, features="lxml",
                         parse_only=bs4.SoupStrainer("table"))
table = soup.find("table")
output_rows = [
    [cell.get_text() for cell in row.find_all('td')]
    for row in table.find_all('tr')
]

# BUG FIX: newline='' prevents blank rows on Windows; explicit encoding avoids
# locale-dependent mojibake. Filename typo "polution" also corrected.
with open('pollution.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv.writer(csvfile).writerows(output_rows)
1 Answer 1
Don't use Selenium. Observe that the website sends the following POST request to http://www.aire.cdmx.gob.mx/estadisticas-consultas/consultas/resultado_consulta.php
with this request body:
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="tipo_attach"
b
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="diai"
31
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="mesi"
1
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="anoi"
2010
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="diaf"
12
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="mesf"
1
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="anof"
2021
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="CO"
on
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="O3"
on
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="TZ"
on
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="Q"
maximos
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="inter"
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="consulta"
Consulta
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="trip-start"
2022年08月18日
-----------------------------40642155113769355341724901495
Content-Disposition: form-data; name="trip-end"
2022年08月24日
-----------------------------40642155113769355341724901495--
Use the Requests package to construct such a request dynamically in a function with parameters for the date range, contaminants, zone and output type.
After that, I recommend that you use pandas.read_html to parse the output.
Suggested
from datetime import date
from requests import Session
def query(
    session: Session,
    start: date, end: date,
    contaminants: tuple[str, ...],
    zones: tuple[str, ...],
    criteria: str,
    timeout: float = 60.0,
    # etc. - more params need to be reverse-engineered
) -> str:
    """Fetch the aire.cdmx consultation result page for the given query.

    :param session: an open requests Session to reuse connections.
    :param start: first day of the date range (inclusive).
    :param end: last day of the date range (inclusive).
    :param contaminants: form field names, e.g. ('CO', 'O3'), sent as "on".
    :param zones: zone field names, e.g. ('TZ',), sent as "on".
    :param criteria: value for the "Q" field, e.g. 'maximos'.
    :param timeout: per-request timeout in seconds; requests would
        otherwise wait forever on a stalled server.
    :return: the raw HTML of the result page.
    :raises requests.HTTPError: on a non-2xx response.
    """
    with session.post(
        url='http://www.aire.cdmx.gob.mx/estadisticas-consultas/consultas/resultado_consulta.php',
        data={
            'tipo_attach': 'b',
            'diai': start.day, 'mesi': start.month, 'anoi': start.year,
            'diaf': end.day, 'mesf': end.month, 'anof': end.year,
            'Q': criteria, 'inter': '', 'consulta': 'Consulta',
            # Every selected contaminant/zone checkbox is submitted as "on".
            **dict.fromkeys(contaminants + zones, 'on'),
        },
        timeout=timeout,
    ) as resp:
        resp.raise_for_status()
        return resp.text
def main() -> None:
    """Query 2010-2021 daily maxima for CO and O3 (TZ zone) and print the HTML."""
    range_start = date(2010, 1, 1)
    range_end = date(2021, 12, 31)
    with Session() as session:
        result_page = query(
            session,
            start=range_start,
            end=range_end,
            contaminants=('CO', 'O3'),
            zones=('TZ',),
            criteria='maximos',
        )
    print(result_page)


if __name__ == '__main__':
    main()
-
\$\begingroup\$ How did you reverse engineer the parameters? The URL doesn't change; that's why I used Selenium instead of Requests. \$\endgroup\$Omar Morales Rivera– Omar Morales Rivera2022年08月26日 04:19:55 +00:00Commented Aug 26, 2022 at 4:19
-
1\$\begingroup\$ Google "Chrome dev tools" \$\endgroup\$Reinderien– Reinderien2022年08月26日 13:18:25 +00:00Commented Aug 26, 2022 at 13:18
-
\$\begingroup\$ The URL not changing is a poor reason to choose Selenium; that's actually even more incentive to use Requests. But if there are parameters, include them in the code. In the real world, things don't change until they do. \$\endgroup\$Reinderien– Reinderien2022年08月26日 13:19:09 +00:00Commented Aug 26, 2022 at 13:19
Explore related questions
See similar questions with these tags.