This repository was archived by the owner on Apr 17, 2023. It is now read-only.

Commit 3a1bd0d

committed

Add files via upload

1 parent fc9bd99, commit 3a1bd0d (copy full SHA for 3a1bd0d)

File tree

1 file changed

+90

-0

lines changed

Week3
- scrapping.py

1 file changed

+90

-0

lines changed

`‎Week3/scrapping.py‎`

Lines changed: 90 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,90 @@`
import pandas as pd
from numpy.core.defchararray import isdigit
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
	`6`	`+`
	`7`	`+# import time`
	`8`	`+`
	`9`	`+browser = webdriver.Chrome(ChromeDriverManager().install())`
	`10`	`+`
	`11`	`+`
	`12`	`+def contains_digits(temp):`
	`13`	`+ for ch in temp:`
	`14`	`+ if isdigit(ch):`
	`15`	`+ return True`
	`16`	`+ return False`
	`17`	`+`
	`18`	`+`
	`19`	`+dict = {}`
	`20`	`+initialised_dict = False`
	`21`	`+for day in range(20, 28):`
	`22`	`+ # if day == 21:`
	`23`	`+ # break`
	`24`	`+ browser.get(f"https://www.mai.gov.ro/informare-covid-19-grupul-de-comunicare-strategica-{day}-ianuarie-ora-13-00/")`
	`25`	`+ # table = browser.find_element(by=By.CLASS_NAME, value='//*[@class="entry-content"]')`
	`26`	`+ table = browser.find_element(by=By.XPATH, value="//table")`
	`27`	`+`
	`28`	`+ list = table.text.split('\n')`
	`29`	`+ list = list[1:43]`
	`30`	`+`
	`31`	`+ header_len = 5`
	`32`	`+`
	`33`	`+ # print(len(list[0]))`
	`34`	`+`
	`35`	`+ csv_list = []`
	`36`	`+ for string in list:`
	`37`	`+ separated = string.split(' ')`
	`38`	`+ # print(separated)`
	`39`	`+ # aux = []`
	`40`	`+ # aux.append(separated[0])`
	`41`	`+ # city = ""`
	`42`	`+ # idx = 1`
	`43`	`+ # num = ""`
	`44`	`+ # while True:`
	`45`	`+ # try:`
	`46`	`+ # num = int(separated[idx])`
	`47`	`+ # break`
	`48`	`+ # except:`
	`49`	`+ # city += num`
	`50`	`+ # idx = idx + 1`
	`51`	`+ # aux.append(city)`
	`52`	`+ # aux.append(num)`
	`53`	`+ # aux.append(separated[3:5])`
	`54`	`+ aux = []`
	`55`	`+ aux.append(separated[0])`
	`56`	`+ idx = 1`
	`57`	`+ city = ""`
	`58`	`+ while not (contains_digits(separated[idx])):`
	`59`	`+ city += separated[idx]`
	`60`	`+ idx = idx + 1`
	`61`	`+`
	`62`	`+ aux.append(city)`
	`63`	`+ mylen = len(separated)`
	`64`	`+ for index in range(idx, len(separated)):`
	`65`	`+ aux.append(separated[index])`
	`66`	`+`
	`67`	`+ csv_list.append(aux)`
	`68`	`+`
	`69`	`+ # print(csv_list)`
	`70`	`+`
	`71`	`+ headers = []`
	`72`	`+ for i in range(5):`
	`73`	`+ header_title = browser.find_element(by=By.XPATH, value=f'//table//td[{i + 1}]').text`
	`74`	`+ headers.append(header_title)`
	`75`	`+`
	`76`	`+ if initialised_dict == False:`
	`77`	`+ initialised_dict = True`
	`78`	`+ dict = {i: [] for i in headers}`
	`79`	`+`
	`80`	`+ for string in csv_list:`
	`81`	`+ for index in range(len(headers)):`
	`82`	`+ # print(len(headers))`
	`83`	`+ # print(len(string))`
	`84`	`+ dict[headers[index]].append(string[index])`
	`85`	`+`
	`86`	`+ # print(dict)`
	`87`	`+`
	`88`	`+df = pd.DataFrame(dict)`
	`89`	`+df.to_csv('ALL_DATA_GOV.csv')`
	`90`	`+browser.close()`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 3a1bd0d

File tree

1 file changed

1 file changed

`‎Week3/scrapping.py‎`

0 commit comments