Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings
This repository was archived by the owner on Apr 17, 2023. It is now read-only.

Commit 3a1bd0d

Browse files
Add files via upload
1 parent fc9bd99 commit 3a1bd0d

File tree

1 file changed

+90
-0
lines changed

1 file changed

+90
-0
lines changed

‎Week3/scrapping.py‎

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
from numpy.core.defchararray import isdigit
2+
from selenium import webdriver
3+
from selenium.webdriver.common.by import By
4+
from webdriver_manager.chrome import ChromeDriverManager
5+
import pandas as pd
6+
7+
# import time
8+
9+
# Resolve the chromedriver executable through webdriver_manager, then start
# the Chrome session that the rest of the script drives.
# NOTE(review): the path is passed positionally, as before; confirm that the
# installed Selenium version still accepts a driver path in that position.
_driver_path = ChromeDriverManager().install()
browser = webdriver.Chrome(_driver_path)
10+
11+
12+
def contains_digits(temp):
    """Return True if at least one character of ``temp`` is a digit.

    Args:
        temp: The string to inspect.

    Returns:
        True when any character satisfies ``str.isdigit``; False otherwise,
        including for an empty string.
    """
    # str.isdigit gives the same per-character answer as the NumPy
    # char-array helper, without building an array for every character.
    return any(ch.isdigit() for ch in temp)
17+
18+
19+
# Scrape the COVID-19 table from the daily mai.gov.ro bulletin for a range of
# January days and write every collected row to a single CSV file.

FIRST_DAY = 20            # first day-of-month fetched (inclusive)
LAST_DAY = 27             # last day-of-month fetched (inclusive)
DATA_ROWS = slice(1, 43)  # lines of the table text kept per page (line 0 is dropped)
HEADER_COUNT = 5          # number of leading table cells used as column titles
PAGE_URL = (
    "https://www.mai.gov.ro/"
    "informare-covid-19-grupul-de-comunicare-strategica-{day}-ianuarie-ora-13-00/"
)
OUTPUT_CSV = 'ALL_DATA_GOV.csv'


def _parse_row(line):
    """Split one line of table text into its output fields.

    The first space-separated token is kept unchanged. The tokens that follow
    it, up to but not including the first token containing a digit, are merged
    into a single name field. Every remaining token becomes its own field.

    Args:
        line: One line of the table's text.

    Returns:
        A list ``[first_token, name, *remaining_tokens]``.
    """
    tokens = line.split(' ')
    first_numeric = 1
    # The bound stops the scan from running past the end when no token
    # contains a digit.
    while first_numeric < len(tokens) and not contains_digits(tokens[first_numeric]):
        first_numeric += 1
    # NOTE(review): name parts are joined without a separator, so a multi-word
    # name comes out as one run-together word. Kept as-is so the CSV contents
    # do not change; confirm whether a space was intended.
    name = "".join(tokens[1:first_numeric])
    return [tokens[0], name, *tokens[first_numeric:]]


def _read_headers(count):
    """Return the text of the first ``count`` header cells of the current page."""
    return [
        browser.find_element(by=By.XPATH, value=f'//table//td[{i + 1}]').text
        for i in range(count)
    ]


headers = None  # column titles, taken from the first page that is loaded
columns = {}    # column title -> list of values accumulated over all pages
try:
    for day in range(FIRST_DAY, LAST_DAY + 1):
        browser.get(PAGE_URL.format(day=day))
        table = browser.find_element(by=By.XPATH, value="//table")

        if headers is None:
            # Titles are read once and reused for every page, so a page whose
            # header text differs cannot cause a KeyError further down.
            headers = _read_headers(HEADER_COUNT)
            columns = {title: [] for title in headers}

        for line in table.text.split('\n')[DATA_ROWS]:
            row = _parse_row(line)
            if len(row) < len(headers):
                raise ValueError(
                    f"day {day}: expected at least {len(headers)} fields, "
                    f"got {len(row)} in row {line!r}"
                )
            # Fields beyond the last header are ignored.
            for title, value in zip(headers, row):
                columns[title].append(value)
finally:
    # quit() ends the whole driver session, and runs even when a page fails.
    browser.quit()

df = pd.DataFrame(columns)
df.to_csv(OUTPUT_CSV)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /