Skip to content
This repository was archived by the owner on Apr 17, 2023. It is now read-only.

Commit 3a1bd0d

Browse files
committed
Add files via upload
1 parent fc9bd99 commit 3a1bd0d

File tree

1 file changed

+90
-0
lines changed

1 file changed

+90
-0
lines changed

Week3/scrapping.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
from numpy.core.defchararray import isdigit
2+
from selenium import webdriver
3+
from selenium.webdriver.common.by import By
4+
from webdriver_manager.chrome import ChromeDriverManager
5+
import pandas as pd
6+
7+
# import time
8+
9+
# Start the Chrome session shared by the whole script. The argument is the
# value returned by ChromeDriverManager().install() (presumably the path of
# the chromedriver binary - confirm against the webdriver_manager docs).
# NOTE(review): the value is passed positionally; newer Selenium 4 releases
# expect `service=Service(path)` instead - confirm against the installed
# Selenium version before upgrading.
browser = webdriver.Chrome(ChromeDriverManager().install())
10+
11+
12+
def contains_digits(temp):
    """Return True if at least one character of *temp* is a digit.

    Args:
        temp: The string to inspect.

    Returns:
        True when any character satisfies ``str.isdigit``, otherwise False
        (including for an empty string).
    """
    # The built-in str.isdigit gives the same per-character answer as the
    # NumPy char helper without building an array for every character.
    return any(ch.isdigit() for ch in temp)
17+
18+
19+
# Visit the mai.gov.ro COVID-19 bulletin for each day from 20 to 27 January,
# read the first <table> on the page and append its rows to one accumulator
# that is finally written to ALL_DATA_GOV.csv.

FIRST_ROW = 1      # line 0 of the table text is skipped (presumably the header row)
END_ROW = 43       # exclusive bound: lines 1..42 of the table text are kept
COLUMN_COUNT = 5   # number of leading cells kept from every row

columns = []  # header texts read from the first page, reused for every day
data = {}     # header text -> list of cell values accumulated over all days

try:
    for day in range(20, 28):
        browser.get(f"https://www.mai.gov.ro/informare-covid-19-grupul-de-comunicare-strategica-{day}-ianuarie-ora-13-00/")
        table = browser.find_element(by=By.XPATH, value="//table")

        lines = table.text.split('\n')[FIRST_ROW:END_ROW]

        rows = []
        for line in lines:
            tokens = line.split(' ')

            # tokens[0] is the first cell; every following token up to the
            # first one containing a digit belongs to the (possibly
            # multi-word) name cell.
            name_end = 1
            while not contains_digits(tokens[name_end]):
                name_end += 1

            # NOTE(review): the name words are concatenated without a
            # separator, exactly as before ("Satu Mare" -> "SatuMare");
            # confirm whether consumers of the CSV rely on that.
            name = "".join(tokens[1:name_end])

            rows.append([tokens[0], name, *tokens[name_end:]])

        # The header cells are read only once: the first page defines the
        # column names, so a later page with slightly different header text
        # can no longer raise KeyError.
        if not columns:
            columns = [
                browser.find_element(by=By.XPATH, value=f'//table//td[{i + 1}]').text
                for i in range(COLUMN_COUNT)
            ]
            data = {title: [] for title in columns}

        for row in rows:
            for position, title in enumerate(columns):
                data[title].append(row[position])

    df = pd.DataFrame(data)
    df.to_csv('ALL_DATA_GOV.csv')
finally:
    # quit() ends the whole driver session, and running it in `finally`
    # releases the browser even when a page fails to load or parse.
    browser.quit()

0 commit comments

Comments
 (0)