forked from rileycong/Covid_Scraper
-
Notifications
You must be signed in to change notification settings - Fork 1
/
country_info.py
59 lines (43 loc) · 1.75 KB
/
country_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Fetch the Worldometer coronavirus page.
source = requests.get('https://www.worldometers.info/coronavirus/').text
soup = BeautifulSoup(source, 'lxml')
# Pick the table we want to scrape using its id.
covid_table = soup.find("table", attrs={"id": "main_table_countries_today"})
# Extract the header texts: strip newlines and surrounding whitespace.
head = covid_table.thead.find_all('tr')
headings = [th.text.replace('\n', '').strip() for th in head[0].find_all('th')]
# Extract one list of cell texts per body row (row 0 is skipped, matching
# the original scrape — it is a non-country aggregate row).
body = covid_table.tbody.find_all('tr')
data = [
    [td.text.replace('\n', '').strip() for td in tr.find_all('td')]
    for tr in body[1:]
]
# Pass data into a pandas dataframe with headings as the columns.
df = pd.DataFrame(data, columns=headings)
# Rows with a non-empty '#' are countries; rows with an empty '#' are
# aggregates (continent totals etc.) — keep only real countries.
data = df[df['#'] != ''].reset_index(drop=True)
# Drop duplicates to only get today's data.
data = data.drop_duplicates(subset=["Country,Other"])
# Remove the cruise ships by NAME, not by hard-coded positional labels:
# drop(211)/drop(221) silently removes the wrong row (or raises KeyError)
# whenever the site's row order shifts.
data = data[~data['Country,Other'].isin(['Diamond Princess', 'MS Zaandam'])]
# Sort alphabetically, then renumber rows starting from 1.
# drop=True prevents the old index from being injected as a stale
# 'index'/'level_0' column.
data.sort_values('Country,Other', ascending=True, inplace=True)
data.reset_index(drop=True, inplace=True)
data.index = data.index + 1
# Select the country-related columns
def Country():
country_info = data[['Country,Other','Continent']]
return country_info