|
| 1 | +################################################# |
| 2 | +### 5. ANALYSIS OF POLLUTION LEVELS ### |
| 3 | +### IN EU COUNTRIES IN 2020 BASED ON ### |
| 4 | +### THE SCRAPED DATA ### |
| 5 | +################################################# |
| 6 | + |
| 7 | +# Authors of Code: Lasha Gochiashvili, Noam Shmuel |
| 8 | +# & Jorge Bueno Perez |
| 9 | + |
| 10 | +# Load main packages and libraries |
| 11 | +import numpy as np |
| 12 | +import seaborn as sns |
| 13 | +import matplotlib.pyplot as plt |
| 14 | +import pandas as pd |
| 15 | +import datetime |
| 16 | +import time |
| 17 | +from time import gmtime, strftime |
| 18 | +pd.set_option("display.precision", 2) |
| 19 | + |
| 20 | +''' |
| 21 | +In this code we will analyze the data that we generated in the previous stages. |
| 22 | +We want to analyze the pollution levels of EU countries in 2020 based on the |
| 23 | +data that we collected while scraping the website: www.openaq.org |
| 24 | + |
| 25 | +Firstly, we will load the results of the scraping as a Data Frame. Then we will |
| 26 | +apply data manipulation to clean the data. In particular: |
| 27 | +-- removing unnecessary columns |
| 28 | +-- removing records that have PM 2.5 as 0 or none |
| 29 | +-- renaming columns |
| 30 | +-- removing data of non-EU countries |
| 31 | + |
| 32 | +Then we will create a boxplot with the Seaborn library. |
| 33 | +And finally we will save the boxplot as a .png file. |
| 34 | +''' |
| 35 | + |
| 36 | +# Loading results of scrapping as a Data Frame |
| 37 | +df = pd.read_csv('4Full_Details.csv') |
| 38 | +df = df.dropna() |
| 39 | +time.sleep(2) |
| 40 | + |
| 41 | +# Removing unnecessary columns from the table |
| 42 | +df.drop(['card_url', 'general', 'city', 'hour', 'country_link'], axis = 1, inplace = True) |
| 43 | + |
| 44 | +time.sleep(2) |
| 45 | + |
| 46 | +# Applying filters to have only records with none zero or non empty |
| 47 | +# records on PM2.5 fields. |
| 48 | +df = df[df['PM2.5'] > 0] |
| 49 | + |
| 50 | +# Leaving only 2020 data |
| 51 | +df = df[df['date'] > '2020年01月01日'] |
| 52 | + |
| 53 | +# Renaming columns to make it easily readable |
| 54 | +df.rename(columns={'country':'Country','PM2.5':'Pollution'}, inplace=True) |
| 55 | + |
| 56 | +# Filtering to leave data only for EU countries |
| 57 | +eu_countries = ["Austria", "Belgium", "Czech Republic", "Denmark", "Estonia", "Finland", |
| 58 | + "France", "Germany", "Greece", "Hungary", "Iceland", "Italy", "Latvia", |
| 59 | + "Liechtenstein", "Lithuania", "Luxembourg", "Malta", "Netherlands", |
| 60 | + "Norway", "Poland", "Slovakia", "Portugal", "Slovenia", "Spain", "Sweden", "Switzerland"] |
| 61 | + |
| 62 | +df_eu = df[df['Country'].isin(eu_countries)] |
| 63 | + |
| 64 | +# Creating boxplot from Seaborn library |
| 65 | +sns.set(style='ticks', palette='muted', color_codes=True) |
| 66 | +plt.figure(figsize=(18, 12)) |
| 67 | +ax = sns.boxplot(x ='Pollution', y = 'Country', data = df_eu, color = "c") |
| 68 | +ax.set_title("Pollution level in EU Countries in 2020", fontsize=30) |
| 69 | +sns_plot = sns.stripplot(x = 'Pollution', y = "Country", data=df_eu, jitter=False, size=5, color='.3', linewidth=1) |
| 70 | +ax.set_xlabel("Pollution (PM 2.5)",fontsize=15) |
| 71 | +ax.set_ylabel("EU Countries",fontsize=15) |
| 72 | +sns.despine(trim=True) |
| 73 | + |
| 74 | +time.sleep(2) |
| 75 | + |
| 76 | +# Settings for exporting the boxplot as .png file |
| 77 | +time = strftime("%Y-%m-%d %H.%M", gmtime()) |
| 78 | +fig = sns_plot.get_figure() |
| 79 | +fig.savefig("5pollution_european_countries." + time + ".png") |
0 commit comments