Covid-19 Data Analysis with Python

Task 1. Import and Manipulate the Data

In [1]:
import warnings
# Suppresses every warning for the rest of the session (no category or module
# filter is given), so library deprecation and runtime warnings will not be
# shown by any later cell.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
# Load the confirmed-cases table; the path is relative to the directory the
# notebook is run from.
confirmed_csv = 'Python_Projects_Datasets/Covid-19 Data Analysis with Python/00-covid19_Confirmed_dataset.csv'
covid = pd.read_csv(confirmed_csv)
covid
Out[2]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 4/21/20 4/22/20 4/23/20 4/24/20 4/25/20 4/26/20 4/27/20 4/28/20 4/29/20 4/30/20
0 NaN Afghanistan 33.000000 65.000000 0 0 0 0 0 0 ... 1092 1176 1279 1351 1463 1531 1703 1828 1939 2171
1 NaN Albania 41.153300 20.168300 0 0 0 0 0 0 ... 609 634 663 678 712 726 736 750 766 773
2 NaN Algeria 28.033900 1.659600 0 0 0 0 0 0 ... 2811 2910 3007 3127 3256 3382 3517 3649 3848 4006
3 NaN Andorra 42.506300 1.521800 0 0 0 0 0 0 ... 717 723 723 731 738 738 743 743 743 745
4 NaN Angola -11.202700 17.873900 0 0 0 0 0 0 ... 24 25 25 25 25 26 27 27 27 27
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
261 NaN Western Sahara 24.215500 -12.885800 0 0 0 0 0 0 ... 6 6 6 6 6 6 6 6 6 6
262 NaN Sao Tome and Principe 0.186360 6.613081 0 0 0 0 0 0 ... 4 4 4 4 4 4 4 8 8 14
263 NaN Yemen 15.552727 48.516388 0 0 0 0 0 0 ... 1 1 1 1 1 1 1 1 6 6
264 NaN Comoros -11.645500 43.333300 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
265 NaN Tajikistan 38.861034 71.276093 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 15

266 rows × 104 columns

Drop useless columns

In [3]:
# Remove the coordinate and province columns; the following cells only use the
# country name and the per-date count columns.
# Rebinding `covid` (rather than inplace=True) together with errors='ignore'
# lets this cell be re-run without a KeyError once the columns are gone.
covid = covid.drop(columns=['Lat', 'Long', 'Province/State'], errors='ignore')
covid
Out[3]:
Country/Region 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 1/28/20 1/29/20 1/30/20 ... 4/21/20 4/22/20 4/23/20 4/24/20 4/25/20 4/26/20 4/27/20 4/28/20 4/29/20 4/30/20
0 Afghanistan 0 0 0 0 0 0 0 0 0 ... 1092 1176 1279 1351 1463 1531 1703 1828 1939 2171
1 Albania 0 0 0 0 0 0 0 0 0 ... 609 634 663 678 712 726 736 750 766 773
2 Algeria 0 0 0 0 0 0 0 0 0 ... 2811 2910 3007 3127 3256 3382 3517 3649 3848 4006
3 Andorra 0 0 0 0 0 0 0 0 0 ... 717 723 723 731 738 738 743 743 743 745
4 Angola 0 0 0 0 0 0 0 0 0 ... 24 25 25 25 25 26 27 27 27 27
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
261 Western Sahara 0 0 0 0 0 0 0 0 0 ... 6 6 6 6 6 6 6 6 6 6
262 Sao Tome and Principe 0 0 0 0 0 0 0 0 0 ... 4 4 4 4 4 4 4 8 8 14
263 Yemen 0 0 0 0 0 0 0 0 0 ... 1 1 1 1 1 1 1 1 6 6
264 Comoros 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
265 Tajikistan 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 15

266 rows × 101 columns

Aggregate by Country

In [4]:
# Rows that share a country name are collapsed into a single row by summing
# their counts column by column; the country name becomes the index.
rows_by_country = covid.groupby(by='Country/Region')
covid_country = rows_by_country.sum()
covid_country
Out[4]:
1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 1/28/20 1/29/20 1/30/20 1/31/20 ... 4/21/20 4/22/20 4/23/20 4/24/20 4/25/20 4/26/20 4/27/20 4/28/20 4/29/20 4/30/20
Country/Region
Afghanistan 0 0 0 0 0 0 0 0 0 0 ... 1092 1176 1279 1351 1463 1531 1703 1828 1939 2171
Albania 0 0 0 0 0 0 0 0 0 0 ... 609 634 663 678 712 726 736 750 766 773
Algeria 0 0 0 0 0 0 0 0 0 0 ... 2811 2910 3007 3127 3256 3382 3517 3649 3848 4006
Andorra 0 0 0 0 0 0 0 0 0 0 ... 717 723 723 731 738 738 743 743 743 745
Angola 0 0 0 0 0 0 0 0 0 0 ... 24 25 25 25 25 26 27 27 27 27
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
West Bank and Gaza 0 0 0 0 0 0 0 0 0 0 ... 466 474 480 484 342 342 342 343 344 344
Western Sahara 0 0 0 0 0 0 0 0 0 0 ... 6 6 6 6 6 6 6 6 6 6
Yemen 0 0 0 0 0 0 0 0 0 0 ... 1 1 1 1 1 1 1 1 6 6
Zambia 0 0 0 0 0 0 0 0 0 0 ... 70 74 76 84 84 88 88 95 97 106
Zimbabwe 0 0 0 0 0 0 0 0 0 0 ... 28 28 28 29 31 31 32 32 32 40

187 rows × 100 columns

Visualize the data for Morocco

In [5]:
# Confirmed-case series for Morocco, one point per column of covid_country.
morocco_confirmed = covid_country.loc['Morocco']
morocco_axes = morocco_confirmed.plot(color='#c1272d')
# The legend entry is taken from the series name ('Morocco').
morocco_axes.legend()
Out[5]:
<matplotlib.legend.Legend at 0x1687812d0>
No description has been provided for this image

Task 2. Finding a Good Measure

Calculating a good measure

In [6]:
# Change between consecutive entries of Morocco's series; the first entry is
# NaN because it has no predecessor.
morocco_daily_change = covid_country.loc['Morocco'].diff()
morocco_daily_change.plot(color='#c1272d')
Out[6]:
<Axes: >
No description has been provided for this image

Find max infection rate in Morocco

In [7]:
# Largest single-step increase in Morocco's series.
morocco_daily_change = covid_country.loc['Morocco'].diff()
morocco_daily_change.max()
Out[7]:
281.0

Find max infection rate in all Countries

In [8]:
# Largest single-step increase for every country, computed in one vectorised
# pass instead of a Python loop over rows: diff(axis=1) subtracts each column
# from the next one along a row, and max(axis=1) takes the row maximum while
# skipping the leading NaN that diff produces.
# A previously added 'Max infection rate' column is excluded so the result is
# the same if this cell is run again after that column has been created.
countries = list(covid_country.index)
date_counts = covid_country.drop(columns='Max infection rate', errors='ignore')
max_infection_rates = date_counts.diff(axis=1).max(axis=1).tolist()
max_infection_rates
Out[8]:
[232.0,
 34.0,
 199.0,
 43.0,
 5.0,
 6.0,
 291.0,
 134.0,
 497.0,
 1321.0,
 105.0,
 7.0,
 301.0,
 641.0,
 12.0,
 1485.0,
 2454.0,
 4.0,
 19.0,
 1.0,
 104.0,
 92.0,
 7.0,
 7502.0,
 26.0,
 137.0,
 41.0,
 21.0,
 6.0,
 45.0,
 31.0,
 203.0,
 2778.0,
 31.0,
 21.0,
 1138.0,
 15136.0,
 353.0,
 1.0,
 57.0,
 81.0,
 37.0,
 113.0,
 96.0,
 63.0,
 58.0,
 381.0,
 391.0,
 99.0,
 156.0,
 5.0,
 371.0,
 11536.0,
 269.0,
 32.0,
 130.0,
 7.0,
 134.0,
 20.0,
 9.0,
 5.0,
 267.0,
 26849.0,
 38.0,
 5.0,
 42.0,
 6933.0,
 403.0,
 156.0,
 6.0,
 68.0,
 167.0,
 132.0,
 12.0,
 10.0,
 3.0,
 72.0,
 210.0,
 99.0,
 1893.0,
 436.0,
 3186.0,
 91.0,
 1515.0,
 1131.0,
 6557.0,
 52.0,
 1161.0,
 40.0,
 264.0,
 29.0,
 851.0,
 289.0,
 300.0,
 69.0,
 3.0,
 48.0,
 61.0,
 17.0,
 13.0,
 21.0,
 90.0,
 234.0,
 7.0,
 14.0,
 10.0,
 235.0,
 190.0,
 58.0,
 52.0,
 2.0,
 41.0,
 1425.0,
 222.0,
 12.0,
 13.0,
 30.0,
 281.0,
 19.0,
 3.0,
 14.0,
 1346.0,
 89.0,
 2.0,
 69.0,
 208.0,
 107.0,
 386.0,
 144.0,
 1292.0,
 357.0,
 5.0,
 27.0,
 3683.0,
 538.0,
 545.0,
 1516.0,
 957.0,
 523.0,
 7099.0,
 22.0,
 5.0,
 6.0,
 4.0,
 54.0,
 6.0,
 1351.0,
 87.0,
 2379.0,
 2.0,
 20.0,
 1426.0,
 114.0,
 70.0,
 73.0,
 354.0,
 28.0,
 9630.0,
 65.0,
 67.0,
 3.0,
 812.0,
 1321.0,
 6.0,
 27.0,
 15.0,
 181.0,
 188.0,
 10.0,
 14.0,
 40.0,
 82.0,
 5138.0,
 36188.0,
 11.0,
 578.0,
 552.0,
 8733.0,
 48.0,
 167.0,
 29.0,
 19.0,
 66.0,
 4.0,
 5.0,
 9.0,
 8.0]
In [9]:
# Compute the per-country maximum step increase from the date columns only.
# Dropping an already-present 'Max infection rate' label keeps this cell
# idempotent: without it, a re-run would difference the derived column as if
# it were one more date.
countries = list(covid_country.index)
date_columns = covid_country.columns.drop('Max infection rate', errors='ignore')

# diff(axis=1) differences along each row; max(axis=1) skips the leading NaN.
max_infection_rates = covid_country[date_columns].diff(axis=1).max(axis=1).tolist()

# The list is in index order, so it lines up with the rows of covid_country.
covid_country['Max infection rate'] = max_infection_rates
covid_country
Out[9]:
1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 1/28/20 1/29/20 1/30/20 1/31/20 ... 4/22/20 4/23/20 4/24/20 4/25/20 4/26/20 4/27/20 4/28/20 4/29/20 4/30/20 Max infection rate
Country/Region
Afghanistan 0 0 0 0 0 0 0 0 0 0 ... 1176 1279 1351 1463 1531 1703 1828 1939 2171 232.0
Albania 0 0 0 0 0 0 0 0 0 0 ... 634 663 678 712 726 736 750 766 773 34.0
Algeria 0 0 0 0 0 0 0 0 0 0 ... 2910 3007 3127 3256 3382 3517 3649 3848 4006 199.0
Andorra 0 0 0 0 0 0 0 0 0 0 ... 723 723 731 738 738 743 743 743 745 43.0
Angola 0 0 0 0 0 0 0 0 0 0 ... 25 25 25 25 26 27 27 27 27 5.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
West Bank and Gaza 0 0 0 0 0 0 0 0 0 0 ... 474 480 484 342 342 342 343 344 344 66.0
Western Sahara 0 0 0 0 0 0 0 0 0 0 ... 6 6 6 6 6 6 6 6 6 4.0
Yemen 0 0 0 0 0 0 0 0 0 0 ... 1 1 1 1 1 1 1 6 6 5.0
Zambia 0 0 0 0 0 0 0 0 0 0 ... 74 76 84 84 88 88 95 97 106 9.0
Zimbabwe 0 0 0 0 0 0 0 0 0 0 ... 28 28 29 31 31 32 32 32 40 8.0

187 rows × 101 columns

Create a DataFrame with only the columns needed

In [10]:
# Keep only the derived column, as a one-column DataFrame that retains the
# country index.
covid_country_max = covid_country['Max infection rate'].to_frame()
covid_country_max
Out[10]:
Max infection rate
Country/Region
Afghanistan 232.0
Albania 34.0
Algeria 199.0
Andorra 43.0
Angola 5.0
... ...
West Bank and Gaza 66.0
Western Sahara 4.0
Yemen 5.0
Zambia 9.0
Zimbabwe 8.0

187 rows × 1 columns

Task 3. Importing and Preparing Happiness Data

Importing happiness dataset

In [11]:
# Load the happiness report table; the path is relative to the directory the
# notebook is run from.
happiness_csv = 'Python_Projects_Datasets/Covid-19 Data Analysis with Python/00-worldwide_happiness_report.csv'
happiness = pd.read_csv(happiness_csv)
happiness
Out[11]:
Overall rank Country or region Score GDP per capita Social support Healthy life expectancy Freedom to make life choices Generosity Perceptions of corruption
0 1 Finland 7.769 1.340 1.587 0.986 0.596 0.153 0.393
1 2 Denmark 7.600 1.383 1.573 0.996 0.592 0.252 0.410
2 3 Norway 7.554 1.488 1.582 1.028 0.603 0.271 0.341
3 4 Iceland 7.494 1.380 1.624 1.026 0.591 0.354 0.118
4 5 Netherlands 7.488 1.396 1.522 0.999 0.557 0.322 0.298
... ... ... ... ... ... ... ... ... ...
151 152 Rwanda 3.334 0.359 0.711 0.614 0.555 0.217 0.411
152 153 Tanzania 3.231 0.476 0.885 0.499 0.417 0.276 0.147
153 154 Afghanistan 3.203 0.350 0.517 0.361 0.000 0.158 0.025
154 155 Central African Republic 3.083 0.026 0.000 0.105 0.225 0.235 0.035
155 156 South Sudan 2.853 0.306 0.575 0.295 0.010 0.202 0.091

156 rows × 9 columns

Dropping useless columns

In [12]:
# Remove the columns that the later cells do not use.
# Rebinding `happiness` (rather than inplace=True) together with
# errors='ignore' lets this cell be re-run without a KeyError once the columns
# are gone.
happiness = happiness.drop(
    columns=['Overall rank', 'Score', 'Generosity', 'Perceptions of corruption'],
    errors='ignore',
)
happiness
Out[12]:
Country or region GDP per capita Social support Healthy life expectancy Freedom to make life choices
0 Finland 1.340 1.587 0.986 0.596
1 Denmark 1.383 1.573 0.996 0.592
2 Norway 1.488 1.582 1.028 0.603
3 Iceland 1.380 1.624 1.026 0.591
4 Netherlands 1.396 1.522 0.999 0.557
... ... ... ... ... ...
151 Rwanda 0.359 0.711 0.614 0.555
152 Tanzania 0.476 0.885 0.499 0.417
153 Afghanistan 0.350 0.517 0.361 0.000
154 Central African Republic 0.026 0.000 0.105 0.225
155 South Sudan 0.306 0.575 0.295 0.010

156 rows × 5 columns

Changing the index

In [13]:
# Index the frame by country name so it can be joined on its index.
# The guard makes the cell safe to re-run: after the first run the column has
# already become the index, and calling set_index again would raise KeyError.
if 'Country or region' in happiness.columns:
    happiness = happiness.set_index('Country or region')
happiness
Out[13]:
GDP per capita Social support Healthy life expectancy Freedom to make life choices
Country or region
Finland 1.340 1.587 0.986 0.596
Denmark 1.383 1.573 0.996 0.592
Norway 1.488 1.582 1.028 0.603
Iceland 1.380 1.624 1.026 0.591
Netherlands 1.396 1.522 0.999 0.557
... ... ... ... ...
Rwanda 0.359 0.711 0.614 0.555
Tanzania 0.476 0.885 0.499 0.417
Afghanistan 0.350 0.517 0.361 0.000
Central African Republic 0.026 0.000 0.105 0.225
South Sudan 0.306 0.575 0.295 0.010

156 rows × 4 columns

Joining DataFrames

In [14]:
# Inner join on the index (country name): only names present in both frames
# are kept.
# NOTE(review): the happiness frame has 156 rows but the result has 143, so 13
# of its countries have no identically spelled entry in the Covid index and
# are silently dropped - presumably naming differences between the two
# sources; confirm which ones and map the names if those countries matter.
joined=covid_country_max.join(happiness, how='inner')
joined
Out[14]:
Max infection rate GDP per capita Social support Healthy life expectancy Freedom to make life choices
Afghanistan 232.0 0.350 0.517 0.361 0.000
Albania 34.0 0.947 0.848 0.874 0.383
Algeria 199.0 1.002 1.160 0.785 0.086
Argentina 291.0 1.092 1.432 0.881 0.471
Armenia 134.0 0.850 1.055 0.815 0.283
... ... ... ... ... ...
Venezuela 29.0 0.960 1.427 0.805 0.154
Vietnam 19.0 0.741 1.346 0.851 0.543
Yemen 5.0 0.287 1.163 0.463 0.143
Zambia 9.0 0.578 1.058 0.426 0.431
Zimbabwe 8.0 0.366 1.114 0.433 0.361

143 rows × 5 columns

Correlation Matrix

In [15]:
# Pairwise Pearson correlation (the pandas default, spelled out here) between
# the columns of the joined frame.
correlation_matrix = joined.corr(method='pearson')
correlation_matrix
Out[15]:
Max infection rate GDP per capita Social support Healthy life expectancy Freedom to make life choices
Max infection rate 1.000000 0.250118 0.191958 0.289263 0.078196
GDP per capita 0.250118 1.000000 0.759468 0.863062 0.394603
Social support 0.191958 0.759468 1.000000 0.765286 0.456246
Healthy life expectancy 0.289263 0.863062 0.765286 1.000000 0.427892
Freedom to make life choices 0.078196 0.394603 0.456246 0.427892 1.000000

Task 4. Visualizing the Results

Plotting GDP per Capita vs Max Infection Rate

In [16]:
# Unscaled relationship between GDP per capita and the maximum infection rate.
sns.scatterplot(
    x='GDP per capita',
    y='Max infection rate',
    data=joined,
    color='#c1272d',
)
Out[16]:
<Axes: xlabel='GDP per capita', ylabel='Max infection rate'>
No description has been provided for this image

Log Scaling to get a better understanding of the data

In [17]:
# Natural log of the maximum infection rate. np.log keeps the original series
# name, which seaborn then uses as the y-axis label, so the axis would read
# 'Max infection rate' although the values are log-scaled; renaming the series
# makes the label match the data.
# NOTE: np.log maps a rate of 0 to -inf; confirm no joined country has a zero rate.
log_max_infection_rate = np.log(joined['Max infection rate']).rename('log(Max infection rate)')
sns.scatterplot(data=joined, x='GDP per capita', y=log_max_infection_rate, color='#c1272d')
Out[17]:
<Axes: xlabel='GDP per capita', ylabel='Max infection rate'>
No description has been provided for this image

Plotting Trendline

In [18]:
# Scatter plus fitted regression line of log(max infection rate) on GDP per
# capita. The log series is renamed because seaborn takes the y-axis label
# from the series name, and np.log would otherwise leave it as
# 'Max infection rate' even though the plotted values are log-scaled.
log_max_infection_rate = np.log(joined['Max infection rate']).rename('log(Max infection rate)')
sns.regplot(data=joined, x='GDP per capita', y=log_max_infection_rate, color='#c1272d')
Out[18]:
<Axes: xlabel='GDP per capita', ylabel='Max infection rate'>
No description has been provided for this image

Plotting Trendline for GDP per capita vs Max Infection Rate, Social support vs Max Infection Rate, and Healthy life expectancy vs Max Infection Rate

In [19]:
# Log-scaled maximum infection rate, computed once and renamed so that seaborn
# labels each y-axis as log-scaled rather than as the raw rate.
log_max_infection_rate = np.log(joined['Max infection rate']).rename('log(Max infection rate)')

# One panel per predictor column, each with its own colour; the column name
# doubles as the panel title.
panels = [
    ('GDP per capita', '#c1272d'),
    ('Social support', '#006233'),
    ('Healthy life expectancy', '#333333'),
]

fig, axes = plt.subplots(len(panels), 1, figsize=(6, 14))

for ax, (predictor, colour) in zip(axes, panels):
    sns.regplot(data=joined, x=predictor, y=log_max_infection_rate, ax=ax, color=colour)
    ax.set_title(predictor)

plt.tight_layout()
plt.show()
No description has been provided for this image