import warnings
warnings.filterwarnings("ignore")

import pandas as pd

df=pd.read_csv('/Users/mekki/Python_Projects_Datasets/Practical Data Wrangling with Pandas/00-Life_Expectancy_Data.csv')
df.head()

df.shape

(2938, 21)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Year                             2938 non-null   int64  
 1   Status                           2938 non-null   object 
 2   Life expectancy                  2928 non-null   float64
 3   Adult Mortality                  2928 non-null   float64
 4   infant deaths                    2938 non-null   int64  
 5   Alcohol                          2744 non-null   float64
 6   percentage expenditure           2938 non-null   float64
 7   Hepatitis B                      2385 non-null   float64
 8   Measles                          2938 non-null   int64  
 9    BMI                             2904 non-null   float64
 10  under-five deaths                2938 non-null   int64  
 11  Polio                            2919 non-null   float64
 12  Total expenditure                2712 non-null   float64
 13  Diphtheria                       2919 non-null   float64
 14   HIV/AIDS                        2938 non-null   float64
 15  GDP                              2490 non-null   float64
 16  Population                       2286 non-null   float64
 17   thinness  1-19 years            2904 non-null   float64
 18   thinness 5-9 years              2904 non-null   float64
 19  Income composition of resources  2771 non-null   float64
 20  Schooling                        2775 non-null   float64
dtypes: float64(16), int64(4), object(1)
memory usage: 482.1+ KB

df.isnull().sum().sum()

2563

df.isnull().sum()

Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

df.index

RangeIndex(start=0, stop=2938, step=1)

df.dtypes.index

Index(['Year', 'Status', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')

type(df)

pandas.core.frame.DataFrame

round(df.describe(),2)

nc = df.select_dtypes(include=['number'])
round(nc.mean())

Year                                   2008.0
Life expectancy                          69.0
Adult Mortality                         165.0
infant deaths                            30.0
Alcohol                                   5.0
percentage expenditure                  738.0
Hepatitis B                              81.0
Measles                                2420.0
 BMI                                     38.0
under-five deaths                        42.0
Polio                                    83.0
Total expenditure                         6.0
Diphtheria                               82.0
 HIV/AIDS                                 2.0
GDP                                    7483.0
Population                         12753375.0
 thinness  1-19 years                     5.0
 thinness 5-9 years                       5.0
Income composition of resources           1.0
Schooling                                12.0
dtype: float64

round(df['GDP'].describe(),2)

count      2490.00
mean       7483.16
std       14270.17
min           1.68
25%         463.94
50%        1766.95
75%        5910.81
max      119172.74
Name: GDP, dtype: float64

round(df['GDP'].describe().loc[['mean','max','min']],2)

mean      7483.16
max     119172.74
min          1.68
Name: GDP, dtype: float64

import seaborn as sns

df

df.isnull()

df.isnull().sum()

Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

df.isnull().sum().sum()

2563

#Build a custom five-colour palette (purple tones) and wrap it in a colormap
from matplotlib.colors import ListedColormap
custom_colors = ['#675A6E', '#9A87A5', '#CDB4DB', '#E6DAED', '#F3EDF6']
custom_cmap = ListedColormap(custom_colors)
#Plot the boolean null mask of df as a heatmap to see where values are missing;
#row tick labels are hidden and the colour bar is shown
sns.heatmap(df.isnull(),yticklabels=False, cbar=True,cmap=custom_cmap)

<Axes: >

#Drop every row that contains at least one null value
df.dropna(how='any',inplace=True)
df

df.isnull().sum().sum()

0

#Reloading the df
df=pd.read_csv('/Users/mekki/Python_Projects_Datasets/Practical Data Wrangling with Pandas/00-Life_Expectancy_Data.csv')

#Finding number of total null values
df.isnull().sum().sum()

2563

#Finding number of null values in GDP
#(the null mask of a single column is a Series, so one .sum() already gives the scalar count)
df['GDP'].isnull().sum()

448

#Finding GDP mean
df['GDP'].mean()

7483.158469138474

#Replace the missing GDP values with the column mean.
#The result is assigned back to the column instead of calling
#fillna(inplace=True) on df['GDP']: that in-place call operates on an
#intermediate Series, is deprecated by pandas, and does not update df
#when Copy-on-Write is enabled.
df['GDP'] = df['GDP'].fillna(df['GDP'].mean())

#Check that no null values remain in GDP
df['GDP'].isnull().sum()

0

#Plotting heatmap to check GDP column
sns.heatmap(df.isnull(),yticklabels=False, cbar=True,cmap="Reds")

<Axes: >

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Year                             2938 non-null   int64  
 1   Status                           2938 non-null   object 
 2   Life expectancy                  2928 non-null   float64
 3   Adult Mortality                  2928 non-null   float64
 4   infant deaths                    2938 non-null   int64  
 5   Alcohol                          2744 non-null   float64
 6   percentage expenditure           2938 non-null   float64
 7   Hepatitis B                      2385 non-null   float64
 8   Measles                          2938 non-null   int64  
 9    BMI                             2904 non-null   float64
 10  under-five deaths                2938 non-null   int64  
 11  Polio                            2919 non-null   float64
 12  Total expenditure                2712 non-null   float64
 13  Diphtheria                       2919 non-null   float64
 14   HIV/AIDS                        2938 non-null   float64
 15  GDP                              2938 non-null   float64
 16  Population                       2286 non-null   float64
 17   thinness  1-19 years            2904 non-null   float64
 18   thinness 5-9 years              2904 non-null   float64
 19  Income composition of resources  2771 non-null   float64
 20  Schooling                        2775 non-null   float64
dtypes: float64(16), int64(4), object(1)
memory usage: 482.1+ KB

df['Total expenditure'].median()

5.755

df['Total expenditure'].isnull().sum()

226

#Replace the missing Total expenditure values with the column median.
#The result is assigned back to the column instead of calling
#fillna(inplace=True) on df['Total expenditure']: that in-place call operates
#on an intermediate Series, is deprecated by pandas, and does not update df
#when Copy-on-Write is enabled.
df['Total expenditure'] = df['Total expenditure'].fillna(df['Total expenditure'].median())

df['Total expenditure'].isnull().sum()

0

#Convert categorical data to numerical data

df

df['Status'].unique()

array(['Developing', 'Developed'], dtype=object)

Status_Encoded = pd.get_dummies(df['Status']).astype(int)

Status_Encoded

#Checking for Developed Countries
Status_Encoded.query('Developed == 1').head(5)

#We need to perform Feature Scaling

from sklearn.preprocessing import MinMaxScaler

#Reloading the df
df=pd.read_csv('/Users/mekki/Python_Projects_Datasets/Practical Data Wrangling with Pandas/00-Life_Expectancy_Data.csv')

df['Life expectancy '].values

array([65. , 59.9, 59.9, ..., 44.8, 45.3, 46. ])

scaler=MinMaxScaler()
df['Life expectancy ']=scaler.fit_transform(df['Life expectancy '].values.reshape(-1,1))

df['Life expectancy '].values

array([0.54459203, 0.44781784, 0.44781784, ..., 0.16129032, 0.17077799,
       0.18406072])

round(df['Life expectancy '].describe())

count    2928.0
mean        1.0
std         0.0
min         0.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
Name: Life expectancy , dtype: float64

#Standardization: rescale the column to a mean of 0 and a standard deviation of 1

from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()

#Reloading the df
df=pd.read_csv('/Users/mekki/Python_Projects_Datasets/Practical Data Wrangling with Pandas/00-Life_Expectancy_Data.csv')

df['Life expectancy '].values

array([65. , 59.9, 59.9, ..., 44.8, 45.3, 46. ])

df['Life expectancy ']=scaler.fit_transform(df['Life expectancy '].values.reshape(-1,1))

df['Life expectancy '].values

array([-0.4436909 , -0.97927911, -0.97927911, ..., -2.56504028,
       -2.51253163, -2.43901952])

round(df['Life expectancy '].describe())

count    2928.0
mean       -0.0
std         1.0
min        -3.0
25%        -1.0
50%         0.0
75%         1.0
max         2.0
Name: Life expectancy , dtype: float64

#Reloading the df
df=pd.read_csv('/Users/mekki/Python_Projects_Datasets/Practical Data Wrangling with Pandas/00-Life_Expectancy_Data.csv')

df

#Helper applied element-wise to the 'percentage expenditure' column
def percentage_expenditure_update(balance):
    """Return ``balance`` increased by a fixed offset of 5.

    Args:
        balance: The value to shift; anything that supports ``+ 5``.

    Returns:
        The result of ``balance + 5``.
    """
    offset = 5
    updated = balance + offset
    return updated

df['percentage expenditure']=df['percentage expenditure'].apply(percentage_expenditure_update)
df

	Year	Status	Life expectancy	Adult Mortality	infant deaths	Alcohol	percentage expenditure	Hepatitis B	Measles	BMI	...	Polio	Total expenditure	Diphtheria	HIV/AIDS	GDP	Population	thinness 1-19 years	thinness 5-9 years	Income composition of resources	Schooling
0	2015	Developing	65.0	263.0	62	0.01	71.279624	65.0	1154	19.1	...	6.0	8.16	65.0	0.1	584.259210	33736494.0	17.2	17.3	0.479	10.1
1	2014	Developing	59.9	271.0	64	0.01	73.523582	62.0	492	18.6	...	58.0	8.18	62.0	0.1	612.696514	327582.0	17.5	17.5	0.476	10.0
2	2013	Developing	59.9	268.0	66	0.01	73.219243	64.0	430	18.1	...	62.0	8.13	64.0	0.1	631.744976	31731688.0	17.7	17.7	0.470	9.9
3	2012	Developing	59.5	272.0	69	0.01	78.184215	67.0	2787	17.6	...	67.0	8.52	67.0	0.1	669.959000	3696958.0	17.9	18.0	0.463	9.8
4	2011	Developing	59.2	275.0	71	0.01	7.097109	68.0	3013	17.2	...	68.0	7.87	68.0	0.1	63.537231	2978599.0	18.2	18.2	0.454	9.5

	Year	Life expectancy	Adult Mortality	infant deaths	Alcohol	percentage expenditure	Hepatitis B	Measles	BMI	under-five deaths	Polio	Total expenditure	Diphtheria	HIV/AIDS	GDP	Population	thinness 1-19 years	thinness 5-9 years	Income composition of resources	Schooling
count	2938.00	2928.00	2928.00	2938.00	2744.00	2938.00	2385.00	2938.00	2904.00	2938.00	2919.00	2712.00	2919.00	2938.00	2490.00	2.286000e+03	2904.00	2904.00	2771.00	2775.00
mean	2007.52	69.22	164.80	30.30	4.60	738.25	80.94	2419.59	38.32	42.04	82.55	5.94	82.32	1.74	7483.16	1.275338e+07	4.84	4.87	0.63	11.99
std	4.61	9.52	124.29	117.93	4.05	1987.91	25.07	11467.27	20.04	160.45	23.43	2.50	23.72	5.08	14270.17	6.101210e+07	4.42	4.51	0.21	3.36
min	2000.00	36.30	1.00	0.00	0.01	0.00	1.00	0.00	1.00	0.00	3.00	0.37	2.00	0.10	1.68	3.400000e+01	0.10	0.10	0.00	0.00
25%	2004.00	63.10	74.00	0.00	0.88	4.69	77.00	0.00	19.30	0.00	78.00	4.26	78.00	0.10	463.94	1.957932e+05	1.60	1.50	0.49	10.10
50%	2008.00	72.10	144.00	3.00	3.76	64.91	92.00	17.00	43.50	4.00	93.00	5.76	93.00	0.10	1766.95	1.386542e+06	3.30	3.30	0.68	12.30
75%	2012.00	75.70	228.00	22.00	7.70	441.53	97.00	360.25	56.20	28.00	97.00	7.49	97.00	0.80	5910.81	7.420359e+06	7.20	7.20	0.78	14.30
max	2015.00	89.00	723.00	1800.00	17.87	19479.91	99.00	212183.00	87.30	2500.00	99.00	17.60	99.00	50.60	119172.74	1.293859e+09	27.70	28.60	0.95	20.70

	Year	Status	Life expectancy	Adult Mortality	infant deaths	Alcohol	percentage expenditure	Hepatitis B	Measles	BMI	...	Polio	Total expenditure	Diphtheria	HIV/AIDS	GDP	Population	thinness 1-19 years	thinness 5-9 years	Income composition of resources	Schooling
0	2015	Developing	65.0	263.0	62	0.01	71.279624	65.0	1154	19.1	...	6.0	8.16	65.0	0.1	584.259210	33736494.0	17.2	17.3	0.479	10.1
1	2014	Developing	59.9	271.0	64	0.01	73.523582	62.0	492	18.6	...	58.0	8.18	62.0	0.1	612.696514	327582.0	17.5	17.5	0.476	10.0
2	2013	Developing	59.9	268.0	66	0.01	73.219243	64.0	430	18.1	...	62.0	8.13	64.0	0.1	631.744976	31731688.0	17.7	17.7	0.470	9.9
3	2012	Developing	59.5	272.0	69	0.01	78.184215	67.0	2787	17.6	...	67.0	8.52	67.0	0.1	669.959000	3696958.0	17.9	18.0	0.463	9.8
4	2011	Developing	59.2	275.0	71	0.01	7.097109	68.0	3013	17.2	...	68.0	7.87	68.0	0.1	63.537231	2978599.0	18.2	18.2	0.454	9.5
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2933	2004	Developing	44.3	723.0	27	4.36	0.000000	68.0	31	27.1	...	67.0	7.13	65.0	33.6	454.366654	12777511.0	9.4	9.4	0.407	9.2
2934	2003	Developing	44.5	715.0	26	4.06	0.000000	7.0	998	26.7	...	7.0	6.52	68.0	36.7	453.351155	12633897.0	9.8	9.9	0.418	9.5
2935	2002	Developing	44.8	73.0	25	4.43	0.000000	73.0	304	26.3	...	73.0	6.53	71.0	39.8	57.348340	125525.0	1.2	1.3	0.427	10.0
2936	2001	Developing	45.3	686.0	25	1.72	0.000000	76.0	529	25.9	...	76.0	6.16	75.0	42.1	548.587312	12366165.0	1.6	1.7	0.427	9.8
2937	2000	Developing	46.0	665.0	24	1.68	0.000000	79.0	1483	25.5	...	78.0	7.10	78.0	43.5	547.358879	12222251.0	11.0	11.2	0.434	9.8

	Year	Status	Life expectancy	Adult Mortality	infant deaths	Alcohol	percentage expenditure	Hepatitis B	Measles	BMI	...	Polio	Total expenditure	Diphtheria	HIV/AIDS	GDP	Population	thinness 1-19 years	thinness 5-9 years	Income composition of resources	Schooling
0	2015	Developing	65.0	263.0	62	0.01	71.279624	65.0	1154	19.1	...	6.0	8.16	65.0	0.1	584.259210	33736494.0	17.2	17.3	0.479	10.1
1	2014	Developing	59.9	271.0	64	0.01	73.523582	62.0	492	18.6	...	58.0	8.18	62.0	0.1	612.696514	327582.0	17.5	17.5	0.476	10.0
2	2013	Developing	59.9	268.0	66	0.01	73.219243	64.0	430	18.1	...	62.0	8.13	64.0	0.1	631.744976	31731688.0	17.7	17.7	0.470	9.9
3	2012	Developing	59.5	272.0	69	0.01	78.184215	67.0	2787	17.6	...	67.0	8.52	67.0	0.1	669.959000	3696958.0	17.9	18.0	0.463	9.8
4	2011	Developing	59.2	275.0	71	0.01	7.097109	68.0	3013	17.2	...	68.0	7.87	68.0	0.1	63.537231	2978599.0	18.2	18.2	0.454	9.5
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2933	2004	Developing	44.3	723.0	27	4.36	0.000000	68.0	31	27.1	...	67.0	7.13	65.0	33.6	454.366654	12777511.0	9.4	9.4	0.407	9.2
2934	2003	Developing	44.5	715.0	26	4.06	0.000000	7.0	998	26.7	...	7.0	6.52	68.0	36.7	453.351155	12633897.0	9.8	9.9	0.418	9.5
2935	2002	Developing	44.8	73.0	25	4.43	0.000000	73.0	304	26.3	...	73.0	6.53	71.0	39.8	57.348340	125525.0	1.2	1.3	0.427	10.0
2936	2001	Developing	45.3	686.0	25	1.72	0.000000	76.0	529	25.9	...	76.0	6.16	75.0	42.1	548.587312	12366165.0	1.6	1.7	0.427	9.8
2937	2000	Developing	46.0	665.0	24	1.68	0.000000	79.0	1483	25.5	...	78.0	7.10	78.0	43.5	547.358879	12222251.0	11.0	11.2	0.434	9.8

	Year	Status	Life expectancy	Adult Mortality	infant deaths	Alcohol	percentage expenditure	Hepatitis B	Measles	BMI	...	Polio	Total expenditure	Diphtheria	HIV/AIDS	GDP	Population	thinness 1-19 years	thinness 5-9 years	Income composition of resources	Schooling
0	2015	Developing	65.0	263.0	62	0.01	71.279624	65.0	1154	19.1	...	6.0	8.16	65.0	0.1	584.259210	33736494.0	17.2	17.3	0.479	10.1
1	2014	Developing	59.9	271.0	64	0.01	73.523582	62.0	492	18.6	...	58.0	8.18	62.0	0.1	612.696514	327582.0	17.5	17.5	0.476	10.0
2	2013	Developing	59.9	268.0	66	0.01	73.219243	64.0	430	18.1	...	62.0	8.13	64.0	0.1	631.744976	31731688.0	17.7	17.7	0.470	9.9
3	2012	Developing	59.5	272.0	69	0.01	78.184215	67.0	2787	17.6	...	67.0	8.52	67.0	0.1	669.959000	3696958.0	17.9	18.0	0.463	9.8
4	2011	Developing	59.2	275.0	71	0.01	7.097109	68.0	3013	17.2	...	68.0	7.87	68.0	0.1	63.537231	2978599.0	18.2	18.2	0.454	9.5
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2933	2004	Developing	44.3	723.0	27	4.36	0.000000	68.0	31	27.1	...	67.0	7.13	65.0	33.6	454.366654	12777511.0	9.4	9.4	0.407	9.2
2934	2003	Developing	44.5	715.0	26	4.06	0.000000	7.0	998	26.7	...	7.0	6.52	68.0	36.7	453.351155	12633897.0	9.8	9.9	0.418	9.5
2935	2002	Developing	44.8	73.0	25	4.43	0.000000	73.0	304	26.3	...	73.0	6.53	71.0	39.8	57.348340	125525.0	1.2	1.3	0.427	10.0
2936	2001	Developing	45.3	686.0	25	1.72	0.000000	76.0	529	25.9	...	76.0	6.16	75.0	42.1	548.587312	12366165.0	1.6	1.7	0.427	9.8
2937	2000	Developing	46.0	665.0	24	1.68	0.000000	79.0	1483	25.5	...	78.0	7.10	78.0	43.5	547.358879	12222251.0	11.0	11.2	0.434	9.8

	Year	Status	Life expectancy	Adult Mortality	infant deaths	Alcohol	percentage expenditure	Hepatitis B	Measles	BMI	...	Polio	Total expenditure	Diphtheria	HIV/AIDS	GDP	Population	thinness 1-19 years	thinness 5-9 years	Income composition of resources	Schooling
0	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
1	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
2	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
3	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
4	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2933	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
2934	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
2935	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
2936	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
2937	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False

	Developed	Developing
112	1	0
113	1	0
114	1	0
115	1	0
116	1	0