Data Analysis with Python : Inform a Business Decision

Task 1. Import and Understand the Data

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
df=pd.read_csv('/Users/mekki/Python_Projects_Datasets/AB_NYC_2019.csv')
In [2]:
df.rename(columns={'host_id': 'host id'}, inplace=True)
df.rename(columns={'host_name': 'host name'}, inplace=True)
df.rename(columns={'neighbourhood_group': 'neighbourhood group'}, inplace=True)
df.rename(columns={'room_type': 'room type'}, inplace=True)
df.rename(columns={'minimum_nights': 'minimum nights'}, inplace=True)
df.rename(columns={'number_of_reviews': 'number of reviews'}, inplace=True)
df.rename(columns={'last_review': 'last review'}, inplace=True)
df.rename(columns={'reviews_per_month': 'reviews per month'}, inplace=True)
df.rename(columns={'calculated_host_listings_count': 'calculated host listings count'}, inplace=True)
df.rename(columns={'availability_365': 'availability 365'}, inplace=True)
df.head()
Out[2]:
id name host id host name neighbourhood group neighbourhood latitude longitude room type price minimum nights number of reviews last review reviews per month calculated host listings count availability 365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0
In [3]:
df.shape
Out[3]:
(48895, 16)
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host id                         48895 non-null  int64  
 3   host name                       48874 non-null  object 
 4   neighbourhood group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum nights                  48895 non-null  int64  
 11  number of reviews               48895 non-null  int64  
 12  last review                     38843 non-null  object 
 13  reviews per month               38843 non-null  float64
 14  calculated host listings count  48895 non-null  int64  
 15  availability 365                48895 non-null  int64  
dtypes: float64(3), int64(7), object(6)
memory usage: 6.0+ MB
In [5]:
df.head()
Out[5]:
id name host id host name neighbourhood group neighbourhood latitude longitude room type price minimum nights number of reviews last review reviews per month calculated host listings count availability 365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0

Task 2. Look for Nulls and replace them with 0, mean and mode.

Checking for Duplicates

In [6]:
df.duplicated().sum()
Out[6]:
0

Checking for NULLs

In [7]:
na_count = df.isna().sum()
na_ratio = df.isna().sum()/len(df)*100

nulls_df = pd.DataFrame({
    'Missing Count': na_count,
    'Missing Ratio': na_ratio
})

nulls_df
Out[7]:
Missing Count Missing Ratio
id 0 0.000000
name 16 0.032723
host id 0 0.000000
host name 21 0.042949
neighbourhood group 0 0.000000
neighbourhood 0 0.000000
latitude 0 0.000000
longitude 0 0.000000
room type 0 0.000000
price 0 0.000000
minimum nights 0 0.000000
number of reviews 0 0.000000
last review 10052 20.558339
reviews per month 10052 20.558339
calculated host listings count 0 0.000000
availability 365 0 0.000000

Plotting NULLs

In [8]:
sns.heatmap(df.isnull())
Out[8]:
<Axes: >
No description has been provided for this image

In [9]:
df=pd.read_csv('/Users/mekki/Python_Projects_Datasets/AB_NYC_2019.csv')
In [10]:
print('Number of null values is :', df.isnull().sum().sum())
Number of null values is : 20141
In [11]:
non_numeric_columns = df.select_dtypes(exclude=['number']).columns

# Fill missing values for non-numeric columns with mode
df.loc[:, non_numeric_columns]=df[non_numeric_columns].fillna(df.mode().iloc[0])
In [12]:
print('Number of null values is :', df.isnull().sum().sum())
Number of null values is : 10052
In [13]:
numeric_columns = df.select_dtypes(include=['number']).columns

# Create a copy of the subset before filling missing values
# Filling missing values for numeric columns with mean
df.loc[:, numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
In [14]:
print('Number of null values is :', df.isnull().sum().sum())
Number of null values is : 0
In [15]:
#Now if we plot it, there should be no missing values
sns.heatmap(df.isnull())
Out[15]:
<Axes: >
No description has been provided for this image

Task 3. Remove Columns and Rows not needed for the analysis

Checking for Duplicates

In [16]:
df.duplicated().sum()
# df.drop_duplicates(inplace=True)
Out[16]:
0

Removing undesired columns

In [17]:
df=pd.read_csv('/Users/mekki/Python_Projects_Datasets/AB_NYC_2019.csv')
In [18]:
df.shape #16 columns
Out[18]:
(48895, 16)
In [19]:
df.rename(columns={'host_id': 'host id'}, inplace=True)
df.rename(columns={'host_name': 'host name'}, inplace=True)
df.rename(columns={'neighbourhood_group': 'neighbourhood group'}, inplace=True)
df.rename(columns={'room_type': 'room type'}, inplace=True)
df.rename(columns={'minimum_nights': 'minimum nights'}, inplace=True)
df.rename(columns={'number_of_reviews': 'number of reviews'}, inplace=True)
df.rename(columns={'last_review': 'last review'}, inplace=True)
df.rename(columns={'reviews_per_month': 'reviews per month'}, inplace=True)
df.rename(columns={'calculated_host_listings_count': 'calculated host listings count'}, inplace=True)
df.rename(columns={'availability_365': 'availability 365'}, inplace=True)

droplist=["host name","neighbourhood","latitude","longitude","room type","calculated host listings count","reviews per month"]
df2=df.drop(droplist, axis="columns")
df2.shape
#Dropped 7 columns as shape goes from 16 columns to 9 columns
Out[19]:
(48895, 9)
In [20]:
df2.head(1)
Out[20]:
id name host id neighbourhood group price minimum nights number of reviews last review availability 365
0 2539 Clean & quiet apt home by the park 2787 Brooklyn 149 1 9 2018-10-19 365

Removing rows with NULL values

In [21]:
df2.shape
Out[21]:
(48895, 9)
In [22]:
df2.dropna(inplace=True)
df3=df2.dropna()
df3.shape
Out[22]:
(38837, 9)
In [23]:
df3.head()
Out[23]:
id name host id neighbourhood group price minimum nights number of reviews last review availability 365
0 2539 Clean & quiet apt home by the park 2787 Brooklyn 149 1 9 2018-10-19 365
1 2595 Skylit Midtown Castle 2845 Manhattan 225 1 45 2019-05-21 355
3 3831 Cozy Entire Floor of Brownstone 4869 Brooklyn 89 1 270 2019-07-05 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Manhattan 80 10 9 2018-11-19 0
5 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Manhattan 200 3 74 2019-06-22 129
In [24]:
#Making sure that all data in 'last_review' are in the correct format
df3.loc[:,'last review']=pd.to_datetime(df3['last review']).dt.date
df3.head()
Out[24]:
id name host id neighbourhood group price minimum nights number of reviews last review availability 365
0 2539 Clean & quiet apt home by the park 2787 Brooklyn 149 1 9 2018-10-19 365
1 2595 Skylit Midtown Castle 2845 Manhattan 225 1 45 2019-05-21 355
3 3831 Cozy Entire Floor of Brownstone 4869 Brooklyn 89 1 270 2019-07-05 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Manhattan 80 10 9 2018-11-19 0
5 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Manhattan 200 3 74 2019-06-22 129

Business task : We want to promote Airbnb units that are :

  • 100$ or less
  • Don't require more than a 2 nights minimum stay
  • Have at least 100 reviews
  • Are available at least 180 days per year
In [25]:
df4 = df3.query('price < 101 and `minimum nights` < 3 and `number of reviews` > 99 and `availability 365` > 179')
df4.shape
Out[25]:
(554, 9)
In [26]:
df4.head()
Out[26]:
id name host id neighbourhood group price minimum nights number of reviews last review availability 365
3 3831 Cozy Entire Floor of Brownstone 4869 Brooklyn 89 1 270 2019-07-05 194
7 5178 Large Furnished Room Near B'way 8967 Manhattan 79 2 430 2019-06-24 220
13 6021 Wonderful Guest Bedroom in Manhattan for SINGLES 11528 Manhattan 85 2 113 2019-07-05 333
39 12048 LowerEastSide apt share shortterm 1 7549 Manhattan 40 1 214 2019-07-05 188
50 13808 Blue Room for 2 in Brownstone for $1350 monthly 54275 Brooklyn 80 1 112 2019-06-16 251

Business task : We want to sort the data by Neighbourhood Group and Last Review

In [27]:
df4.sort_values(by=["neighbourhood group", "last review"], inplace=True, ascending=[True, True])
df5 = df4.copy()
df5.head()
Out[27]:
id name host id neighbourhood group price minimum nights number of reviews last review availability 365
11894 9260415 Comfy Room Near Bronx Zoo and NYBG! 48106825 Bronx 47 2 138 2019-05-20 353
11303 8772654 Friendly-colorful-cozy W/ breakfast 45990565 Bronx 33 2 103 2019-05-31 282
1578 715270 2 Beds/Queen & Full Beautiful Room 40 minsT.Sq... 3684360 Bronx 39 2 169 2019-06-12 306
22881 18540170 Great room across the street from the High bridge 128692351 Bronx 100 2 107 2019-06-14 344
18243 14301951 A beautiful cozy room with balcony 87370616 Bronx 75 2 110 2019-06-17 350

Business task : We want to sort the data by Neighbourhood Group and Price

In [28]:
df4.sort_values(by=["neighbourhood group","price"], inplace=True, ascending=[True, True])
promotion=df4.copy()
promotion.head()
Out[28]:
id name host id neighbourhood group price minimum nights number of reviews last review availability 365
1965 858695 Very Large Private Room on quiet st 4494343 Bronx 30 2 291 2019-06-20 208
11303 8772654 Friendly-colorful-cozy W/ breakfast 45990565 Bronx 33 2 103 2019-05-31 282
1706 773041 Nice beautiful room In the Bronx 3684360 Bronx 38 1 187 2019-06-23 241
1578 715270 2 Beds/Queen & Full Beautiful Room 40 minsT.Sq... 3684360 Bronx 39 2 169 2019-06-12 306
15793 12774885 Spacious, cozy bedroom in a private home 69507287 Bronx 40 1 151 2019-07-01 338

Business task : We want to sort the data by groups aggregating Neighbourhood Groups and their Units count.

In [29]:
grouped_counts = promotion.groupby('neighbourhood group').size().reset_index(name='count')
grouped_counts = grouped_counts.sort_values(by='count', ascending=False)
grouped_counts.head()
Out[29]:
neighbourhood group count
1 Brooklyn 231
2 Manhattan 143
3 Queens 133
0 Bronx 29
4 Staten Island 18
In [ ]:
 

Business task : Airbnb wants you to promote luxury units in the two most popular boroughs of NYC. Your job is to determine how many units in thos two boroughs cost more than 1000$ per night and have a minimum stay of one or two nights.

In [30]:
data=pd.read_csv('/Users/mekki/Python_Projects_Datasets/AB_NYC_2019.csv')
data.shape
Out[30]:
(48895, 16)
In [31]:
data
Out[31]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
48890 36484665 Charming one bedroom - newly renovated rowhouse 8232441 Sabrina Brooklyn Bedford-Stuyvesant 40.67853 -73.94995 Private room 70 2 0 NaN NaN 2 9
48891 36485057 Affordable room in Bushwick/East Williamsburg 6570630 Marisol Brooklyn Bushwick 40.70184 -73.93317 Private room 40 4 0 NaN NaN 2 36
48892 36485431 Sunny Studio at Historical Neighborhood 23492952 Ilgar & Aysel Manhattan Harlem 40.81475 -73.94867 Entire home/apt 115 10 0 NaN NaN 1 27
48893 36485609 43rd St. Time Square-cozy single bed 30985759 Taz Manhattan Hell's Kitchen 40.75751 -73.99112 Shared room 55 1 0 NaN NaN 6 2
48894 36487245 Trendy duplex in the very heart of Hell's Kitchen 68119814 Christophe Manhattan Hell's Kitchen 40.76404 -73.98933 Private room 90 7 0 NaN NaN 1 23

48895 rows × 16 columns

In [32]:
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)
data.shape
Out[32]:
(38821, 16)
In [33]:
data.rename(columns={'neighbourhood_group': 'neighbourhood group'}, inplace=True)
data.rename(columns={'minimum_nights': 'minimum nights'}, inplace=True)

data=data.query('price > 1000 & `minimum nights` <3')
data.shape
Out[33]:
(47, 16)
In [34]:
data=data.query('`neighbourhood group`=="Brooklyn" or `neighbourhood group`=="Manhattan"')
data.shape
Out[34]:
(46, 16)
In [35]:
data_final = data.copy()
data_final.sort_values(by=["neighbourhood group", "price"], inplace=True)
data_final.head()
Out[35]:
id name host_id host_name neighbourhood group neighbourhood latitude longitude room_type price minimum nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
23411 18948914 MODERN GARDEN 1BR IN NEWLY RENOVATED TOWNHOUSE 70804640 Akiko Brooklyn Bedford-Stuyvesant 40.68378 -73.95592 Entire home/apt 1067 2 64 2019-06-29 2.51 1 32
25064 20070296 Stunning Brooklyn Heights House, Manhattan 5 mins 5173627 Tara Brooklyn Brooklyn Heights 40.70136 -73.99372 Entire home/apt 1095 2 24 2019-03-24 1.01 1 240
10341 7915831 Lovely, Room/with Private bathroom 5719436 Leonie Brooklyn Flatbush 40.63982 -73.96634 Private room 1100 1 1 2016-09-11 0.03 1 365
2236 1056256 Beautiful eco triplex w/green roof. Free yoga/... 462379 Loretta Brooklyn Carroll Gardens 40.67881 -73.99379 Entire home/apt 1395 1 55 2019-06-02 0.73 2 362
3761 2261367 brooklyn 14 bedroom gated community 10416706 Tzvi Brooklyn Sea Gate 40.57645 -74.01065 Entire home/apt 1485 2 6 2019-06-30 0.24 1 260
In [ ]:
 
In [36]:
fig, axes = plt.subplots(2, 1, figsize=(6, 12))
sns.barplot(data=data_final, x="neighbourhood group", y="price", hue='neighbourhood group', palette=['#cdb4db','#ffa3a5'], ax=axes[0])
axes[0].set_title('Average Prices of Units per Neighbourhood');
sns.histplot(data=data_final, x="neighbourhood group", y="price", cbar=True, color='#52b69a', ax=axes[1])
axes[1].set_title('Distribution of Prices within each Neighbourhood')
Out[36]:
Text(0.5, 1.0, 'Distribution of Prices within each Neighbourhood')
No description has been provided for this image