Analyze Box Office Data with Python
In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use("ggplot")
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import nltk
nltk.download("stopwords")
stop = set(stopwords.words('english'))
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import json
import ast
from urllib.request import urlopen
from PIL import Image
[nltk_data] Downloading package stopwords to /Users/mekki/nltk_data... [nltk_data] Package stopwords is already up-to-date!
Task 1. Visualizing the Target Distribution
In [2]:
train=pd.read_csv('/Users/mekki/Python_Projects_Datasets/Box Office Data/train.csv')
test=pd.read_csv('/Users/mekki/Python_Projects_Datasets/Box Office Data/test.csv')
In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:,.0f}'.format)
In [4]:
train.describe().T
Out[4]:
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
id | 3,000 | 1,500 | 866 | 1 | 751 | 1,500 | 2,250 | 3,000 |
budget | 3,000 | 22,531,334 | 37,026,086 | 0 | 0 | 8,000,000 | 29,000,000 | 380,000,000 |
popularity | 3,000 | 8 | 12 | 0 | 4 | 7 | 11 | 294 |
runtime | 2,998 | 108 | 22 | 0 | 94 | 104 | 118 | 338 |
revenue | 3,000 | 66,725,852 | 137,532,326 | 1 | 2,379,808 | 16,807,068 | 68,919,204 | 1,519,557,910 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
departments_Directing | 3,000 | 2 | 1 | 0 | 1 | 1 | 2 | 30 |
departments_Editing | 3,000 | 2 | 2 | 0 | 0 | 1 | 2 | 14 |
departments_Visual Effects | 3,000 | 1 | 4 | 0 | 0 | 0 | 0 | 52 |
departments_Lighting | 3,000 | 0 | 1 | 0 | 0 | 0 | 0 | 12 |
departments_Actors | 3,000 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
190 rows × 8 columns
In [5]:
train.describe().T
Out[5]:
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
id | 3,000 | 1,500 | 866 | 1 | 751 | 1,500 | 2,250 | 3,000 |
budget | 3,000 | 22,531,334 | 37,026,086 | 0 | 0 | 8,000,000 | 29,000,000 | 380,000,000 |
popularity | 3,000 | 8 | 12 | 0 | 4 | 7 | 11 | 294 |
runtime | 2,998 | 108 | 22 | 0 | 94 | 104 | 118 | 338 |
revenue | 3,000 | 66,725,852 | 137,532,326 | 1 | 2,379,808 | 16,807,068 | 68,919,204 | 1,519,557,910 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
departments_Directing | 3,000 | 2 | 1 | 0 | 1 | 1 | 2 | 30 |
departments_Editing | 3,000 | 2 | 2 | 0 | 0 | 1 | 2 | 14 |
departments_Visual Effects | 3,000 | 1 | 4 | 0 | 0 | 0 | 0 | 52 |
departments_Lighting | 3,000 | 0 | 1 | 0 | 0 | 0 | 0 | 12 |
departments_Actors | 3,000 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
190 rows × 8 columns
Plot the Histogram of the distribution of Revenue
In [6]:
train.revenue.hist()
Out[6]:
<Axes: >
Plot the log transformed Histogram of the distribution of Revenue
In [7]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
sns.histplot(train['revenue'], kde=False, ax=ax1)
ax1.set_title('Distribution of revenue')
sns.histplot(np.log1p(train['revenue']), kde=False, ax=ax2)
ax2.set_title('Distribution of log-transformed revenue')
plt.tight_layout()
plt.show()
In [8]:
train['log_revenue']=np.log1p(train['revenue'])
train[['revenue','log_revenue']]
Out[8]:
revenue | log_revenue | |
---|---|---|
0 | 12314651 | 16 |
1 | 95149435 | 18 |
2 | 13092000 | 16 |
3 | 16000000 | 17 |
4 | 3923970 | 15 |
... | ... | ... |
2995 | 1596687 | 14 |
2996 | 180590 | 12 |
2997 | 89456761 | 18 |
2998 | 171963386 | 19 |
2999 | 82087155 | 18 |
3000 rows × 2 columns
Task 3. Comparing Film Revenue to Budget
In [9]:
plt.figure(figsize=(16,8))
plt.subplot(1, 2, 1)
sns.scatterplot(data=train, x='budget', y='revenue')
plt.title('Revenue vs Budget');
plt.subplot(1, 2, 2)
sns.scatterplot(data=train, x=np.log1p(train['budget']), y=np.log1p(train['revenue']))
plt.title('Log Revenue vs Log Budget')
Out[9]:
Text(0.5, 1.0, 'Log Revenue vs Log Budget')
In [10]:
train['log_budget'] = np.log1p(train['budget'])
test['log_budget'] = np.log1p(test['budget'])
Task 4. Do Official Homepages impact Revenue
In [11]:
train['homepage'].value_counts().head(10)
Out[11]:
homepage http://www.transformersmovie.com/ 4 http://www.thehobbit.com/ 2 http://www.lordoftherings.net/ 2 http://sonyclassics.com/whiplash/ 1 https://www.facebook.com/thefinesthoursmovie 1 http://sonyclassics.com/pariscanwait/ 1 http://www.maxpaynethemovie.com 1 http://www.sonypictures.com/movies/afewgoodmen/ 1 http://www.theasylum.cc/product.php?id=203 1 http://www.miramax.com/movie/the-cider-house-rules/ 1 Name: count, dtype: int64
Creating a boolean
In [12]:
train['has homepage'] = 0
train.loc[train['homepage'].isnull() == False, 'has homepage'] = 1
test['has homepage'] = 0
test.loc[test['homepage'].isnull() == False, 'has homepage'] = 1
In [13]:
sns.catplot(data=train, x='has homepage', y='revenue')
plt.title('Revenue for films with and without a Homepage')
Out[13]:
Text(0.5, 1.0, 'Revenue for films with and without a Homepage')
Task 5. Does the Distribution of languages impact Revenue
In [14]:
language_data=train.loc[train['original_language'].isin(train['original_language'].value_counts().head(10).index)]
In [15]:
plt.figure(figsize=(16,8))
plt.subplot(1, 2, 1)
sns.boxplot(data=language_data, x='original_language', y='revenue')
plt.title('Mean Revenue per Language')
plt.subplot(1, 2, 2)
sns.boxplot(data=language_data, x='original_language', y='log_revenue')
plt.title('Mean log Revenue per Language')
Out[15]:
Text(0.5, 1.0, 'Mean log Revenue per Language')
Task 6. Common Words in Titles and how they impact Revenue
Top Words across Movie Titles
In [16]:
plt.figure(figsize=(12,12))
text = ' '.join(train['original_title'].values)
wordcloud = WordCloud(max_font_size=None,
background_color='white',
width=1200, height=800,
).generate(text)
plt.imshow(wordcloud)
plt.title('Top Words across Movie Titles')
plt.axis('off')
plt.show()
Top Words across Movie Overviews
In [17]:
plt.figure(figsize=(12,12))
text = ' '.join(train['overview'].fillna('').values)
wordcloud = WordCloud(max_font_size=None,
background_color='white',
width=1200, height=800,
).generate(text)
plt.imshow(wordcloud)
plt.title('Top Words across Movie Overviews')
plt.axis('off')
plt.show()