Analyze Box Office Data with Python

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use("ggplot")
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import nltk
nltk.download("stopwords")
stop = set(stopwords.words('english'))
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import json
import ast
from urllib.request import urlopen
from PIL import Image
[nltk_data] Downloading package stopwords to /Users/mekki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Task 1. Visualizing the Target Distribution

In [2]:
# Directory that holds train.csv / test.csv. Set the BOX_OFFICE_DATA_DIR
# environment variable to point somewhere else; the fallback is the path the
# notebook was originally written against, so existing setups keep working.
DATA_DIR = os.environ.get(
    'BOX_OFFICE_DATA_DIR',
    '/Users/mekki/Python_Projects_Datasets/Box Office Data',
)

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
In [3]:
# Display settings: never truncate wide frames, and render floats as
# comma-grouped whole numbers (e.g. 66,725,852).
pd.options.display.max_columns = None
pd.options.display.float_format = '{:,.0f}'.format
In [4]:
# Summary statistics of the training frame, transposed so each described
# column becomes a row.
train.describe().transpose()
Out[4]:
count mean std min 25% 50% 75% max
id 3,000 1,500 866 1 751 1,500 2,250 3,000
budget 3,000 22,531,334 37,026,086 0 0 8,000,000 29,000,000 380,000,000
popularity 3,000 8 12 0 4 7 11 294
runtime 2,998 108 22 0 94 104 118 338
revenue 3,000 66,725,852 137,532,326 1 2,379,808 16,807,068 68,919,204 1,519,557,910
... ... ... ... ... ... ... ... ...
departments_Directing 3,000 2 1 0 1 1 2 30
departments_Editing 3,000 2 2 0 0 1 2 14
departments_Visual Effects 3,000 1 4 0 0 0 0 52
departments_Lighting 3,000 0 1 0 0 0 0 12
departments_Actors 3,000 0 0 0 0 0 0 2

190 rows × 8 columns

In [5]:
# NOTE(review): this cell repeats the previous cell verbatim and produces the
# same table; it can be dropped unless the repetition is intentional.
train.describe().T
Out[5]:
count mean std min 25% 50% 75% max
id 3,000 1,500 866 1 751 1,500 2,250 3,000
budget 3,000 22,531,334 37,026,086 0 0 8,000,000 29,000,000 380,000,000
popularity 3,000 8 12 0 4 7 11 294
runtime 2,998 108 22 0 94 104 118 338
revenue 3,000 66,725,852 137,532,326 1 2,379,808 16,807,068 68,919,204 1,519,557,910
... ... ... ... ... ... ... ... ...
departments_Directing 3,000 2 1 0 1 1 2 30
departments_Editing 3,000 2 2 0 0 1 2 14
departments_Visual Effects 3,000 1 4 0 0 0 0 52
departments_Lighting 3,000 0 1 0 0 0 0 12
departments_Actors 3,000 0 0 0 0 0 0 2

190 rows × 8 columns

Plot the Histogram of the distribution of Revenue

In [6]:
# Histogram of raw revenue using pandas' default binning.
train['revenue'].hist()
Out[6]:
<Axes: >
No description has been provided for this image

Plot the Histogram of Log-Transformed Revenue

In [7]:
# Side-by-side histograms: revenue on its original scale (left) and after a
# log1p transform (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

sns.histplot(train['revenue'], kde=False, ax=ax1).set_title('Distribution of revenue')

sns.histplot(train['revenue'].pipe(np.log1p), kde=False, ax=ax2).set_title(
    'Distribution of log-transformed revenue'
)

fig.tight_layout()
plt.show()
No description has been provided for this image
In [8]:
# Store the log1p-transformed target as its own column, then show it next to
# the raw revenue for comparison.
train['log_revenue'] = train['revenue'].pipe(np.log1p)
train.loc[:, ['revenue', 'log_revenue']]
Out[8]:
revenue log_revenue
0 12314651 16
1 95149435 18
2 13092000 16
3 16000000 17
4 3923970 15
... ... ...
2995 1596687 14
2996 180590 12
2997 89456761 18
2998 171963386 19
2999 82087155 18

3000 rows × 2 columns

Task 3. Comparing Film Revenue to Budget

In [9]:
# Two-panel comparison of budget and revenue: raw values on the left,
# log1p-transformed values on the right.
budget_fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(16, 8))

sns.scatterplot(data=train, x='budget', y='revenue', ax=ax_raw)
ax_raw.set_title('Revenue vs Budget')

# The transformed Series keep their original names ('budget', 'revenue'),
# which seaborn would reuse as axis labels; label the log axes explicitly so
# the right-hand panel is not mistaken for raw values.
sns.scatterplot(x=np.log1p(train['budget']), y=np.log1p(train['revenue']), ax=ax_log)
ax_log.set_xlabel('log1p(budget)')
ax_log.set_ylabel('log1p(revenue)')
ax_log.set_title('Log Revenue vs Log Budget')
Out[9]:
Text(0.5, 1.0, 'Log Revenue vs Log Budget')
No description has been provided for this image
In [10]:
# Add the log1p-transformed budget to both splits so train and test carry
# the same column.
train['log_budget'], test['log_budget'] = (
    frame['budget'].pipe(np.log1p) for frame in (train, test)
)

Task 4. Do Official Homepages Impact Revenue?

In [11]:
# Ten most frequent homepage URLs; value_counts() already sorts by
# descending count, so the first ten rows are the top ten.
train['homepage'].value_counts().iloc[:10]
Out[11]:
homepage
http://www.transformersmovie.com/                      4
http://www.thehobbit.com/                              2
http://www.lordoftherings.net/                         2
http://sonyclassics.com/whiplash/                      1
https://www.facebook.com/thefinesthoursmovie           1
http://sonyclassics.com/pariscanwait/                  1
http://www.maxpaynethemovie.com                        1
http://www.sonypictures.com/movies/afewgoodmen/        1
http://www.theasylum.cc/product.php?id=203             1
http://www.miramax.com/movie/the-cider-house-rules/    1
Name: count, dtype: int64

Create a 0/1 indicator column for whether a film has an official homepage

In [12]:
# Flag films that list an official homepage: 1 when `homepage` is present,
# 0 when it is missing. notna() replaces the `isnull() == False` comparison,
# and the int64 cast keeps the column an integer 0/1 indicator as before.
train['has homepage'] = train['homepage'].notna().astype('int64')
test['has homepage'] = test['homepage'].notna().astype('int64')
In [13]:
# Strip plot (catplot's default kind) of revenue for films without (0) and
# with (1) a homepage.
sns.catplot(x='has homepage', y='revenue', kind='strip', data=train)
plt.title('Revenue for films with and without a Homepage')
Out[13]:
Text(0.5, 1.0, 'Revenue for films with and without a Homepage')
No description has been provided for this image

Task 5. Does the Distribution of Languages Impact Revenue?

In [14]:
# Restrict the training frame to films whose original language is one of the
# ten most frequent languages.
top_languages = train['original_language'].value_counts().head(10).index
language_data = train.loc[train['original_language'].isin(top_languages)]
In [15]:
# Revenue by original language, raw (left) and log-transformed (right).
# NOTE(review): box plots mark the median and quartiles rather than the mean;
# the titles are kept verbatim.
language_fig, (ax_rev, ax_log_rev) = plt.subplots(1, 2, figsize=(16, 8))

sns.boxplot(x='original_language', y='revenue', data=language_data, ax=ax_rev)
ax_rev.set_title('Mean Revenue per Language')

sns.boxplot(x='original_language', y='log_revenue', data=language_data, ax=ax_log_rev)
ax_log_rev.set_title('Mean log Revenue per Language')
Out[15]:
Text(0.5, 1.0, 'Mean log Revenue per Language')
No description has been provided for this image

Task 6. Common Words in Titles and how they impact Revenue

Top Words across Movie Titles

In [16]:
plt.figure(figsize=(12,12))

# Join every title into one string for the word cloud. Missing titles are
# replaced with '' first so str.join never receives a NaN (a float), the same
# guard the overview word cloud uses.
text = ' '.join(train['original_title'].fillna('').values)
wordcloud = WordCloud(max_font_size=None,
                     background_color='white',
                     width=1200, height=800,
                     ).generate(text)
# Render the generated cloud as an image without axis ticks.
plt.imshow(wordcloud)
plt.title('Top Words across Movie Titles')
plt.axis('off')
plt.show()
No description has been provided for this image

Top Words across Movie Overviews

In [17]:
# Word cloud over all movie overviews; missing overviews count as empty text.
plt.figure(figsize=(12, 12))

text = ' '.join(train['overview'].fillna(''))

# Build the cloud object first, then feed it the text (generate() fills the
# same object in place).
wordcloud = WordCloud(
    width=1200,
    height=800,
    background_color='white',
    max_font_size=None,
)
wordcloud.generate(text)

plt.imshow(wordcloud)
plt.axis('off')
plt.title('Top Words across Movie Overviews')
plt.show()
No description has been provided for this image