import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use("ggplot")
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import nltk
nltk.download("stopwords")
stop = set(stopwords.words('english'))
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import json
import ast
from urllib.request import urlopen
from PIL import Image

[nltk_data] Downloading package stopwords to /Users/mekki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

# Directory that holds the Box Office Data CSV files; change it in one place
# when the data lives somewhere else.
DATA_DIR = '/Users/mekki/Python_Projects_Datasets/Box Office Data'

# Load the training and test splits from the same directory.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Show every column when displaying frames, and render floats as
# comma-grouped numbers with no decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:,.0f}'.format)

# Summary statistics of the training data, transposed so that each described
# column becomes a row (the call was previously repeated verbatim).
train.describe().T

# Quick histogram of the raw revenue values.
train['revenue'].hist()

<Axes: >

# Side-by-side histograms: revenue on its original scale (left) and after
# a log1p transform (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
ax1, ax2 = axes

revenue_panels = [
    (ax1, train['revenue'], 'Distribution of revenue'),
    (ax2, np.log1p(train['revenue']), 'Distribution of log-transformed revenue'),
]
for panel_ax, panel_values, panel_title in revenue_panels:
    sns.histplot(panel_values, kde=False, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()

plt.show()

# Store log(1 + revenue) as its own column and display it next to the raw
# revenue for comparison.
train['log_revenue'] = np.log1p(train['revenue'])
train[['revenue', 'log_revenue']]

# Budget against revenue: raw values on the left, log1p of both on the right.
scatter_fig, (raw_ax, log_ax) = plt.subplots(1, 2, figsize=(16, 8))

sns.scatterplot(data=train, x='budget', y='revenue', ax=raw_ax)
raw_ax.set_title('Revenue vs Budget')

# The transformed series are passed directly instead of column names.
log_budget_values = np.log1p(train['budget'])
log_revenue_values = np.log1p(train['revenue'])
sns.scatterplot(data=train, x=log_budget_values, y=log_revenue_values, ax=log_ax)
log_ax.set_title('Log Revenue vs Log Budget')

Text(0.5, 1.0, 'Log Revenue vs Log Budget')

# Add log(1 + budget) as a new column to both the training and test frames.
for frame in (train, test):
    frame['log_budget'] = np.log1p(frame['budget'])

# The ten most frequent homepage URLs in the training data.
train['homepage'].value_counts().iloc[:10]

homepage
http://www.transformersmovie.com/                      4
http://www.thehobbit.com/                              2
http://www.lordoftherings.net/                         2
http://sonyclassics.com/whiplash/                      1
https://www.facebook.com/thefinesthoursmovie           1
http://sonyclassics.com/pariscanwait/                  1
http://www.maxpaynethemovie.com                        1
http://www.sonypictures.com/movies/afewgoodmen/        1
http://www.theasylum.cc/product.php?id=203             1
http://www.miramax.com/movie/the-cider-house-rules/    1
Name: count, dtype: int64

# Binary indicator column: 1 when a homepage value is present, 0 when it is
# missing. Built directly from the not-null mask instead of filling with 0
# and overwriting rows selected by `isnull() == False`.
train['has homepage'] = train['homepage'].notna().astype(int)
test['has homepage'] = test['homepage'].notna().astype(int)

# Compare revenue between films with and without a homepage.
sns.catplot(data=train, x='has homepage', y='revenue')
plt.title('Revenue for films with and without a Homepage')

Text(0.5, 1.0, 'Revenue for films with and without a Homepage')

# Restrict the training data to films whose original language is one of the
# ten most frequent languages.
top_languages = train['original_language'].value_counts().head(10).index
language_data = train.loc[train['original_language'].isin(top_languages)]

# Revenue per language on the raw scale (left) and the log scale (right).
# NOTE(review): a box plot draws medians and quartiles rather than means, so
# the "Mean ..." titles may be misleading; they are kept unchanged here.
lang_fig, (revenue_ax, log_revenue_ax) = plt.subplots(1, 2, figsize=(16, 8))

sns.boxplot(data=language_data, x='original_language', y='revenue', ax=revenue_ax)
revenue_ax.set_title('Mean Revenue per Language')

sns.boxplot(data=language_data, x='original_language', y='log_revenue', ax=log_revenue_ax)
log_revenue_ax.set_title('Mean log Revenue per Language')

Text(0.5, 1.0, 'Mean log Revenue per Language')

def plot_wordcloud(texts: pd.Series, title: str) -> None:
    """Draw a word cloud built from every entry of a text column.

    Args:
        texts: Series of strings; missing entries are treated as empty
            strings so that joining never fails on a non-string value.
        title: Title shown above the rendered word cloud.
    """
    plt.figure(figsize=(12, 12))

    # Concatenate all entries into one space-separated document.
    joined_text = ' '.join(texts.fillna(''))
    cloud = WordCloud(max_font_size=None,
                      background_color='white',
                      width=1200, height=800,
                      ).generate(joined_text)
    plt.imshow(cloud)
    plt.title(title)
    plt.axis('off')
    plt.show()


plot_wordcloud(train['original_title'], 'Top Words across Movie Titles')

plot_wordcloud(train['overview'], 'Top Words across Movie Overviews')

	count	mean	std	min	25%	50%	75%	max
id	3,000	1,500	866	1	751	1,500	2,250	3,000
budget	3,000	22,531,334	37,026,086	0	0	8,000,000	29,000,000	380,000,000
popularity	3,000	8	12	0	4	7	11	294
runtime	2,998	108	22	0	94	104	118	338
revenue	3,000	66,725,852	137,532,326	1	2,379,808	16,807,068	68,919,204	1,519,557,910
...	...	...	...	...	...	...	...	...
departments_Directing	3,000	2	1	0	1	1	2	30
departments_Editing	3,000	2	2	0	0	1	2	14
departments_Visual Effects	3,000	1	4	0	0	0	0	52
departments_Lighting	3,000	0	1	0	0	0	0	12
departments_Actors	3,000	0	0	0	0	0	0	2

	count	mean	std	min	25%	50%	75%	max
id	3,000	1,500	866	1	751	1,500	2,250	3,000
budget	3,000	22,531,334	37,026,086	0	0	8,000,000	29,000,000	380,000,000
popularity	3,000	8	12	0	4	7	11	294
runtime	2,998	108	22	0	94	104	118	338
revenue	3,000	66,725,852	137,532,326	1	2,379,808	16,807,068	68,919,204	1,519,557,910
...	...	...	...	...	...	...	...	...
departments_Directing	3,000	2	1	0	1	1	2	30
departments_Editing	3,000	2	2	0	0	1	2	14
departments_Visual Effects	3,000	1	4	0	0	0	0	52
departments_Lighting	3,000	0	1	0	0	0	0	12
departments_Actors	3,000	0	0	0	0	0	0	2

	revenue	log_revenue
0	12314651	16
1	95149435	18
2	13092000	16
3	16000000	17
4	3923970	15
...	...	...
2995	1596687	14
2996	180590	12
2997	89456761	18
2998	171963386	19
2999	82087155	18