INTRODUCTION¶

This project analyzes several IMDb datasets, presenting summary statistics, distributions, correlations, and more. The raw source files are imported from IMDb Data Files, and documentation for them can be found at IMDb Non-Commercial Datasets. I learned about these files from the resources page of our class book, Data Science 1 by Toby Driscoll.

Here is the code to import our libraries and the raw datasets. The only datasets we will use are title.basics.tsv, title.crew.tsv, title.ratings.tsv, and name.basics.tsv. The raw data takes a long time to import; in preprocessing, we will pare it down so that later steps run quickly.

(NOTE: the columns of title_basics are all read in as strings to avoid dtype errors. We will correct the types later.)
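
Since the remote TSV files are large, one optional speed-up (a sketch, not used in the rest of this notebook) is to cache the first download locally with pandas' pickle round-trip, so re-running the import is fast; the file name title_basics.pkl below is purely illustrative.

In [ ]:
import os
import pandas as pd

CACHE = "title_basics.pkl"
if os.path.exists(CACHE):
    # Reload the cached copy instead of re-downloading the full TSV
    title_basics = pd.read_pickle(CACHE)
else:
    title_basics = pd.read_csv("https://datasets.imdbws.com/title.basics.tsv.gz",
                               delimiter="\t", dtype=str)
    title_basics.to_pickle(CACHE)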

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
In [2]:
title_basics = pd.read_csv("https://datasets.imdbws.com/title.basics.tsv.gz", delimiter="\t", 
                           dtype={"tconst": str, "titleType": str, "primaryTitle": str, "originalTitle": str, 
                                  "isAdult": str, "startYear": str, "endYear": str, "runtimeMinutes": str, 
                                  "genres": str})
title_basics.head(3)
Out[2]:
tconst titleType primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres
0 tt0000001 short Carmencita Carmencita 0 1894 \N 1 Documentary,Short
1 tt0000002 short Le clown et ses chiens Le clown et ses chiens 0 1892 \N 5 Animation,Short
2 tt0000003 short Pauvre Pierrot Pauvre Pierrot 0 1892 \N 4 Animation,Comedy,Romance
In [3]:
title_crew = pd.read_csv("https://datasets.imdbws.com/title.crew.tsv.gz", delimiter="\t")
title_crew.head(3)
Out[3]:
tconst directors writers
0 tt0000001 nm0005690 \N
1 tt0000002 nm0721526 \N
2 tt0000003 nm0721526 \N
In [4]:
title_ratings = pd.read_csv("https://datasets.imdbws.com/title.ratings.tsv.gz", delimiter="\t")
title_ratings.head(3)
Out[4]:
tconst averageRating numVotes
0 tt0000001 5.7 2036
1 tt0000002 5.7 272
2 tt0000003 6.5 1986
In [5]:
name_basics = pd.read_csv("https://datasets.imdbws.com/name.basics.tsv.gz", delimiter="\t")
name_basics.head(3)
Out[5]:
nconst primaryName birthYear deathYear primaryProfession knownForTitles
0 nm0000001 Fred Astaire 1899 1987 actor,miscellaneous,producer tt0072308,tt0050419,tt0053137,tt0027125
1 nm0000002 Lauren Bacall 1924 2014 actress,soundtrack,archive_footage tt0037382,tt0075213,tt0117057,tt0038355
2 nm0000003 Brigitte Bardot 1934 \N actress,music_department,producer tt0057345,tt0049189,tt0056404,tt0054452

PREPROCESSING¶

Here are the steps we will take to preprocess our data. In the interest of being concise, the datasets won't be displayed until they are merged.

1. Drop Unnecessary Rows and Columns¶

In this project, we will only analyze movies, no TV shows or shorts. So, we will drop the rows of title_basics that don't have the titleType "movie". We will also drop columns from all the dataframes that are unnecessary when analyzing movies.

In [6]:
movie_basics = title_basics[title_basics.titleType == "movie"]
movie_basics = movie_basics.drop(columns=["titleType", "originalTitle", "isAdult", "endYear"])
movie_crew = title_crew.drop(columns=["writers"])
movie_ratings = title_ratings
names = name_basics.drop(columns=["birthYear", "deathYear", "primaryProfession", "knownForTitles"])

2. Replace and Remove Missing Values with NaN¶

Before any further preprocessing, we must replace and remove the rows with missing values. This will allow us to specify types.

In [7]:
movie_basics = movie_basics.replace("\\N", np.nan)
movie_basics = movie_basics.dropna()
movie_crew = movie_crew.replace("\\N", np.nan)
movie_crew = movie_crew.dropna()
movie_ratings = movie_ratings.replace("\\N", np.nan)
movie_ratings = movie_ratings.dropna()
names = names.replace("\\N", np.nan)
names = names.dropna()

3. Specify Types¶

Next, we must specify the types of some columns. This is done so we can merge the dataframes properly.

In [8]:
movie_basics["startYear"] = movie_basics["startYear"].astype(int)
movie_basics["runtimeMinutes"] = movie_basics["runtimeMinutes"].astype(int)
movie_crew["directors"] = movie_crew["directors"].str.split(",")
movie_ratings["averageRating"] = movie_ratings["averageRating"].astype(float)
movie_ratings["numVotes"] = movie_ratings["numVotes"].astype(int)

4. Merge the Dataframes¶

Now we will merge the four dataframes into all_movies. The movie dataframes merge easily on "tconst"; the names dataframe, however, requires an extra step. We create a helper frame, "directors_names", that explodes the "directors" column, merges with the names dataframe, and is then grouped back by "tconst". The helper frame is then merged back into all_movies. This simply matches each list of "nconst" IDs with the list of the directors' actual names.

In [9]:
all_movies = pd.merge(movie_basics, movie_crew, on="tconst")
all_movies = pd.merge(all_movies, movie_ratings, on="tconst")
# Create helper variable to match the lists of nconsts with lists of names
directors_names = all_movies[["tconst", "directors"]]
directors_names = directors_names.explode("directors")
directors_names = pd.merge(directors_names, names, left_on="directors", right_on="nconst")
directors_names = directors_names.groupby("tconst").agg({"directors": list, "primaryName": list}).reset_index()
# Merge the helper variable back into the main dataframe
all_movies = pd.merge(all_movies, directors_names, on="tconst")
all_movies.head()
Out[9]:
tconst primaryTitle startYear runtimeMinutes genres directors_x averageRating numVotes directors_y primaryName
0 tt0000009 Miss Jerry 1894 45 Romance [nm0085156] 5.3 209 [nm0085156] [Alexander Black]
1 tt0000147 The Corbett-Fitzsimmons Fight 1897 100 Documentary,News,Sport [nm0714557] 5.2 506 [nm0714557] [Enoch J. Rector]
2 tt0000574 The Story of the Kelly Gang 1906 70 Action,Adventure,Biography [nm0846879] 6.0 876 [nm0846879] [Charles Tait]
3 tt0000591 The Prodigal Son 1907 90 Drama [nm0141150] 5.5 23 [nm0141150] [Michel Carré]
4 tt0000679 The Fairylogue and Radio-Plays 1908 120 Adventure,Fantasy [nm0091767, nm0877783] 5.2 71 [nm0091767, nm0877783] [Francis Boggs, Otis Turner]

5. Polish and Reorder the Frame¶

For this project, we are only going to analyze the top 1000 highest rated action, comedy, or drama movies that have a minimum number of 50,000 votes. Here are the steps taken to polish the frame:

  • Drop the rows in which "numVotes" is less than 50,000.
  • Drop any rows whose genres do not include Action, Comedy, or Drama, and keep only a single one of those genres per movie so that the column becomes categorical. We will use iteration for this.
  • Order the data according to the "averageRating" column.
  • Create a "rank" column and set it as the index.
  • Create a column called "totalRating" that multiplies the averageRating and the numVotes.
  • Delete, alter, and move some of our columns to polish our finished set of data.
In [10]:
# Work on a copy so that modifying the slice below doesn't trigger chained-assignment warnings
movies = all_movies[all_movies["numVotes"] >= 50000].copy()
movies = movies[movies["genres"].str.contains("Action") | movies["genres"].str.contains("Comedy") | movies["genres"].str.contains("Drama")]
# Collapse each genre string to a single category (Comedy takes priority, then Action, then Drama)
for index, movie in movies.iterrows():
    if "Comedy" in movie["genres"]:
        movies.at[index, "genres"] = "Comedy"
    elif "Action" in movie["genres"]:
        movies.at[index, "genres"] = "Action"
    elif "Drama" in movie["genres"]:
        movies.at[index, "genres"] = "Drama"
movies = movies.sort_values(by="averageRating", ascending=False)
movies = movies.iloc[:1000]
movies["rank"] = range(1, len(movies) + 1)
movies.index = movies["rank"]
movies["totalRating"] = movies["averageRating"] * movies["numVotes"]
movies = movies.drop(columns=["tconst", "directors_x", "directors_y", "rank"])
movies = movies.rename(columns={"primaryTitle": "title", "startYear": "year", "runtimeMinutes": "runtime", 
                                "primaryName": "director[s]", "genres": "genre"})
movies["director[s]"] = movies["director[s]"].str.join(",")
movies = movies[["title", "year", "runtime", "director[s]", "genre", "averageRating", "numVotes", "totalRating"]]
movies.head()
Out[10]:
title year runtime director[s] genre averageRating numVotes totalRating
rank
1 The Shawshank Redemption 1994 142 Frank Darabont Drama 9.3 2874036 26728534.8
2 The Godfather 1972 175 Francis Ford Coppola Drama 9.2 2001828 18416817.6
3 12th Fail 2023 147 Vidhu Vinod Chopra Drama 9.0 106078 954702.0
4 The Lord of the Rings: The Return of the King 2003 201 Peter Jackson Action 9.0 1969425 17724825.0
5 The Godfather Part II 1974 202 Francis Ford Coppola Drama 9.0 1358038 12222342.0

6. Determine Outliers¶

Our last preprocessing step is an outlier analysis of the quantitative columns. An outlier is any value that falls outside the 1.5·IQR rule. The IQRs are computed all at once, but the outliers are identified separately for each of the five quantitative columns.

In [11]:
quantitative_columns = movies[["year", "runtime", "averageRating", "numVotes", "totalRating"]]
Q1 = quantitative_columns.quantile(0.25)
Q3 = quantitative_columns.quantile(0.75)
IQR = Q3 - Q1
print("Here is the IQR for each column:")
print(IQR)
Here is the IQR for each column:
year                  25.000
runtime               34.000
averageRating          0.500
numVotes          359853.500
totalRating      2835101.025
dtype: float64
In [12]:
year_outliers = (quantitative_columns["year"] < (Q1["year"] - 1.5 * IQR["year"])) | (quantitative_columns["year"] > (Q3["year"] + 1.5 * IQR["year"]))
print("Here are the year outliers in the dataset:")
movies[year_outliers].head(3)
Here are the year outliers in the dataset:
Out[12]:
title year runtime director[s] genre averageRating numVotes totalRating
rank
26 It's a Wonderful Life 1946 130 Frank Capra Drama 8.6 497496 4278465.6
44 Modern Times 1936 87 Charles Chaplin Comedy 8.5 258403 2196425.5
49 City Lights 1931 87 Charles Chaplin Comedy 8.5 195320 1660220.0
In [13]:
runtime_outliers = (quantitative_columns["runtime"] < (Q1["runtime"] - 1.5 * IQR["runtime"])) | (quantitative_columns["runtime"] > (Q3["runtime"] + 1.5 * IQR["runtime"]))
print("Here are the runtime outliers in the dataset:")
movies[runtime_outliers].head(3)
Here are the runtime outliers in the dataset:
Out[13]:
title year runtime director[s] genre averageRating numVotes totalRating
rank
4 The Lord of the Rings: The Return of the King 2003 201 Peter Jackson Action 9.0 1969425 17724825.0
5 The Godfather Part II 1974 202 Francis Ford Coppola Drama 9.0 1358038 12222342.0
8 Schindler's List 1993 195 Steven Spielberg Drama 9.0 1443603 12992427.0
In [14]:
averageRating_outliers = (quantitative_columns["averageRating"] < (Q1["averageRating"] - 1.5 * IQR["averageRating"])) | (quantitative_columns["averageRating"] > (Q3["averageRating"] + 1.5 * IQR["averageRating"]))
print("Here are the averageRating outliers in the dataset:")
movies[averageRating_outliers].head(3)
Here are the averageRating outliers in the dataset:
Out[14]:
title year runtime director[s] genre averageRating numVotes totalRating
rank
1 The Shawshank Redemption 1994 142 Frank Darabont Drama 9.3 2874036 26728534.8
2 The Godfather 1972 175 Francis Ford Coppola Drama 9.2 2001828 18416817.6
3 12th Fail 2023 147 Vidhu Vinod Chopra Drama 9.0 106078 954702.0
In [15]:
numVotes_outliers = (quantitative_columns["numVotes"] < (Q1["numVotes"] - 1.5 * IQR["numVotes"])) | (quantitative_columns["numVotes"] > (Q3["numVotes"] + 1.5 * IQR["numVotes"]))
print("Here are the numVotes outliers in the dataset:")
movies[numVotes_outliers].head(3)
Here are the numVotes outliers in the dataset:
Out[15]:
title year runtime director[s] genre averageRating numVotes totalRating
rank
1 The Shawshank Redemption 1994 142 Frank Darabont Drama 9.3 2874036 26728534.8
2 The Godfather 1972 175 Francis Ford Coppola Drama 9.2 2001828 18416817.6
4 The Lord of the Rings: The Return of the King 2003 201 Peter Jackson Action 9.0 1969425 17724825.0
In [16]:
totalRating_outliers = (quantitative_columns["totalRating"] < (Q1["totalRating"] - 1.5 * IQR["totalRating"])) | (quantitative_columns["totalRating"] > (Q3["totalRating"] + 1.5 * IQR["totalRating"]))
print("Here are the totalRating outliers in the dataset:")
movies[totalRating_outliers].head(3)
Here are the totalRating outliers in the dataset:
Out[16]:
title year runtime director[s] genre averageRating numVotes totalRating
rank
1 The Shawshank Redemption 1994 142 Frank Darabont Drama 9.3 2874036 26728534.8
2 The Godfather 1972 175 Francis Ford Coppola Drama 9.2 2001828 18416817.6
4 The Lord of the Rings: The Return of the King 2003 201 Peter Jackson Action 9.0 1969425 17724825.0

Our outlier analysis is quite revealing. Notice that the year outliers are all older movies; this is because the majority of the top 1000 movies are relatively recent.

The runtime outliers are all long movies. This makes sense: there is a practical lower bound on how short a feature film can be, but no comparable upper bound on runtime.

The averageRating outliers are perhaps the most interesting, since they are exactly the top-ranked movies. The top movies rate well above the rest of the dataset, where most ratings sit between roughly 7.5 and 8.0.

Notice how the numVotes and totalRating outlier sets are mostly the same. This is because totalRating is computed directly from numVotes. Both sets are also dominated by the top ranks, since the better a movie is, the more people watch it and vote on it (an idea explored further later).
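
As a quick numeric check (a sketch, not part of the original analysis), we can rebuild all five outlier masks with one small helper and count how many movies are outliers in both numVotes and totalRating:

In [ ]:
# Helper: 1.5*IQR outlier mask for one column, using the Q1, Q3, and IQR computed above
def iqr_outliers(col):
    low = Q1[col] - 1.5 * IQR[col]
    high = Q3[col] + 1.5 * IQR[col]
    return (quantitative_columns[col] < low) | (quantitative_columns[col] > high)

masks = {col: iqr_outliers(col) for col in quantitative_columns.columns}
shared = (masks["numVotes"] & masks["totalRating"]).sum()
print(shared, "movies are outliers in both numVotes and totalRating")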

SUMMARY DATA ANALYSIS¶

Now that our "movies" dataframe is preprocessed, it is ready for analysis. In our summary analysis, we will identify summary statistics like means, medians, standard deviations, and z-scores. Then, we will showcase several distributions, including ECDFs, histograms, facet plots, box plots, and violin plots. Finally, we will explore correlations using line plots, scatter plots, and correlation coefficients.

1. Summary Statistics¶

The basic summary statistics are means, medians, and standard deviations. We will compute them (along with the mins and maxes) using the describe method and then interpret them:

In [17]:
movies_statistics = movies.describe()
movies_statistics = movies_statistics.loc[["mean", "min", "50%", "max", "std"]]
movies_statistics = movies_statistics.rename(index={"50%": "median"})
movies_statistics = movies_statistics.round(2)
print(movies_statistics)
    
           year  runtime  averageRating    numVotes  totalRating
mean    1996.81   125.16           7.88   341631.07   2756261.85
min     1921.00    45.00           7.50    50054.00    379132.50
median  2003.00   122.00           7.80   191176.50   1484413.90
max     2024.00   321.00           9.30  2874036.00  26728534.80
std       21.29    27.94           0.32   387117.15   3303533.92

These summary statistics show us a lot about the data.

Notice how the mean year is much closer to the max year than to the min year, further evidence that most movies on this list are recent.

Similarly, the mean averageRating is much closer to the min rating than to the max rating, further showing that most movies here hover around an averageRating of 7.5 to 8.0, with the top-ranked movies as outliers.

Notice how totalRating is still roughly equal to averageRating × numVotes for the mean, min, median, and max rows, but not for the std row: the standard deviation of a product is not the product of the standard deviations.

The stds are, unsurprisingly, much larger for numVotes and totalRating. Unlike year and runtime, these values are not confined to a span of roughly 100 to 300 units; they range from 50,000 into the millions.
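
As a quick sanity check (a sketch, not in the original notebook), we can compare the product of the column means with the mean of totalRating, and the product of the column stds with the std of totalRating:

In [ ]:
# The means multiply out to roughly the mean of totalRating...
print(movies["averageRating"].mean() * movies["numVotes"].mean(), movies["totalRating"].mean())
# ...but the stds do not multiply out to the std of totalRating
print(movies["averageRating"].std() * movies["numVotes"].std(), movies["totalRating"].std())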

The z-scores are a separate summary statistic. They are computed and interpreted here:

In [18]:
means = quantitative_columns.mean()
stds = quantitative_columns.std()
z_scores = (quantitative_columns - means) / stds
print(z_scores)
          year   runtime  averageRating  numVotes  totalRating
rank                                                          
1    -0.131776  0.602522       4.493881  6.541702     7.256554
2    -1.164943  1.783588       4.176696  4.288616     4.740546
3     1.230126  0.781472       3.542325 -0.608480    -0.545343
4     0.290883  2.714124       3.542325  4.204913     4.531076
5    -1.071018  2.749914       3.542325  2.625580     2.865441
...        ...       ...            ...       ...          ...
996   0.666581 -1.151181      -1.215454  0.532851     0.409574
997  -0.131776 -0.077485      -1.215454  0.011177    -0.048910
998   0.431770  0.638312      -1.215454 -0.064730    -0.115623
999  -0.178738 -0.149065      -1.215454 -0.637588    -0.619091
1000 -0.178738 -1.008021      -1.215454 -0.716659    -0.688584

[1000 rows x 5 columns]

The z-scores tell a different story.

Notice that since the data is sorted by averageRating, the z-scores in that column decrease monotonically down the list.

Notice how the z-scores of numVotes closely track those of totalRating, which makes sense because totalRating is nearly proportional to numVotes (the averageRating factor varies comparatively little).

2. Distributions¶

Distributions of our movie data can be seen graphically through ECDFs, histograms, facet plots, box plots, and violin plots. Creating these visuals with certain conditions will reveal fascinating new aspects of the data.

First, we will plot an ECDF of the year column to confirm that most movies on this list are recent. The ECDF will show that a far larger proportion of movies come from 2000-2024 than from earlier years.

In [19]:
sns.displot(movies["year"], kind="ecdf", height=3, aspect=2)
Out[19]:
<seaborn.axisgrid.FacetGrid at 0x103c26f50>
[Figure: ECDF of the year column]

Now, we will use a histogram (with a kde density line) to confirm that most movies are rated between 7.5 and 8.0, and that most of the high-ranked movies are outliers. The histogram and the density line will show that the majority of the count lies between 7.5 and 8.0.

In [20]:
sns.displot(movies["averageRating"], bins=20, kind="hist", kde=True, height=3, aspect=2)
Out[20]:
<seaborn.axisgrid.FacetGrid at 0x374ef6e10>
[Figure: histogram of averageRating with KDE curve]

These ECDFs and histograms apply to the entire dataset, as did the summary data statistics in the first part of this section. But what if we start restricting the data based on certain conditions?

Christopher Nolan is a highly regarded director with several films in the top 1000. Here, we will calculate the mean averageRating of his movies and compare it to the mean of the entire dataset. Then, we will display the histogram and kde density line for his averageRatings, which we can compare with the histogram we just made for all the averageRatings.

In [21]:
nolan_movies = movies[movies["director[s]"].str.contains("Christopher Nolan")]
nolan_mean = nolan_movies["averageRating"].mean()
print("Here is the mean averageRating of Christopher Nolan's movies:")
print(nolan_mean)
print("Here is the mean averageRating of all movies:")
print(means["averageRating"])
sns.displot(nolan_movies["averageRating"], bins=20, kind="hist", kde=True, height=3, aspect=2)
Here is the mean averageRating of Christopher Nolan's movies:
8.4625
Here is the mean averageRating of all movies:
7.8831999999999995
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x374ff1550>
[Figure: histogram of Christopher Nolan's averageRatings with KDE curve]

As you can see, the mean averageRating of Christopher Nolan's movies is much higher than the mean averageRating of all the movies in the dataset. Similarly, the density in the Christopher Nolan histogram is centered around 8.5, whereas it was centered around 7.75 for all movies. His films are consistently rated near the top of the list.
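
As a hedged follow-up (a sketch, not in the original analysis), we can extend this comparison to every frequently appearing director. Note that multi-director entries are treated as a single combined credit here, which is a simplification:

In [ ]:
# Mean averageRating for directors (or director teams) with at least 5 films in the top 1000
director_counts = movies["director[s]"].value_counts()
frequent = director_counts[director_counts >= 5].index
print(movies[movies["director[s]"].isin(frequent)]
      .groupby("director[s]")["averageRating"]
      .mean()
      .sort_values(ascending=False)
      .head(10))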

Besides looking at specific directors, we can also restrict our movies based on genre. This is where facet, box, and violin plots are very useful, since genre is a strictly categorical column.

First, we will print the count of each genre in the dataset. Then, we will analyze the numVotes in a facet plot with that count in mind.

In [22]:
print(movies["genre"].value_counts())
sns.displot(data=movies, x="numVotes", col="genre", height=3, aspect=1)
genre
Drama     539
Comedy    249
Action    212
Name: count, dtype: int64
Out[22]:
<seaborn.axisgrid.FacetGrid at 0x374f79990>
[Figure: facet plot of numVotes distributions by genre]

We can draw several conclusions from these graphs. First, movies in the Drama genre collect far more votes overall than Action and Comedy movies. This is partly because there are simply more Drama movies in the top 1000, and partly because Drama is a broad genre, so more movie reviewers watch and rate those films.

Another conclusion is that Comedy movies are generally overlooked by movie reviewers. Although Comedy and Action have similar counts, far more Comedy movies sit below 1,000,000 votes, while Action movies exceed 1,000,000 votes more often.
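
A quick numeric companion to the facet plot (a sketch, not in the original notebook) backs this up by summarizing numVotes per genre:

In [ ]:
# Count, median, and maximum number of votes for each genre
print(movies.groupby("genre")["numVotes"].agg(["count", "median", "max"]))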

To compare runtimes, we can use a box plot.

In [23]:
sns.catplot(data=movies, x="genre", y="runtime", kind="box", height=4, aspect=2)
Out[23]:
<seaborn.axisgrid.FacetGrid at 0x37528a250>
[Figure: box plot of runtime by genre]

This box plot is genuinely interesting. The stand-out feature is the Comedy box, which sits noticeably lower (and is more compact) than the others, showing that Comedy movies tend to be shorter than Drama and Action movies. Also note that Action movies have a much higher minimum runtime than the rest.
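
A short numeric check of the same reading (a sketch, not in the original notebook), using per-genre runtime quartiles:

In [ ]:
# Five-number-style summary of runtime within each genre
print(movies.groupby("genre")["runtime"].describe()[["min", "25%", "50%", "75%", "max"]])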

A violin plot can be used to represent our last column, totalRating.

In [24]:
sns.catplot(data=movies, x="totalRating", y="genre", kind="violin", height=4, aspect=2)
Out[24]:
<seaborn.axisgrid.FacetGrid at 0x374fcdf50>
[Figure: violin plot of totalRating by genre]

The total ratings, which combine the number of votes and the average ratings, are a good indicator of a genre's popularity. As you can see, Comedy films are typically less popular with movie reviewers, since their totalRating distribution is concentrated at low values. It is also clear that Action movies span a very wide range of popularity.

3. Correlations¶

Unlike distributions, which show trends of our quantitative columns based on categorical data, correlations of our movie data will show how quantitative columns relate to each other. We will show several correlations through line plots, scatter plots, and correlation coefficients.

First, we will use a line plot (with a standard deviation error bar) to compare the average ratings to the number of votes.

In [25]:
sns.relplot(data=movies, x="averageRating", y="numVotes", kind="line", errorbar="sd", height=4, aspect=2)
Out[25]:
<seaborn.axisgrid.FacetGrid at 0x3753a9010>
[Figure: line plot of numVotes versus averageRating with standard-deviation error band]

This line plot shows an interesting connection. On the surface, a movie's average rating should not be directly tied to its number of votes; however, an upward-sloping line makes sense, because a highly rated movie tempts more reviewers to watch and rate it.
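
To put a number on that relationship (a sketch, not in the original notebook), we can compute a rank correlation between the two columns:

In [ ]:
# Spearman rank correlation between averageRating and numVotes
print(movies["averageRating"].corr(movies["numVotes"], method="spearman"))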

Next, we will use a scatter plot (with genres as the hue) to compare movie years to their runtimes.

In [26]:
sns.relplot(data=movies, x="year", y="runtime", hue="genre", height=4, aspect=2)
Out[26]:
<seaborn.axisgrid.FacetGrid at 0x37549e090>
[Figure: scatter plot of runtime versus year, colored by genre]

This scatter plot shows that movie runtimes have generally gotten longer over time. As the genre hue shows, the growing share of Action movies in recent years was a major factor pushing runtimes up, since those movies are typically longer.
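
A hedged numeric check of the trend (not part of the original notebook): mean runtime by decade, where the decade is a derived helper value rather than a column of the frame:

In [ ]:
# Group the movies by decade of release and average their runtimes
decade = (movies["year"] // 10) * 10
print(movies.groupby(decade)["runtime"].mean().round(1))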

Graphs are not the only way to display correlation; we can use a Pearson correlation coefficient as well. Here, we compute the Pearson correlation between the number of votes and the total ratings. It will be close to 1, since totalRating is computed directly from numVotes.

In [27]:
movies[["numVotes", "totalRating"]].corr()
Out[27]:
numVotes totalRating
numVotes 1.00000 0.99795
totalRating 0.99795 1.00000

The Spearman correlation coefficient is less sensitive to outliers than Pearson's. We will use it to compare the movies' years to their average ratings.

In [28]:
print(movies["year"].corr(movies["averageRating"], "spearman"))
-0.13562520803957584

This correlation is negative, though weak (about -0.14), which suggests that within this top-1000 list, average ratings have drifted slightly downward over the years. That is an intriguing result!

DISCUSSION¶

After our summary data analysis, it is worth posing questions about the dataset that can be explored in future work. One question concerns the prediction of a categorical outcome, and the other concerns the prediction of a quantitative outcome.

1. Can the year, runtime, average rating, and total rating of a movie, along with the name of the director, predict a movie's genre?¶

This question relates to a categorical outcome. We've already deduced during our analysis that higher runtimes are common for Action movies. We also found that Comedy movies tend to have lower runtimes and total ratings (which shows that they are less popular and taken less seriously). Using these observations, combined with observations regarding years, average ratings, and the director's name (certain directors only focus on a select few genres), we may be able to predict a movie's genre in a future project.
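
One minimal sketch of how this question might be attacked, assuming scikit-learn is available (it is not used anywhere else in this project); the director's name is omitted for simplicity, and no real model selection is attempted:

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Predict genre from the quantitative columns only (a deliberately simple baseline)
X = movies[["year", "runtime", "averageRating", "totalRating"]]
y = movies["genre"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

clf = DecisionTreeClassifier(max_depth=4, random_state=0)
clf.fit(X_train, y_train)
print("Held-out accuracy:", clf.score(X_test, y_test))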

2. Do a movie's year, average rating, number of votes, and total rating, combined with the director's name and the genre, predict the movie's runtime?¶

This question relates to a quantitative outcome. During our summary data analysis, we determined that as time went on (as the years got more modern), movie runtimes generally got longer. It was also found that Comedy movies are much shorter in length than Drama and Action movies. We even noted that Action movies have a much higher minimum runtime than the rest. Using these observations about years and genres, along with observations regarding average ratings, numbers of votes, total ratings, and director trends, we should look into predicting a movie's runtime in the future.
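
A comparable sketch for the quantitative question, again assuming scikit-learn and again leaving the director's name out for brevity; the genre is one-hot encoded so it can feed a linear model:

In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# One-hot encode genre alongside the quantitative predictors
features = pd.get_dummies(movies[["year", "averageRating", "numVotes", "totalRating", "genre"]],
                          columns=["genre"])
target = movies["runtime"]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=0)

reg = LinearRegression().fit(X_train, y_train)
print("Held-out R^2:", reg.score(X_test, y_test))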