Introduction
The data come from two sources: BechdelTest.com and TheNumbers.com. Together they cover 1,794 movies released from 1970 to 2013, collected to observe the relationship between the presentation of women in a film and the film's profits. Gender bias in films is denoted by whether or not each film passes the Bechdel test: a film passes if there are at least two named women in the picture, they have a conversation with each other at some point, and that conversation isn't about a male character.
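As a hypothetical illustration (not part of the dataset's own coding), the three Bechdel criteria can be expressed as a simple predicate:

```python
# Toy illustration of the Bechdel test's three criteria.
def passes_bechdel(named_women: int, women_talk: bool, about_a_man: bool) -> bool:
    """Return True only if all three Bechdel criteria are met."""
    return named_women >= 2 and women_talk and not about_a_man

print(passes_bechdel(2, True, False))  # meets all three criteria -> True
print(passes_bechdel(2, True, True))   # conversation is about a man -> False
```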
import pandas as pd
bechdel = pd.read_csv("movies.csv")
Preprocessing
- Drop the test, imdb, title, and code columns (clean_test is kept for later analysis):
bechdel.drop(columns=["test", "imdb", "title", "code"], inplace=True)
bechdel.head()
| | year | clean_test | binary | budget | domgross | intgross | budget_2013$ | domgross_2013$ | intgross_2013$ | period code | decade code |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013 | notalk | FAIL | 13000000 | 25682380.0 | 42195766.0 | 13000000 | 25682380.0 | 42195766.0 | 1.0 | 1.0 |
| 1 | 2012 | ok | PASS | 45000000 | 13414714.0 | 40868994.0 | 45658735 | 13611086.0 | 41467257.0 | 1.0 | 1.0 |
| 2 | 2013 | notalk | FAIL | 20000000 | 53107035.0 | 158607035.0 | 20000000 | 53107035.0 | 158607035.0 | 1.0 | 1.0 |
| 3 | 2013 | notalk | FAIL | 61000000 | 75612460.0 | 132493015.0 | 61000000 | 75612460.0 | 132493015.0 | 1.0 | 1.0 |
| 4 | 2013 | men | FAIL | 40000000 | 95020213.0 | 95020213.0 | 40000000 | 95020213.0 | 95020213.0 | 1.0 | 1.0 |
- Convert binary column to True/False dummy variables:
bechdel = pd.get_dummies(bechdel, columns=['binary'], drop_first=True)
bechdel.head()
| | year | clean_test | budget | domgross | intgross | budget_2013$ | domgross_2013$ | intgross_2013$ | period code | decade code | binary_PASS |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013 | notalk | 13000000 | 25682380.0 | 42195766.0 | 13000000 | 25682380.0 | 42195766.0 | 1.0 | 1.0 | False |
| 1 | 2012 | ok | 45000000 | 13414714.0 | 40868994.0 | 45658735 | 13611086.0 | 41467257.0 | 1.0 | 1.0 | True |
| 2 | 2013 | notalk | 20000000 | 53107035.0 | 158607035.0 | 20000000 | 53107035.0 | 158607035.0 | 1.0 | 1.0 | False |
| 3 | 2013 | notalk | 61000000 | 75612460.0 | 132493015.0 | 61000000 | 75612460.0 | 132493015.0 | 1.0 | 1.0 | False |
| 4 | 2013 | men | 40000000 | 95020213.0 | 95020213.0 | 40000000 | 95020213.0 | 95020213.0 | 1.0 | 1.0 | False |
- Detect and replace missing values:
print("Number of missing values per column")
print(bechdel.isnull().sum())
replace_columns = [ "domgross", "intgross", "domgross_2013$", "intgross_2013$"]
bechdel[replace_columns] = bechdel[replace_columns].fillna(bechdel[replace_columns].mean())
bechdel.loc[(bechdel["decade code"].isnull()) & (bechdel["year"].between(1980, 1989)), "decade code"] = 4
bechdel.loc[(bechdel["decade code"].isnull()) & (bechdel["year"].between(1970, 1979)), "decade code"] = 5
bechdel.loc[(bechdel["period code"].isnull()) & (bechdel["year"].between(1985, 1989)), "period code"] = 6
bechdel.loc[(bechdel["period code"].isnull()) & (bechdel["year"].between(1980, 1984)), "period code"] = 7
bechdel.loc[(bechdel["period code"].isnull()) & (bechdel["year"].between(1975, 1979)), "period code"] = 8
bechdel.loc[(bechdel["period code"].isnull()) & (bechdel["year"].between(1970, 1974)), "period code"] = 9
Number of missing values per column
year                0
clean_test          0
budget              0
domgross           17
intgross           11
budget_2013$        0
domgross_2013$     18
intgross_2013$     11
period code       179
decade code       179
binary_PASS         0
dtype: int64
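The fill-with-the-mean step above can be sanity-checked on a tiny synthetic frame (a sketch standing in for the real data; the column name is the only thing borrowed from the dataset):

```python
import pandas as pd

# Tiny stand-in for the bechdel frame, with one missing value.
df = pd.DataFrame({"domgross": [1.0, None, 3.0], "year": [1972, 1986, 2011]})

# Impute the missing entry with the column mean, as in the preprocessing above.
df["domgross"] = df["domgross"].fillna(df["domgross"].mean())

# After imputation, no missing values should remain anywhere.
assert df.isnull().sum().sum() == 0
print(df["domgross"].tolist())  # [1.0, 2.0, 3.0]
```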
- Outlier analysis:
import numpy as np
quant_columns = ["budget", "domgross", "intgross", "budget_2013$", "domgross_2013$", "intgross_2013$"]
def count_outliers(x):
Q25, Q75 = x.quantile([.25, .75])
I = Q75 - Q25
outliers = (x < Q25 - (1.5 * I)) | (x > Q75 + (1.5 * I))
outliers_sum = outliers.sum()
return outliers_sum
outliers_count = bechdel[quant_columns].apply(count_outliers)
print("Number of outliers per quantitative column:")
print(outliers_count)
outliers_percentage = (outliers_count / len(bechdel)) * 100
print("Percentage of outliers per quantitative column:")
print(outliers_percentage)
Number of outliers per quantitative column:
budget            121
domgross          117
intgross          146
budget_2013$       86
domgross_2013$    128
intgross_2013$    164
dtype: int64
Percentage of outliers per quantitative column:
budget            6.744705
domgross          6.521739
intgross          8.138239
budget_2013$      4.793757
domgross_2013$    7.134894
intgross_2013$    9.141583
dtype: float64
def is_outlier(x):
Q25, Q75 = x.quantile([.25, .75])
I = Q75 - Q25
return (x < Q25 - (1.5 * I)) | (x > Q75 + (1.5 * I))
outliers = bechdel[quant_columns].apply(is_outlier)
cleaned = bechdel[quant_columns].where(~outliers)  # outlier cells become NaN and drop out of mean()
print( "Means with outliers:" )
print( bechdel[quant_columns].mean() )
print( "\nMeans without outliers:" )
print( cleaned[quant_columns].mean() )
Means with outliers:
budget            4.482646e+07
domgross          6.913205e+07
intgross          1.503857e+08
budget_2013$      5.546461e+07
domgross_2013$    9.517478e+07
intgross_2013$    1.978380e+08
dtype: float64

Means without outliers:
budget            3.508815e+07
domgross          5.310973e+07
intgross          1.010856e+08
budget_2013$      4.731704e+07
domgross_2013$    6.891667e+07
intgross_2013$    1.273543e+08
dtype: float64
Each quantitative column has around 6-9% of its entries flagged as outliers, except budget_2013$, at about 5%. Comparing the two series above, the outliers skew every column to the right: the mean drops in each column once they are removed. We will therefore use the median as a better representation of the center of the data.
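The right-skew argument can be illustrated on a toy series (values are made up for illustration): a single large value drags the mean far more than the median.

```python
import pandas as pd

# Skewed toy column, in millions: the 400 is an outlier on the right.
budgets = pd.Series([10, 12, 15, 20, 400])

print(budgets.mean())    # 91.4 -- pulled right by the outlier
print(budgets.median())  # 15.0 -- robust to the outlier
```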
Summary Data Analysis
We can see from the graph below that most data collected came from movies in the 21st century.
import matplotlib.pyplot as plt
movie_counts_by_year = bechdel.groupby("year").size()
sorted_movie_counts = movie_counts_by_year.sort_index()
sorted_movie_counts.plot(kind='bar', figsize=(10, 6))
plt.xlabel('Year')
plt.ylabel('Movie Counts')
plt.title('Movie Counts by Year')
plt.show()
The column binary_PASS holds boolean values indicating whether or not a movie passes the Bechdel test. The graph below shows that, among the movies recorded, almost 200 more movies fail than pass.
pass_counts = bechdel['binary_PASS'].value_counts()
ax = pass_counts.plot(kind='bar')
plt.xlabel('Passing Test')
plt.ylabel('Movies')
plt.title('Number of Movies Passing the Bechdel Test')
for i, count in enumerate(pass_counts):
ax.text(i, count, str(count), ha='center', va='bottom')
plt.show()
There are three columns in the data that represent periods of time: year, period code, and decade code. period code assigns five-year intervals counting backwards from 1 (2010-2014) to 9 (1970-1974). decade code assigns decades counting backwards from 1 (the 2010s) to 5 (the 1970s).
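Under this coding, a year maps to its codes by simple integer arithmetic. A sketch (assuming the backwards-counting scheme implied by the recoding rules in the preprocessing step):

```python
def period_code(year: int) -> int:
    # Five-year bins counting backwards: 2010-2014 -> 1, ..., 1970-1974 -> 9.
    return 9 - (year - 1970) // 5

def decade_code(year: int) -> int:
    # Decades counting backwards: the 2010s -> 1, ..., the 1970s -> 5.
    return 5 - (year - 1970) // 10

print(period_code(2013), decade_code(2013))  # 1 1, matching the table head above
print(period_code(1972), decade_code(1972))  # 9 5, matching the fill rules above
```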
The three line plots below show the percentage of movies passing the Bechdel test when grouped by each of these columns. The first graph is more sensitive to outliers, which makes it difficult to see a trend. However, once we break the data down by periods and then decades, we can see a steady increase in percentages over the years. The decade with the largest increase in test passes is decade code 3, the 1990s. After 2010 there is a slight decrease, but over 40% of movies are still passing.
import matplotlib.pyplot as plt
fig, axes = plt.subplots(3, 1, figsize=(9, 12))
pass_percentage_by_year = (bechdel.groupby("year")["binary_PASS"].sum() / bechdel.groupby("year")["binary_PASS"].count()) * 100
pass_percentage_by_year.plot(kind='line', ax=axes[0])
axes[0].set_xlabel('Year')
axes[0].set_ylabel('Pass Percentage')
axes[0].set_title('Bechdel Test Pass Percentage by Year')
pass_percentage_by_period = (bechdel.groupby("period code")["binary_PASS"].sum() / bechdel.groupby("period code")["binary_PASS"].count()) * 100
pass_percentage_by_period.plot(kind='line', ax=axes[1])
axes[1].set_xlabel('Period Code')
axes[1].set_ylabel('Pass Percentage')
axes[1].set_title('Bechdel Test Pass Percentage by Period Code')
axes[1].invert_xaxis() # Reverse x-axis
pass_percentage_by_decade = (bechdel.groupby("decade code")["binary_PASS"].sum() / bechdel.groupby("decade code")["binary_PASS"].count()) * 100
pass_percentage_by_decade.plot(kind='line', ax=axes[2])
axes[2].set_xlabel('Decade Code')
axes[2].set_ylabel('Pass Percentage')
axes[2].set_title('Bechdel Test Pass Percentage by Decade Code')
axes[2].invert_xaxis() # Reverse x-axis
plt.tight_layout()
plt.show()
The column budget represents a movie's budget in dollars, and the column budget_2013$ represents the same budget adjusted for inflation to 2013 dollars. The summary statistics for budget and budget_2013$ are shown below. From the coefficients of variation and the difference between maximum and minimum values, we can see there is a lot of variation in both columns.
import numpy as np
print(bechdel[['budget', 'budget_2013$']].describe())
budget = bechdel['budget']
budget_2013 = bechdel['budget_2013$']
cv_budget = np.std(budget) / np.mean(budget)
cv_budget_2013 = np.std(budget_2013) / np.mean(budget_2013)
print("\nThe coefficient of variation for budget is:")
print(cv_budget)
print("The coefficient of variation for budget_2013$ is:")
print(cv_budget_2013)
             budget  budget_2013$
count  1.794000e+03  1.794000e+03
mean   4.482646e+07  5.546461e+07
std    4.818603e+07  5.491864e+07
min    7.000000e+03  8.632000e+03
25%    1.200000e+07  1.606892e+07
50%    2.800000e+07  3.699579e+07
75%    6.000000e+07  7.833790e+07
max    4.250000e+08  4.614359e+08

The coefficient of variation for budget is:
1.0746463509357185
The coefficient of variation for budget_2013$ is:
0.9898803723687786
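One caveat on the coefficients above: np.std defaults to the population standard deviation (ddof=0), while DataFrame.describe() reports the sample standard deviation (ddof=1), so the printed CV differs slightly from what the describe table implies. A small sketch of the difference, using the budget quartile values from the table:

```python
import numpy as np

# Five budget figures from the describe table above (min, quartiles, max).
x = np.array([7_000.0, 12_000_000.0, 28_000_000.0, 60_000_000.0, 425_000_000.0])

pop_std = np.std(x)             # ddof=0, the default used in the CV code above
sample_std = np.std(x, ddof=1)  # ddof=1, as reported by DataFrame.describe()

print(sample_std > pop_std)  # True: dividing by n-1 always gives the larger value
```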
The two graphs below compare the median budgets (adjusted and not adjusted for inflation) by period for movies that pass and do not pass the Bechdel test. For most periods, movies that do not pass have a significantly larger budget. The periods that show a higher budget for passing movies are the older ones, which, as shown earlier, have far fewer movies to draw on. Periods 7-9 may therefore not be representative, since we lack a substantial sample size. However, the difference in budgets across periods 1-6 is still drastic.
pass_by_period_budget = bechdel[bechdel['binary_PASS'] == 1].groupby('period code')['budget'].median() / 1000000
not_pass_by_period_budget = bechdel[bechdel['binary_PASS'] == 0].groupby('period code')['budget'].median() / 1000000
pass_by_period_budget_2013 = bechdel[bechdel['binary_PASS'] == 1].groupby('period code')['budget_2013$'].median() / 1000000
not_pass_by_period_budget_2013 = bechdel[bechdel['binary_PASS'] == 0].groupby('period code')['budget_2013$'].median() / 1000000
fig, axes = plt.subplots(2, 1, figsize=(10, 6))
pass_by_period_budget.plot(kind='bar', ax=axes[0], label='Movies Passing', alpha=0.5, position=0, width=0.4)
not_pass_by_period_budget.plot(kind='bar', ax=axes[0], color='red', label='Movies Not Passing', alpha=0.5, position=1, width=0.4)
axes[0].set_xlabel('Period Code')
axes[0].set_ylabel('Millions of Dollars')
axes[0].set_title('Median Budget for Movies by Period Code')
axes[0].legend()
pass_by_period_budget_2013.plot(kind='bar', ax=axes[1], label='Movies Passing', alpha=0.5, position=0, width=0.4)
not_pass_by_period_budget_2013.plot(kind='bar', ax=axes[1], color='red', label='Movies Not Passing', alpha=0.5, position=1, width=0.4)
axes[1].set_xlabel('Period Code')
axes[1].set_ylabel('Millions of Dollars')
axes[1].set_title('Median Budget for Movies by Period Code (Adjusted for Inflation)')
axes[1].legend()
plt.tight_layout()
plt.show()
The column domgross represents a movie's domestic gross in dollars, and the column domgross_2013$ represents the same domestic gross adjusted for inflation to 2013 dollars. The summary statistics for domgross and domgross_2013$ are shown below. From the coefficients of variation and the difference between maximum and minimum values, we can see there is a lot of variation in both columns, and more variation than in the budget columns.
import numpy as np
print(bechdel[['domgross', 'domgross_2013$']].describe())
domgross = bechdel['domgross']
domgross_2013 = bechdel['domgross_2013$']
cv_domgross = np.std(domgross) / np.mean(domgross)
cv_domgross_2013 = np.std(domgross_2013) / np.mean(domgross_2013)
print("\nThe coefficient of variation for domgross is:")
print(cv_domgross)
print("The coefficient of variation for domgross_2013$ is:")
print(cv_domgross_2013)
           domgross  domgross_2013$
count  1.794000e+03    1.794000e+03
mean   6.913205e+07    9.517478e+07
std    7.998541e+07    1.253315e+08
min    0.000000e+00    8.990000e+02
25%    1.667140e+07    2.067093e+07
50%    4.302788e+07    5.648654e+07
75%    9.216850e+07    1.207983e+08
max    7.605076e+08    1.771683e+09

The coefficient of variation for domgross is:
1.1566721164104834
The coefficient of variation for domgross_2013$ is:
1.3164887594090666
The two graphs below compare the median domestic grosses (adjusted and not adjusted for inflation) by period for movies that pass and do not pass the Bechdel test. As with the previous plots, we lack enough samples from periods 7-9 to observe a trend in domestic grosses for those periods. For periods 1-6, however, movies that pass the Bechdel test have around the same, if not higher, domestic grosses than movies not passing. So although passing movies tended to have lower budgets, their inflation-adjusted domestic grosses matched those of movies not passing.
pass_by_period_domgross = bechdel[bechdel['binary_PASS'] == 1].groupby('period code')['domgross'].median() / 1000000
not_pass_by_period_domgross = bechdel[bechdel['binary_PASS'] == 0].groupby('period code')['domgross'].median() / 1000000
pass_by_period_domgross_2013 = bechdel[bechdel['binary_PASS'] == 1].groupby('period code')['domgross_2013$'].median() / 1000000
not_pass_by_period_domgross_2013 = bechdel[bechdel['binary_PASS'] == 0].groupby('period code')['domgross_2013$'].median() / 1000000
fig, axes = plt.subplots(2, 1, figsize=(10, 6))
width = 0.4
x = np.arange(len(pass_by_period_domgross))
pass_bars = axes[0].bar(x - width/2, pass_by_period_domgross, width, label='Movies Passing', alpha=0.5)
not_pass_bars = axes[0].bar(x + width/2, not_pass_by_period_domgross, width, color='red', label='Movies Not Passing', alpha=0.5)
axes[0].set_xlabel('Period Code')
axes[0].set_ylabel('Millions of Dollars')
axes[0].set_title('Median Domestic Gross for Movies by Period Code')
axes[0].set_xticks(x)
axes[0].set_xticklabels(pass_by_period_domgross.index)
axes[0].legend()
pass_bars_2013 = axes[1].bar(x - width/2, pass_by_period_domgross_2013, width, label='Movies Passing', alpha=0.5)
not_pass_bars_2013 = axes[1].bar(x + width/2, not_pass_by_period_domgross_2013, width, color='red', label='Movies Not Passing', alpha=0.5)
axes[1].set_xlabel('Period Code')
axes[1].set_ylabel('Millions of Dollars')
axes[1].set_title('Median Domestic Gross for Movies by Period Code (Adjusted for Inflation)')
axes[1].set_xticks(x)
axes[1].set_xticklabels(pass_by_period_domgross_2013.index)
axes[1].legend()
# Adjust the layout to create space between the bars
fig.tight_layout()
plt.show()
The column intgross represents a movie's international gross in dollars, and the column intgross_2013$ represents the same international gross adjusted for inflation to 2013 dollars. The summary statistics for intgross and intgross_2013$ are shown below. From the coefficients of variation and the difference between maximum and minimum values, we can see there is a lot of variation in both columns.
import numpy as np
print(bechdel[['intgross', 'intgross_2013$']].describe())
intgross = bechdel['intgross']
intgross_2013 = bechdel['intgross_2013$']
cv_intgross = np.std(intgross) / np.mean(intgross)
cv_intgross_2013 = np.std(intgross_2013) / np.mean(intgross_2013)
print("\nThe coefficient of variation for intgross is:")
print(cv_intgross)
print("The coefficient of variation for intgross_2013$ is:")
print(cv_intgross_2013)
           intgross  intgross_2013$
count  1.794000e+03    1.794000e+03
mean   1.503857e+08    1.978380e+08
std    2.096891e+08    2.826370e+08
min    8.280000e+02    8.990000e+02
25%    2.632845e+07    3.357618e+07
50%    7.751440e+07    9.722534e+07
75%    1.880873e+08    2.406311e+08
max    2.783919e+09    3.171931e+09

The coefficient of variation for intgross is:
1.3939531818127984
The coefficient of variation for intgross_2013$ is:
1.4282301257903052
The two graphs below compare the median international grosses (adjusted and not adjusted for inflation) by period for movies that pass and do not pass the Bechdel test. Again, we lack enough samples from periods 7-9 to observe a trend in international grosses for those periods. Looking at periods 1-6, we see that movies not passing the Bechdel test have a higher gross, and the gap between passing and not passing is larger internationally than domestically.
After analyzing these three sets of plots, we can conclude that for periods 1-6, movies that pass the Bechdel test generally have lower budgets but around the same, if not higher, domestic grosses, while movies that do not pass have notably higher international grosses.
pass_by_period_intgross = bechdel[bechdel['binary_PASS'] == 1].groupby('period code')['intgross'].median() / 1000000
not_pass_by_period_intgross = bechdel[bechdel['binary_PASS'] == 0].groupby('period code')['intgross'].median() / 1000000
pass_by_period_intgross_2013 = bechdel[bechdel['binary_PASS'] == 1].groupby('period code')['intgross_2013$'].median() / 1000000
not_pass_by_period_intgross_2013 = bechdel[bechdel['binary_PASS'] == 0].groupby('period code')['intgross_2013$'].median() / 1000000
fig, axes = plt.subplots(2, 1, figsize=(10, 6))
width = 0.4
x = np.arange(len(pass_by_period_intgross))
pass_bars = axes[0].bar(x - width/2, pass_by_period_intgross, width, label='Movies Passing', alpha=0.5)
not_pass_bars = axes[0].bar(x + width/2, not_pass_by_period_intgross, width, color='red', label='Movies Not Passing', alpha=0.5)
axes[0].set_xlabel('Period Code')
axes[0].set_ylabel('Millions of Dollars')
axes[0].set_title('Median International Gross for Movies by Period Code')
axes[0].set_xticks(x)
axes[0].set_xticklabels(pass_by_period_intgross.index)
axes[0].legend()
pass_bars_2013 = axes[1].bar(x - width/2, pass_by_period_intgross_2013, width, label='Movies Passing', alpha=0.5)
not_pass_bars_2013 = axes[1].bar(x + width/2, not_pass_by_period_intgross_2013, width, color='red', label='Movies Not Passing', alpha=0.5)
axes[1].set_xlabel('Period Code')
axes[1].set_ylabel('Millions of Dollars')
axes[1].set_title('Median International Gross for Movies by Period Code (Adjusted for Inflation)')
axes[1].set_xticks(x)
axes[1].set_xticklabels(pass_by_period_intgross_2013.index)
axes[1].legend()
fig.tight_layout()
plt.show()
The bar graph below compares median budget and domestic and international gross by different components of the Bechdel test. All five groups of bars look very similar. Therefore, the median budget, domestic gross, and international gross of movies tend to be close, regardless of what component(s) of the Bechdel test the movies are passing. Although all components have very close results, the movies with the highest budget and grosses include women, but the women do not talk.
import matplotlib.pyplot as plt
quant_columns = ["budget", "domgross", "intgross"]
grouped_data = bechdel.groupby(['binary_PASS', 'clean_test'])[quant_columns].median()
labels = {
    (True, 'ok'): 'Passes',
    (False, 'dubious'): 'Fails, dubious',  # fifth component in the chart
    (False, 'nowomen'): 'Fails, no women',
    (False, 'notalk'): "Fails, don't talk",
    (False, 'men'): 'Fails, talk about men only'
}
grouped_data = grouped_data / 1000000
grouped_data.plot(kind='barh', figsize=(10, 6))
plt.xlabel('Millions of Dollars')
plt.ylabel('Components of the Bechdel Test')
plt.title('Median Budget, Domestic Gross, and International Gross for Movies')
plt.legend()
plt.yticks(range(len(grouped_data.index)), [labels.get(label, label) for label in grouped_data.index])
plt.show()
The correlation graphs below show the relationship between domestic and international gross (adjusted for inflation). Each point is colored by whether or not it passes the Bechdel test. There is a clear positive correlation between both of these columns, with an even mix of blue and orange dots. Therefore, the correlation between domestic and international gross is positive, regardless of whether or not the movies pass or do not pass the test.
import seaborn as sns
columns = ["domgross_2013$", "intgross_2013$"]
quantitative_columns = bechdel[columns + ["binary_PASS"]]
sns.pairplot(data=quantitative_columns, vars=columns, hue="binary_PASS", height=2, diag_kind="kde")
The correlation heatmap below shows the correlation between every quantitative column. Overall, most correlations are greater than 0, except for the column and row associated with binary_PASS. Those correlations are all below zero, meaning that movies passing the Bechdel test tend to have lower budgets, domestic grosses, and international grosses.
import seaborn as sns
import matplotlib.pyplot as plt
quant_columns = ["budget", "domgross", "intgross", "budget_2013$", "domgross_2013$", "intgross_2013$"]
correlation_matrix = bechdel[quant_columns + ["binary_PASS"]].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()
Discussion
Question 1:
Do budget, domestic gross, and international gross (adjusted and not adjusted for inflation) of movies predict the presence of women in films the following year?
Question 2:
Can a movie's international gross be predicted by its budget, domestic gross (adjusted and not adjusted for inflation), and whether or not the movie passes the Bechdel test?
To address both of these questions, it is important to collect additional data on movies released from 1970 to 2000, as well as new data on movies from 2014 to the present. The lack of data from older periods makes it difficult to observe significant trends and correlations over the years. With a bigger sample of movies, we will be able to form a better understanding of how movies passing the Bechdel test perform from year to year.