import warnings
warnings.simplefilter(action='ignore',category=FutureWarning)
warnings.simplefilter(action='ignore',category=DeprecationWarning)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV; the path is relative to the current working directory.
projectdata = 'nba_elo_latest.csv'
data = pd.read_csv(projectdata)
# Bare expression: the preview is only rendered when a notebook displays it.
data.head()

# Convert the 'date' column to pandas datetime values.
data['date'] = pd.to_datetime(data['date'])

# isna().mean() is the per-column fraction of missing cells; a column is
# discarded only when that fraction is strictly greater than the threshold.
threshold = 0.5
columns_to_remove = data.columns[data.isna().mean().gt(threshold)]

# Remove the sparse columns first, then every row that still has a gap.
data_cleaned = data.drop(columns=columns_to_remove).dropna()

# Summary statistics restricted to the numeric columns.
quantitative_columns = data_cleaned.select_dtypes(include=['number']).columns
outlier_analysis = data_cleaned.loc[:, quantitative_columns].describe()
print(outlier_analysis)

       season      neutral     elo1_pre     elo2_pre    elo_prob1  \
count  1320.0  1320.000000  1320.000000  1320.000000  1320.000000   
mean   2023.0     0.001515  1511.867655  1511.311315     0.627059   
std       0.0     0.038910    88.962661    89.571409     0.151149   
min    2023.0     0.000000  1264.103229  1257.300726     0.181341   
25%    2023.0     0.000000  1461.568345  1460.866530     0.533438   
50%    2023.0     0.000000  1524.876508  1523.136739     0.640681   
75%    2023.0     0.000000  1576.453207  1577.825540     0.736876   
max    2023.0     1.000000  1705.343075  1719.448667     0.933707   

         elo_prob2    elo1_post    elo2_post  raptor1_pre  raptor2_pre  \
count  1320.000000  1320.000000  1320.000000  1320.000000  1320.000000   
mean      0.372941  1510.664651  1512.514319  1503.864076  1499.075101   
std       0.151149    89.560581    89.306155   116.590601   116.514412   
min       0.066293  1257.300726  1271.086891   955.234235   945.804075   
25%       0.263124  1460.990452  1460.476484  1445.292160  1432.346974   
50%       0.359319  1521.917992  1525.368845  1523.778627  1522.906299   
75%       0.466562  1576.753366  1577.663170  1585.715871  1579.358239   
max       0.818659  1705.343075  1719.448667  1733.775148  1728.915073   

       raptor_prob1  raptor_prob2       score1       score2      quality  \
count   1320.000000   1320.000000  1320.000000  1320.000000  1320.000000   
mean       0.603891      0.396109   115.630303   113.030303    50.511364   
std        0.187301      0.187301    11.991075    12.001920    27.217232   
min        0.027498      0.017256    80.000000    79.000000     0.000000   
25%        0.490977      0.257375   108.000000   105.000000    29.000000   
50%        0.617850      0.382150   116.000000   113.000000    52.000000   
75%        0.742625      0.509023   124.000000   121.000000    73.000000   
max        0.982744      0.972502   175.000000   176.000000    99.000000   

        importance  total_rating  
count  1320.000000   1320.000000  
mean     32.458333     41.744697  
std      29.408726     24.238657  
min       0.000000      0.000000  
25%       9.000000     21.000000  
50%      24.000000     43.500000  
75%      49.000000     57.000000  
max     100.000000    100.000000

# describe() with default arguments, applied to the whole cleaned frame
# (not just the numeric subset).
statistical_summary = data_cleaned.describe()
print(statistical_summary)

# Two separate figures per numeric column: a histogram with a KDE overlay,
# followed by a box plot.
for column in quantitative_columns:
    column_values = data_cleaned[column]

    plt.figure(figsize=(10, 6))
    histogram_axes = sns.histplot(column_values, kde=True)
    histogram_axes.set_title(f'Distribution of {column}')
    plt.show()

    plt.figure(figsize=(10, 6))
    boxplot_axes = sns.boxplot(x=column_values)
    boxplot_axes.set_title(f'Box plot of {column}')
    plt.show()

                                date  season      neutral     elo1_pre  \
count                           1320  1320.0  1320.000000  1320.000000   
mean   2023-01-19 05:33:49.090909184  2023.0     0.001515  1511.867655   
min              2022-10-18 00:00:00  2023.0     0.000000  1264.103229   
25%              2022-12-02 00:00:00  2023.0     0.000000  1461.568345   
50%              2023-01-16 00:00:00  2023.0     0.000000  1524.876508   
75%              2023-03-08 00:00:00  2023.0     0.000000  1576.453207   
max              2023-06-12 00:00:00  2023.0     1.000000  1705.343075   
std                              NaN     0.0     0.038910    88.962661   

          elo2_pre    elo_prob1    elo_prob2    elo1_post    elo2_post  \
count  1320.000000  1320.000000  1320.000000  1320.000000  1320.000000   
mean   1511.311315     0.627059     0.372941  1510.664651  1512.514319   
min    1257.300726     0.181341     0.066293  1257.300726  1271.086891   
25%    1460.866530     0.533438     0.263124  1460.990452  1460.476484   
50%    1523.136739     0.640681     0.359319  1521.917992  1525.368845   
75%    1577.825540     0.736876     0.466562  1576.753366  1577.663170   
max    1719.448667     0.933707     0.818659  1705.343075  1719.448667   
std      89.571409     0.151149     0.151149    89.560581    89.306155   

       raptor1_pre  raptor2_pre  raptor_prob1  raptor_prob2       score1  \
count  1320.000000  1320.000000   1320.000000   1320.000000  1320.000000   
mean   1503.864076  1499.075101      0.603891      0.396109   115.630303   
min     955.234235   945.804075      0.027498      0.017256    80.000000   
25%    1445.292160  1432.346974      0.490977      0.257375   108.000000   
50%    1523.778627  1522.906299      0.617850      0.382150   116.000000   
75%    1585.715871  1579.358239      0.742625      0.509023   124.000000   
max    1733.775148  1728.915073      0.982744      0.972502   175.000000   
std     116.590601   116.514412      0.187301      0.187301    11.991075   

            score2      quality   importance  total_rating  
count  1320.000000  1320.000000  1320.000000   1320.000000  
mean    113.030303    50.511364    32.458333     41.744697  
min      79.000000     0.000000     0.000000      0.000000  
25%     105.000000    29.000000     9.000000     21.000000  
50%     113.000000    52.000000    24.000000     43.500000  
75%     121.000000    73.000000    49.000000     57.000000  
max     176.000000    99.000000   100.000000    100.000000  
std      12.001920    27.217232    29.408726     24.238657

# Pairwise correlation (DataFrame.corr defaults) between the numeric columns.
correlation_matrix = data_cleaned.loc[:, quantitative_columns].corr()

# Annotated heatmap of the full correlation matrix.
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Quantitative Columns')
plt.show()

# Column pairs to inspect individually, one scatter plot per pair.
pairs_to_explore = [
    ('elo1_pre', 'elo2_pre'),
    ('score1', 'score2'),
    ('elo_prob1', 'elo_prob2'),
]

for x, y in pairs_to_explore:
    plt.figure(figsize=(10, 6))
    scatter_axes = sns.scatterplot(data=data_cleaned, x=x, y=y)
    # Title and both axis labels are set together on the returned Axes.
    scatter_axes.set(
        title=f'Correlation between {x} and {y}',
        xlabel=x,
        ylabel=y,
    )
    plt.show()

	date	season	playoff	team1	team2	elo1_pre	elo2_pre	elo_prob1	elo_prob2	...	carm-elo2_post	raptor1_pre	raptor2_pre	raptor_prob1	raptor_prob2	score1	score2	quality	importance	total_rating
0	2022-10-18	2023	NaN	BOS	PHI	1657.639749	1582.247327	0.732950	0.267050	...	NaN	1693.243079	1641.876729	0.670612	0.329388	126	117	96	13	55
1	2022-10-18	2023	NaN	GSW	LAL	1660.620307	1442.352444	0.862011	0.137989	...	NaN	1615.718147	1472.173711	0.776502	0.223498	123	109	67	20	44
2	2022-10-19	2023	NaN	IND	WAS	1399.201934	1440.077372	0.584275	0.415725	...	NaN	1462.352663	1472.018225	0.599510	0.400490	107	114	37	28	33
3	2022-10-19	2023	NaN	DET	ORL	1393.525172	1366.089249	0.675590	0.324410	...	NaN	1308.969909	1349.865183	0.563270	0.436730	113	109	3	1	2
4	2022-10-19	2023	NaN	ATL	HOU	1535.408152	1351.164973	0.837022	0.162978	...	NaN	1618.256817	1283.328356	0.917651	0.082349	117	107	24	1	13