import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt


## Preprocessing

url = 'https://www.dropbox.com/scl/fi/megbhgphwhpe6ke3b4xy5/Student_Assessment_Performance_20240311.csv?rlkey=n40grd4fw04j9d5585p3x84zu&dl=1'

# Read the CSV file into a pandas DataFrame
raw_data = pd.read_csv(url)

# Clean the data by removing missing values, removing unneeded columns, fixing column types/names as needed
# and making variables with sorted values/data 
df = raw_data.dropna()
columns_to_remove = ["District Code", "District", "School Code", "Organization", "Assessment Name", 
                     "Race", "Gender", "SpecialDemo", "Geography", "SubGroup", "RowStatus"]
df = df.drop(columns = columns_to_remove)
df["School Year"] = pd.to_numeric(df["School Year"], errors = "coerce")
df["ContentArea"] = df["ContentArea"].replace({"^MAT$": "Math","^MATH$": "Math", "ESSAY": "Essay"}, regex = True)
df["Grade"] = df["Grade"].replace("Twelfth", "12th Grade", regex = True)
custom_grade_sort = {"All Students": 0,"Kindergarten": 1, "1st Grade": 2,"2nd Grade": 3, "3rd Grade": 4, "4th Grade": 5, "5th Grade": 6,
            "6th Grade": 7,"7th Grade": 8, "8th Grade": 9, "9th Grade": 10, "10th Grade": 11, "11th Grade": 12, "12th Grade": 13}
df_grade_sorted = df.sort_values(by = "Grade", key = lambda x: x.map(custom_grade_sort))
df_content_sorted = df.sort_values(by = "ContentArea", ascending = True)

# Detect outliers in the Tested, Proficient, PctProficient, and ScaleScoreAvg columns using z-scores and remove them from dataset
columns_to_test = ["Tested", "Proficient", "PctProficient", "ScaleScoreAvg"]
for column in columns_to_test:
    z_scores = stats.zscore(df[column])
    outliers = (z_scores > 2.5) | (z_scores < -2.5)
    df = df[~outliers]

## Summary Data Analysis

# Numerical summary statistics for Tested, Proficient, PctProficient, and ScaleScoreAvg
tested_summary = df["Tested"].describe()
print("Summary statistics for Tested: \n", tested_summary, "\n")
proficient_summary = df["Proficient"].describe()
print("Summary statistics for Proficient: \n", proficient_summary, "\n")
pct_proficient_summary = df["PctProficient"].describe()
print("Summary statistics for Percent Proficient: \n", pct_proficient_summary, "\n")
scale_score_avg_summary = df["ScaleScoreAvg"].describe()
print("Summary statistics for Scale Score Average: \n", scale_score_avg_summary, "\n")

# Count and percentages of unique values for Content Area and Grade
content_area_counts = df_content_sorted["ContentArea"].value_counts()
content_area_counts_df = content_area_counts.reset_index()
content_area_counts_df.columns = ["ContentArea", "Counts"]
content_area_counts_sorted = content_area_counts_df.sort_values(by = "ContentArea", ascending = True)
print("Counts of Content Areas: \n", content_area_counts_sorted, "\n")

content_area_pcts = df_content_sorted["ContentArea"].value_counts(normalize = True)
content_area_pcts_df = content_area_pcts.reset_index()
content_area_pcts_df.columns = ["ContentArea", "Proportions"]
content_area_pcts_sorted = content_area_pcts_df.sort_values(by = "ContentArea", ascending = True)
print("Proportions of Content Areas: \n", content_area_pcts_sorted, "\n")

grade_counts = df_grade_sorted["Grade"].value_counts()
grade_counts_df = grade_counts.reset_index()
grade_counts_df.columns = ["Grade", "Counts"]
grade_counts_sorted = grade_counts_df.sort_values(by = "Grade", key = lambda x: x.map(custom_grade_sort))
print("Counts of Grades: \n", grade_counts_sorted, "\n")

grade_pcts = df_grade_sorted["Grade"].value_counts(normalize = True)
grade_pcts_df = grade_pcts.reset_index()
grade_pcts_df.columns = ["Grade", "Proportions"]
grade_pcts_sorted = grade_pcts_df.sort_values(by = "Grade", key = lambda x: x.map(custom_grade_sort))
print("Proportions of Grades: \n", grade_pcts_sorted, "\n")


## Histograms for the quantitative columns

# Histograms of Tested, Proficient, PctProficient, and ScaleScoreAvg
fig, ax = plt.subplots()
sns.histplot(data = df, x = "Tested", binwidth = 20, kde = False, ax = ax)
ax.set_xlim(0,1000)
plt.title("Distribution of Tested Students")
plt.xlabel("Number of Tested Students per Sample")
plt.ylabel("Count")
plt.xticks(rotation=45) 
plt.grid(True)
plt.show()

fig, ax = plt.subplots()
sns.histplot(data = df, x = "Proficient", binwidth = 20, kde = False, ax = ax)
ax.set_xlim(0,1000)
plt.title("Distribution of Proficient Students")
plt.xlabel("Number of Proficient Students per Sample")
plt.ylabel("Count")
plt.grid(True)
plt.show()
# can see that proficiency histogram is skewed even more to the right than tested

fig, ax = plt.subplots()
sns.histplot(data = df, x = "PctProficient", binwidth = 5, kde = False, ax = ax)
ax.set_xlim(0,100)
plt.title("Distribution of Student Proficiency Rate")
plt.xlabel("Percent of Proficient Students per Sample")
plt.ylabel("Count")
plt.xticks(rotation=45) 
plt.grid(True)
plt.show()
# has small right skew, meaning there are a larger proportion of schools with lower proficiency rates

fig, ax = plt.subplots()
sns.histplot(data = df, x = "ScaleScoreAvg", binwidth = 100, kde = False, ax = ax)
ax.set_xlim(0,2800)
plt.title("Distribution of Student Scale Score Average")
plt.xlabel("Scale Score Average per Sample")
plt.ylabel("Count")
plt.xticks(rotation=45) 
plt.grid(True)
plt.show()
# scale score avg is bimodal, meaning a large proportion of schools have either low (around 500) or (around 2500) high scale scores


## Barplots for School Year against Tested, Proficient, Percent Proficient, and Scale Score Avg

# School Year against Tested (sum)
fig, ax = plt.subplots()
sns.barplot(data = df, x = "School Year", y = "Tested", estimator = "sum")
ax.set_ylim(1000000, 4500000)
plt.yticks([500000, 1000000, 1500000, 2000000, 2500000, 3000000, 3500000, 4000000, 4500000], 
           ["0.5M", "1M", "1.5M", "2M", "2.5M", "3M", "3.5M", "4M", "4.5M"])
plt.title("Total Tested Students by School Year")
plt.xlabel("School Year")
plt.ylabel("Total Tested (in millions)")
plt.xticks(rotation=45)  
plt.grid(True)
plt.show()

# School Year against Proficient (sum and mean)
fig, ax = plt.subplots()
sns.barplot(data = df, x = "School Year", y = "Proficient", estimator = "sum")
ax.set_ylim(0, 1500000)
plt.yticks([0, 250000, 500000, 750000, 1000000, 1250000, 1500000], 
           ["0", "0.25M", ".5M", ".75M", "1M", "1.25M", "1.5M"])
plt.title("Total Proficient Students by School Year")
plt.xlabel("School Year")
plt.ylabel("Total Proficient (in millions)")
plt.xticks(rotation=45) 
plt.grid(True)
plt.show()

fig, ax = plt.subplots()
sns.barplot(data = df, x = "School Year", y = "Proficient", estimator = "mean")
ax.set_ylim(0, 50)
plt.title("Mean of Proficient Students by School Year")
plt.xlabel("School Year")
plt.ylabel("Mean of Proficient per Sample")
plt.xticks(rotation=45) 
plt.grid(True)
plt.show()

# School Year against PctProficient (mean)
fig, ax = plt.subplots()
sns.barplot(data = df, x = "School Year", y = "PctProficient", estimator = "mean")
ax.set_ylim(0, 50)
plt.title("Mean of Student Proficiency Rate by School Year")
plt.xlabel("School Year")
plt.ylabel("Mean of Proficiency Rate per Sample")
plt.xticks(rotation=45) 
plt.grid(True)
plt.show()

# School Year against ScaleScoreAvg (mean) 
sns.barplot(data = df, x = "School Year", y = "ScaleScoreAvg", estimator = "mean")
plt.title("Mean of Student Scale Score Average by School Year")
plt.xlabel("School Year")
plt.ylabel("Mean of Scale Score Average per Sample")
plt.xticks(rotation=45) 
plt.grid(True)
plt.show()


## Boxen plots for ContentArea and Grade against PctProficient and ScaleScoreAvg

# Content Area against Percent Proficient
sns.catplot(data = df_content_sorted, x = "ContentArea", y = "PctProficient", kind = "boxen", height = 5, aspect = 1.5)
plt.title("Boxen Plot of Student Proficiency Rate by Content Area")
plt.xlabel("Content Area")
plt.ylabel("Percent of Proficient Students")
plt.xticks(rotation=45) 
plt.grid(True)
plt.show()

# Content Area against ScaleScoreAverage
sns.catplot(data = df_content_sorted, x = "ContentArea", y = "ScaleScoreAvg", kind = "boxen", height = 5, aspect = 1.5)
plt.title("Boxen Plot of Student Scale Score Average by Content Area")
plt.xlabel("Content Area")
plt.ylabel("Scale Score Average of Students")
plt.xticks(rotation=45)  
plt.grid(True)
plt.show()

# Grade against PctProficient
sns.catplot(data = df_grade_sorted, x = "Grade", y = "PctProficient", kind = "boxen", height = 5, aspect = 1.5)
plt.title("Boxen Plot of Student Proficiency Rate by Grade")
plt.xlabel("Grade")
plt.ylabel("Percent of Proficient Students")
plt.xticks(rotation=45)  
plt.grid(True)
plt.show()


# Grade Area against ScaleScoreAverage
sns.catplot(data = df_grade_sorted, x = "Grade", y = "ScaleScoreAvg", kind = "boxen", height = 5, aspect = 1.5)
plt.title("Boxen Plot of Student Scale Score Average by Grade")
plt.xlabel("Grade")
plt.ylabel("Scale Score Average of Students")
plt.xticks(rotation=45) 
plt.grid(True)
plt.show()


## Correlations

years, categories = pd.factorize(df["School Year"], sort = True)
df["School Year"] = years

# Correlation between School Year and PctProficient Overall
year_prof_corr = df["PctProficient"].corr(df["School Year"]) 
print("Correlation between School Year and Proficiency Rate:", year_prof_corr, "\n")

# Correlation between School Year and ScaleScoreAvg Overall
year_score_corr = df["ScaleScoreAvg"].corr(df["School Year"]) 
print("Correlation between School Year and Scale Score Average:", year_score_corr, "\n")

# Correlation between School Year and PctProficient Grouped by ContentArea
year_prof_corr_content = df.groupby("ContentArea").apply(lambda x: x["PctProficient"].corr(x["School Year"])) 
corr_prof_content_df = year_prof_corr_content.reset_index()
corr_prof_content_df.columns = ["ContentArea", "Correlation"]
corr_prof_content_sorted = corr_prof_content_df.sort_values(by = "ContentArea", ascending = True)
print("Correlation between School Year and Proficiency Rate Grouped by Content Area: \n", corr_prof_content_sorted, "\n")

# Correlation between School Year and ScaleScoreAvg Grouped by ContentArea
year_score_corr_content = df.groupby("ContentArea").apply(lambda x: x["ScaleScoreAvg"].corr(x["School Year"])) 
corr_score_content_df = year_score_corr_content.reset_index()
corr_score_content_df.columns = ["ContentArea", "Correlation"]
corr_score_content_sorted = corr_score_content_df.sort_values(by = "ContentArea", ascending = True)
print("Correlation between School Year and Scale Score Average Grouped by Content Area: \n", corr_score_content_sorted, "\n")

# Correlation between School Year and PctProficient Grouped by Grade
year_prof_corr_grade = df.groupby("Grade").apply(lambda x: x["PctProficient"].corr(x["School Year"])) 
corr_prof_grade_df = year_prof_corr_grade.reset_index()
corr_prof_grade_df.columns = ["Grade", "Correlation"]
corr_prof_grade_sorted = corr_prof_grade_df.sort_values(by = "Grade", key = lambda x: x.map(custom_grade_sort))
print("Correlation between School Year and Proficiency Rate Grouped by Grade: \n", corr_prof_grade_sorted, "\n")

# Correlation between School Year and ScaleScoreAvg Grouped by Grade
year_score_corr_grade = df.groupby("Grade").apply(lambda x: x["ScaleScoreAvg"].corr(x["School Year"])) 
corr_score_grade_df = year_score_corr_grade.reset_index()
corr_score_grade_df.columns = ["Grade", "Correlation"]
corr_score_grade_sorted = corr_score_grade_df.sort_values(by = "Grade", key = lambda x: x.map(custom_grade_sort))
print("Correlation between School Year and Scale Score Average Grouped by Grade: \n", corr_score_grade_sorted)

/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_69223/2901388397.py:1: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd

Summary statistics for Tested: 
 count    189378.000000
mean         90.303573
std         120.890589
min          15.000000
25%          29.000000
50%          51.000000
75%         104.000000
max        1921.000000
Name: Tested, dtype: float64 

Summary statistics for Proficient: 
 count    189378.000000
mean         30.098074
std          34.326381
min           5.000000
25%           9.000000
50%          17.000000
75%          36.000000
max         213.000000
Name: Proficient, dtype: float64 

Summary statistics for Percent Proficient: 
 count    189378.000000
mean         38.107623
std          18.757447
min           0.630000
25%          23.530000
50%          36.000000
75%          51.610000
max          86.670000
Name: PctProficient, dtype: float64 

Summary statistics for Scale Score Average: 
 count    189378.000000
mean       1634.078206
std        1007.053939
min           6.730000
25%         573.540000
50%        2417.130000
75%        2491.590000
max        2732.840000
Name: ScaleScoreAvg, dtype: float64 

Counts of Content Areas: 
   ContentArea  Counts
5         CMP    6716
0         ELA   73445
4       Essay   11124
1        Math   63348
2         SCI   22290
3         SOC   20243 

Proportions of Content Areas: 
   ContentArea  Proportions
5         CMP     0.034063
0         ELA     0.372503
4       Essay     0.056419
1        Math     0.321293
2         SCI     0.113052
3         SOC     0.102670 

Counts of Grades: 
            Grade  Counts
0   All Students   31157
11  Kindergarten     298
13     1st Grade     151
12     2nd Grade     261
4      3rd Grade   20966
2      4th Grade   27166
3      5th Grade   26097
7      6th Grade   14784
5      7th Grade   20155
6      8th Grade   19524
9      9th Grade    2250
8     10th Grade    4896
1     11th Grade   28929
10    12th Grade     532 

Proportions of Grades: 
            Grade  Proportions
0   All Students     0.158024
11  Kindergarten     0.001511
13     1st Grade     0.000766
12     2nd Grade     0.001324
4      3rd Grade     0.106337
2      4th Grade     0.137782
3      5th Grade     0.132361
7      6th Grade     0.074983
5      7th Grade     0.102224
6      8th Grade     0.099023
9      9th Grade     0.011412
8     10th Grade     0.024832
1     11th Grade     0.146724
10    12th Grade     0.002698

/Users/driscoll/mambaforge/envs/219/lib/python3.11/site-packages/seaborn/_base.py:949: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)

/Users/driscoll/mambaforge/envs/219/lib/python3.11/site-packages/seaborn/_base.py:949: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)

/Users/driscoll/mambaforge/envs/219/lib/python3.11/site-packages/seaborn/_base.py:949: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)

/Users/driscoll/mambaforge/envs/219/lib/python3.11/site-packages/seaborn/_base.py:949: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)

Correlation between School Year and Proficiency Rate: -0.14290967053843834 

Correlation between School Year and Scale Score Average: -0.07724222686557838 

Correlation between School Year and Proficiency Rate Grouped by Content Area: 
   ContentArea  Correlation
0         CMP    -0.041759
1         ELA    -0.132643
2       Essay    -0.044930
3        Math    -0.110230
4         SCI    -0.241068
5         SOC    -0.212785 

Correlation between School Year and Scale Score Average Grouped by Content Area: 
   ContentArea  Correlation
0         CMP    -0.050769
1         ELA     0.002079
2       Essay    -0.121308
3        Math    -0.000848
4         SCI    -0.033535
5         SOC    -0.046625 

Correlation between School Year and Proficiency Rate Grouped by Grade: 
            Grade  Correlation
12  All Students    -0.164238
13  Kindergarten     0.114729
3      1st Grade    -0.064849
4      2nd Grade    -0.141227
5      3rd Grade    -0.143128
6      4th Grade    -0.146906
7      5th Grade    -0.147483
8      6th Grade    -0.127541
9      7th Grade    -0.083614
10     8th Grade    -0.138195
11     9th Grade    -0.291444
0     10th Grade    -0.425920
1     11th Grade    -0.028476
2     12th Grade    -0.300186 

Correlation between School Year and Scale Score Average Grouped by Grade: 
            Grade  Correlation
12  All Students     0.218727
13  Kindergarten     0.094683
3      1st Grade    -0.249054
4      2nd Grade    -0.217546
5      3rd Grade    -0.038276
6      4th Grade    -0.090161
7      5th Grade    -0.166222
8      6th Grade     0.038423
9      7th Grade     0.020969
10     8th Grade    -0.075884
11     9th Grade    -0.089543
0     10th Grade    -0.076785
1     11th Grade    -0.362966
2     12th Grade     0.037264

/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_69223/2901388397.py:232: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  year_prof_corr_content = df.groupby("ContentArea").apply(lambda x: x["PctProficient"].corr(x["School Year"]))
/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_69223/2901388397.py:239: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  year_score_corr_content = df.groupby("ContentArea").apply(lambda x: x["ScaleScoreAvg"].corr(x["School Year"]))
/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_69223/2901388397.py:246: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  year_prof_corr_grade = df.groupby("Grade").apply(lambda x: x["PctProficient"].corr(x["School Year"]))
/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_69223/2901388397.py:253: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  year_score_corr_grade = df.groupby("Grade").apply(lambda x: x["ScaleScoreAvg"].corr(x["School Year"]))