In [1]:
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
## Preprocessing
url = 'https://www.dropbox.com/scl/fi/megbhgphwhpe6ke3b4xy5/Student_Assessment_Performance_20240311.csv?rlkey=n40grd4fw04j9d5585p3x84zu&dl=1'
# Read the CSV file into a pandas DataFrame
raw_data = pd.read_csv(url)
# Clean the data by removing missing values, removing unneeded columns, fixing column types/names as needed
# and making variables with sorted values/data
df = raw_data.dropna()
columns_to_remove = ["District Code", "District", "School Code", "Organization", "Assessment Name",
"Race", "Gender", "SpecialDemo", "Geography", "SubGroup", "RowStatus"]
df = df.drop(columns = columns_to_remove)
df["School Year"] = pd.to_numeric(df["School Year"], errors = "coerce")
df["ContentArea"] = df["ContentArea"].replace({"^MAT$": "Math","^MATH$": "Math", "ESSAY": "Essay"}, regex = True)
df["Grade"] = df["Grade"].replace("Twelfth", "12th Grade", regex = True)
custom_grade_sort = {"All Students": 0,"Kindergarten": 1, "1st Grade": 2,"2nd Grade": 3, "3rd Grade": 4, "4th Grade": 5, "5th Grade": 6,
"6th Grade": 7,"7th Grade": 8, "8th Grade": 9, "9th Grade": 10, "10th Grade": 11, "11th Grade": 12, "12th Grade": 13}
df_grade_sorted = df.sort_values(by = "Grade", key = lambda x: x.map(custom_grade_sort))
df_content_sorted = df.sort_values(by = "ContentArea", ascending = True)
# Detect outliers in the Tested, Proficient, PctProficient, and ScaleScoreAvg columns using z-scores and remove them from dataset
columns_to_test = ["Tested", "Proficient", "PctProficient", "ScaleScoreAvg"]
for column in columns_to_test:
z_scores = stats.zscore(df[column])
outliers = (z_scores > 2.5) | (z_scores < -2.5)
df = df[~outliers]
## Summary Data Analysis
# Numerical summary statistics for Tested, Proficient, PctProficient, and ScaleScoreAvg
tested_summary = df["Tested"].describe()
print("Summary statistics for Tested: \n", tested_summary, "\n")
proficient_summary = df["Proficient"].describe()
print("Summary statistics for Proficient: \n", proficient_summary, "\n")
pct_proficient_summary = df["PctProficient"].describe()
print("Summary statistics for Percent Proficient: \n", pct_proficient_summary, "\n")
scale_score_avg_summary = df["ScaleScoreAvg"].describe()
print("Summary statistics for Scale Score Average: \n", scale_score_avg_summary, "\n")
# Count and percentages of unique values for Content Area and Grade
content_area_counts = df_content_sorted["ContentArea"].value_counts()
content_area_counts_df = content_area_counts.reset_index()
content_area_counts_df.columns = ["ContentArea", "Counts"]
content_area_counts_sorted = content_area_counts_df.sort_values(by = "ContentArea", ascending = True)
print("Counts of Content Areas: \n", content_area_counts_sorted, "\n")
content_area_pcts = df_content_sorted["ContentArea"].value_counts(normalize = True)
content_area_pcts_df = content_area_pcts.reset_index()
content_area_pcts_df.columns = ["ContentArea", "Proportions"]
content_area_pcts_sorted = content_area_pcts_df.sort_values(by = "ContentArea", ascending = True)
print("Proportions of Content Areas: \n", content_area_pcts_sorted, "\n")
grade_counts = df_grade_sorted["Grade"].value_counts()
grade_counts_df = grade_counts.reset_index()
grade_counts_df.columns = ["Grade", "Counts"]
grade_counts_sorted = grade_counts_df.sort_values(by = "Grade", key = lambda x: x.map(custom_grade_sort))
print("Counts of Grades: \n", grade_counts_sorted, "\n")
grade_pcts = df_grade_sorted["Grade"].value_counts(normalize = True)
grade_pcts_df = grade_pcts.reset_index()
grade_pcts_df.columns = ["Grade", "Proportions"]
grade_pcts_sorted = grade_pcts_df.sort_values(by = "Grade", key = lambda x: x.map(custom_grade_sort))
print("Proportions of Grades: \n", grade_pcts_sorted, "\n")
## Histograms for the quantitative columns
# Histograms of Tested, Proficient, PctProficient, and ScaleScoreAvg
fig, ax = plt.subplots()
sns.histplot(data = df, x = "Tested", binwidth = 20, kde = False, ax = ax)
ax.set_xlim(0,1000)
plt.title("Distribution of Tested Students")
plt.xlabel("Number of Tested Students per Sample")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
fig, ax = plt.subplots()
sns.histplot(data = df, x = "Proficient", binwidth = 20, kde = False, ax = ax)
ax.set_xlim(0,1000)
plt.title("Distribution of Proficient Students")
plt.xlabel("Number of Proficient Students per Sample")
plt.ylabel("Count")
plt.grid(True)
plt.show()
# can see that proficiency histogram is skewed even more to the right than tested
fig, ax = plt.subplots()
sns.histplot(data = df, x = "PctProficient", binwidth = 5, kde = False, ax = ax)
ax.set_xlim(0,100)
plt.title("Distribution of Student Proficiency Rate")
plt.xlabel("Percent of Proficient Students per Sample")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
# has small right skew, meaning there are a larger proportion of schools with lower proficiency rates
fig, ax = plt.subplots()
sns.histplot(data = df, x = "ScaleScoreAvg", binwidth = 100, kde = False, ax = ax)
ax.set_xlim(0,2800)
plt.title("Distribution of Student Scale Score Average")
plt.xlabel("Scale Score Average per Sample")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
# scale score avg is bimodal, meaning a large proportion of schools have either low (around 500) or (around 2500) high scale scores
## Barplots for School Year against Tested, Proficient, Percent Proficient, and Scale Score Avg
# School Year against Tested (sum)
fig, ax = plt.subplots()
sns.barplot(data = df, x = "School Year", y = "Tested", estimator = "sum")
ax.set_ylim(1000000, 4500000)
plt.yticks([500000, 1000000, 1500000, 2000000, 2500000, 3000000, 3500000, 4000000, 4500000],
["0.5M", "1M", "1.5M", "2M", "2.5M", "3M", "3.5M", "4M", "4.5M"])
plt.title("Total Tested Students by School Year")
plt.xlabel("School Year")
plt.ylabel("Total Tested (in millions)")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
# School Year against Proficient (sum and mean)
fig, ax = plt.subplots()
sns.barplot(data = df, x = "School Year", y = "Proficient", estimator = "sum")
ax.set_ylim(0, 1500000)
plt.yticks([0, 250000, 500000, 750000, 1000000, 1250000, 1500000],
["0", "0.25M", ".5M", ".75M", "1M", "1.25M", "1.5M"])
plt.title("Total Proficient Students by School Year")
plt.xlabel("School Year")
plt.ylabel("Total Proficient (in millions)")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
fig, ax = plt.subplots()
sns.barplot(data = df, x = "School Year", y = "Proficient", estimator = "mean")
ax.set_ylim(0, 50)
plt.title("Mean of Proficient Students by School Year")
plt.xlabel("School Year")
plt.ylabel("Mean of Proficient per Sample")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
# School Year against PctProficient (mean)
fig, ax = plt.subplots()
sns.barplot(data = df, x = "School Year", y = "PctProficient", estimator = "mean")
ax.set_ylim(0, 50)
plt.title("Mean of Student Proficiency Rate by School Year")
plt.xlabel("School Year")
plt.ylabel("Mean of Proficiency Rate per Sample")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
# School Year against ScaleScoreAvg (mean)
sns.barplot(data = df, x = "School Year", y = "ScaleScoreAvg", estimator = "mean")
plt.title("Mean of Student Scale Score Average by School Year")
plt.xlabel("School Year")
plt.ylabel("Mean of Scale Score Average per Sample")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
## Boxen plots for ContentArea and Grade against PctProficient and ScaleScoreAvg
# Content Area against Percent Proficient
sns.catplot(data = df_content_sorted, x = "ContentArea", y = "PctProficient", kind = "boxen", height = 5, aspect = 1.5)
plt.title("Boxen Plot of Student Proficiency Rate by Content Area")
plt.xlabel("Content Area")
plt.ylabel("Percent of Proficient Students")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
# Content Area against ScaleScoreAverage
sns.catplot(data = df_content_sorted, x = "ContentArea", y = "ScaleScoreAvg", kind = "boxen", height = 5, aspect = 1.5)
plt.title("Boxen Plot of Student Scale Score Average by Content Area")
plt.xlabel("Content Area")
plt.ylabel("Scale Score Average of Students")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
# Grade against PctProficient
sns.catplot(data = df_grade_sorted, x = "Grade", y = "PctProficient", kind = "boxen", height = 5, aspect = 1.5)
plt.title("Boxen Plot of Student Proficiency Rate by Grade")
plt.xlabel("Grade")
plt.ylabel("Percent of Proficient Students")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
# Grade Area against ScaleScoreAverage
sns.catplot(data = df_grade_sorted, x = "Grade", y = "ScaleScoreAvg", kind = "boxen", height = 5, aspect = 1.5)
plt.title("Boxen Plot of Student Scale Score Average by Grade")
plt.xlabel("Grade")
plt.ylabel("Scale Score Average of Students")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
## Correlations
years, categories = pd.factorize(df["School Year"], sort = True)
df["School Year"] = years
# Correlation between School Year and PctProficient Overall
year_prof_corr = df["PctProficient"].corr(df["School Year"])
print("Correlation between School Year and Proficiency Rate:", year_prof_corr, "\n")
# Correlation between School Year and ScaleScoreAvg Overall
year_score_corr = df["ScaleScoreAvg"].corr(df["School Year"])
print("Correlation between School Year and Scale Score Average:", year_score_corr, "\n")
# Correlation between School Year and PctProficient Grouped by ContentArea
year_prof_corr_content = df.groupby("ContentArea").apply(lambda x: x["PctProficient"].corr(x["School Year"]))
corr_prof_content_df = year_prof_corr_content.reset_index()
corr_prof_content_df.columns = ["ContentArea", "Correlation"]
corr_prof_content_sorted = corr_prof_content_df.sort_values(by = "ContentArea", ascending = True)
print("Correlation between School Year and Proficiency Rate Grouped by Content Area: \n", corr_prof_content_sorted, "\n")
# Correlation between School Year and ScaleScoreAvg Grouped by ContentArea
year_score_corr_content = df.groupby("ContentArea").apply(lambda x: x["ScaleScoreAvg"].corr(x["School Year"]))
corr_score_content_df = year_score_corr_content.reset_index()
corr_score_content_df.columns = ["ContentArea", "Correlation"]
corr_score_content_sorted = corr_score_content_df.sort_values(by = "ContentArea", ascending = True)
print("Correlation between School Year and Scale Score Average Grouped by Content Area: \n", corr_score_content_sorted, "\n")
# Correlation between School Year and PctProficient Grouped by Grade
year_prof_corr_grade = df.groupby("Grade").apply(lambda x: x["PctProficient"].corr(x["School Year"]))
corr_prof_grade_df = year_prof_corr_grade.reset_index()
corr_prof_grade_df.columns = ["Grade", "Correlation"]
corr_prof_grade_sorted = corr_prof_grade_df.sort_values(by = "Grade", key = lambda x: x.map(custom_grade_sort))
print("Correlation between School Year and Proficiency Rate Grouped by Grade: \n", corr_prof_grade_sorted, "\n")
# Correlation between School Year and ScaleScoreAvg Grouped by Grade
year_score_corr_grade = df.groupby("Grade").apply(lambda x: x["ScaleScoreAvg"].corr(x["School Year"]))
corr_score_grade_df = year_score_corr_grade.reset_index()
corr_score_grade_df.columns = ["Grade", "Correlation"]
corr_score_grade_sorted = corr_score_grade_df.sort_values(by = "Grade", key = lambda x: x.map(custom_grade_sort))
print("Correlation between School Year and Scale Score Average Grouped by Grade: \n", corr_score_grade_sorted)
/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_69223/2901388397.py:1: DeprecationWarning: Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), (to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) but was not found to be installed on your system. If this would cause problems for you, please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 import pandas as pd
Summary statistics for Tested: count 189378.000000 mean 90.303573 std 120.890589 min 15.000000 25% 29.000000 50% 51.000000 75% 104.000000 max 1921.000000 Name: Tested, dtype: float64 Summary statistics for Proficient: count 189378.000000 mean 30.098074 std 34.326381 min 5.000000 25% 9.000000 50% 17.000000 75% 36.000000 max 213.000000 Name: Proficient, dtype: float64 Summary statistics for Percent Proficient: count 189378.000000 mean 38.107623 std 18.757447 min 0.630000 25% 23.530000 50% 36.000000 75% 51.610000 max 86.670000 Name: PctProficient, dtype: float64 Summary statistics for Scale Score Average: count 189378.000000 mean 1634.078206 std 1007.053939 min 6.730000 25% 573.540000 50% 2417.130000 75% 2491.590000 max 2732.840000 Name: ScaleScoreAvg, dtype: float64 Counts of Content Areas: ContentArea Counts 5 CMP 6716 0 ELA 73445 4 Essay 11124 1 Math 63348 2 SCI 22290 3 SOC 20243 Proportions of Content Areas: ContentArea Proportions 5 CMP 0.034063 0 ELA 0.372503 4 Essay 0.056419 1 Math 0.321293 2 SCI 0.113052 3 SOC 0.102670 Counts of Grades: Grade Counts 0 All Students 31157 11 Kindergarten 298 13 1st Grade 151 12 2nd Grade 261 4 3rd Grade 20966 2 4th Grade 27166 3 5th Grade 26097 7 6th Grade 14784 5 7th Grade 20155 6 8th Grade 19524 9 9th Grade 2250 8 10th Grade 4896 1 11th Grade 28929 10 12th Grade 532 Proportions of Grades: Grade Proportions 0 All Students 0.158024 11 Kindergarten 0.001511 13 1st Grade 0.000766 12 2nd Grade 0.001324 4 3rd Grade 0.106337 2 4th Grade 0.137782 3 5th Grade 0.132361 7 6th Grade 0.074983 5 7th Grade 0.102224 6 8th Grade 0.099023 9 9th Grade 0.011412 8 10th Grade 0.024832 1 11th Grade 0.146724 10 12th Grade 0.002698
/Users/driscoll/mambaforge/envs/219/lib/python3.11/site-packages/seaborn/_base.py:949: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key)
/Users/driscoll/mambaforge/envs/219/lib/python3.11/site-packages/seaborn/_base.py:949: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key)
/Users/driscoll/mambaforge/envs/219/lib/python3.11/site-packages/seaborn/_base.py:949: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key)
/Users/driscoll/mambaforge/envs/219/lib/python3.11/site-packages/seaborn/_base.py:949: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. data_subset = grouped_data.get_group(pd_key)
Correlation between School Year and Proficiency Rate: -0.14290967053843834 Correlation between School Year and Scale Score Average: -0.07724222686557838 Correlation between School Year and Proficiency Rate Grouped by Content Area: ContentArea Correlation 0 CMP -0.041759 1 ELA -0.132643 2 Essay -0.044930 3 Math -0.110230 4 SCI -0.241068 5 SOC -0.212785 Correlation between School Year and Scale Score Average Grouped by Content Area: ContentArea Correlation 0 CMP -0.050769 1 ELA 0.002079 2 Essay -0.121308 3 Math -0.000848 4 SCI -0.033535 5 SOC -0.046625 Correlation between School Year and Proficiency Rate Grouped by Grade: Grade Correlation 12 All Students -0.164238 13 Kindergarten 0.114729 3 1st Grade -0.064849 4 2nd Grade -0.141227 5 3rd Grade -0.143128 6 4th Grade -0.146906 7 5th Grade -0.147483 8 6th Grade -0.127541 9 7th Grade -0.083614 10 8th Grade -0.138195 11 9th Grade -0.291444 0 10th Grade -0.425920 1 11th Grade -0.028476 2 12th Grade -0.300186 Correlation between School Year and Scale Score Average Grouped by Grade: Grade Correlation 12 All Students 0.218727 13 Kindergarten 0.094683 3 1st Grade -0.249054 4 2nd Grade -0.217546 5 3rd Grade -0.038276 6 4th Grade -0.090161 7 5th Grade -0.166222 8 6th Grade 0.038423 9 7th Grade 0.020969 10 8th Grade -0.075884 11 9th Grade -0.089543 0 10th Grade -0.076785 1 11th Grade -0.362966 2 12th Grade 0.037264
/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_69223/2901388397.py:232: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. year_prof_corr_content = df.groupby("ContentArea").apply(lambda x: x["PctProficient"].corr(x["School Year"])) /var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_69223/2901388397.py:239: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. year_score_corr_content = df.groupby("ContentArea").apply(lambda x: x["ScaleScoreAvg"].corr(x["School Year"])) /var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_69223/2901388397.py:246: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. year_prof_corr_grade = df.groupby("Grade").apply(lambda x: x["PctProficient"].corr(x["School Year"])) /var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_69223/2901388397.py:253: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. year_score_corr_grade = df.groupby("Grade").apply(lambda x: x["ScaleScoreAvg"].corr(x["School Year"]))