Project 1¶
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_56070/2660915427.py:2: DeprecationWarning: Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), (to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) but was not found to be installed on your system. If this would cause problems for you, please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 import pandas as pd
Introduction¶
This dataset comes from Delaware Open Data
The Educator Avegare Salary dataset "...contains average total, federal, state, and local salaries for full-time staff. In addition, this file provides the number of full-time equivalent positions in each organization, average years of experience, as well as average years of age. Data is aggregated at the state, district, and school level" (Educator Average Salary).
salaries = pd.read_csv('Educator_Average_Salary_20240314.csv')
salaries.head()
School Year | District Code | District | School Code | Organization | Race | Gender | Grade | SpecialDemo | Geography | ... | Staff Category | Job Classification | Experience | Educators (FTE) | Average Total Salary | Average State Salary | Average Local Salary | Average Federal Salary | Average Years of Experience | Average Years of Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020 | 24 | Smyrna School District | 697 | Smyrna Administrative Office | White | All Educators | All Educators | Regular | All Educators | ... | Official/Administrative | ALL | ALL | 14.0 | 95625.22 | 54023.65 | 41601.56 | NaN | 19 | 53 |
1 | 2020 | 24 | Smyrna School District | 697 | Smyrna Administrative Office | White | All Educators | All Educators | Regular | All Educators | ... | Pupil Support | ALL | ALL | 1.0 | 97917.55 | 60056.88 | 37860.68 | NaN | 1 | 33 |
2 | 2020 | 24 | Smyrna School District | 697 | Smyrna Administrative Office | White | All Educators | All Educators | All Educators | All Educators | ... | ALL | ALL | ALL | 25.0 | 48982.63 | 35892.53 | 13090.10 | NaN | 20 | 46 |
3 | 2020 | 24 | Smyrna School District | 697 | Smyrna Administrative Office | White | All Educators | All Educators | Regular | All Educators | ... | Instructional Support | Supervisor, Instructional | ALL | 3.0 | 115362.61 | 64883.95 | 50478.65 | NaN | 12 | 53 |
4 | 2020 | 24 | Smyrna School District | 697 | Smyrna Administrative Office | White | All Educators | All Educators | All Educators | All Educators | ... | ALL | ALL | ALL | 46.0 | 69886.27 | 44649.62 | 25236.64 | NaN | 19 | 48 |
5 rows × 22 columns
Preprocessing¶
#Drop unimportant columns
salaries = salaries.drop(columns = ['School Year', 'District Code', 'Geography', 'Organization', 'Grade', 'Average Federal Salary', 'Job Classification', 'Experience', 'Educators (FTE)', 'School Code'], axis = 1)
#Drop rows with missing values
salaries = salaries.dropna()
#Drop rows with "All Educators" and "ALL" as a value in all columns
salaries = salaries[salaries['Race'] != 'All Educators']
salaries = salaries[salaries['Gender'] != 'All Educators']
salaries = salaries[salaries['SpecialDemo'] != 'All Educators']
salaries = salaries[salaries['Staff Category'] != 'ALL']
#Remove columns where Average Years of Age is above 90
salaries = salaries[salaries['Average Years of Age'] < 90]
#Show data after preprocessing
salaries
District | Race | Gender | SpecialDemo | SubGroup | Staff Type | Staff Category | Average Total Salary | Average State Salary | Average Local Salary | Average Years of Experience | Average Years of Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
81 | Smyrna School District | White | Female | Regular | White/Female/Regular School | Non-Professional | Skilled & Service Workers | 36488.92 | 36228.66 | 260.26 | 11 | 62 |
82 | Smyrna School District | White | Female | Regular | White/Female/Regular School | Professional | Official/Administrative | 111156.76 | 48856.86 | 62299.90 | 18 | 63 |
87 | Smyrna School District | White | Female | Regular | White/Female/Regular School | Professional | Instructional Support | 118001.52 | 67360.28 | 50641.24 | 10 | 59 |
89 | Smyrna School District | White | Female | Regular | White/Female/Regular School | Non-Professional | Skilled & Service Workers | 18626.92 | 18366.92 | 260.00 | 5 | 43 |
90 | Smyrna School District | White | Female | Regular | White/Female/Regular School | Professional | Official/Administrative | 141327.94 | 80867.80 | 60460.14 | 9 | 41 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2394070 | Sussex Montessori School | African American | Female | Charter | African American/Female/Charter School | Professional | Official/Administrative | 48125.00 | 80000.00 | 16250.00 | 5 | 75 |
2394192 | Sussex Montessori School | White | Female | Charter | White/Female/Charter School | Professional | Classroom Teacher | 62172.58 | 59762.03 | 33747.74 | 14 | 84 |
2394214 | Sussex Montessori School | White | Female | Charter | White/Female/Charter School | Professional | Classroom Teacher | 63456.74 | 60388.76 | 33747.74 | 14 | 81 |
2394235 | Sussex Montessori School | White | Female | Charter | White/Female/Charter School | Professional | Classroom Teacher | 66453.93 | 59704.38 | 33747.74 | 19 | 55 |
2394245 | Sussex Montessori School | White | Female | Charter | White/Female/Charter School | Professional | Classroom Teacher | 67044.94 | 61420.32 | 33747.74 | 22 | 67 |
252162 rows × 12 columns
Outlier Analysis¶
How many outliers are there per quantitative column?
def count_outliers(column):
Q25 = column.quantile(0.25)
Q75 = column.quantile(0.75)
IQR = Q75 - Q25
lower_bound = Q25 - 1.5 * IQR
upper_bound = Q75 + 1.5 * IQR
outliers = column[(column < lower_bound) | (column > upper_bound)]
return len(outliers)
#Outliers of Average Total Salary
print("Average Total Salary: \nTotal number of salaries: ")
print(len(salaries['Average Total Salary']))
av_total = count_outliers(salaries['Average Total Salary'])
print("Number of outliers: ")
print(av_total)
print("\n")
#Outliers of Average State Salary
print("Average State Salary: \nTotal number of salaries: ")
print(len(salaries['Average State Salary']))
av_state = count_outliers(salaries['Average State Salary'])
print("Number of outliers: ")
print(av_state)
print("\n")
#Outliers of Average Local Salary
print("Average Local Salary: \nTotal number of salaries: ")
print(len(salaries['Average Local Salary']))
av_local = count_outliers(salaries['Average Local Salary'])
print("Number of outliers: ")
print(av_local)
print("\n")
#Outliers of Average Years of Experience
print("Average Years of Experience: \nTotal number of employees: ")
print(len(salaries['Average Years of Experience']))
av_exp = count_outliers(salaries['Average Years of Experience'])
print("Number of outliers: ")
print(av_exp)
print("\n")
#Outliers of Average Years of Age
print("Average Years of Age: \nTotal number of employees: ")
print(len(salaries['Average Years of Age']))
av_age = count_outliers(salaries['Average Years of Age'])
print("Number of outliers: ")
print(av_age)
Average Total Salary: Total number of salaries: 252162 Number of outliers: 6734 Average State Salary: Total number of salaries: 252162 Number of outliers: 3582 Average Local Salary: Total number of salaries: 252162 Number of outliers: 12618 Average Years of Experience: Total number of employees: 252162 Number of outliers: 1719 Average Years of Age: Total number of employees: 252162 Number of outliers: 1845
What is the mean of each quantitative column with/without the outliers?
def remove_outliers(df, column):
Q25 = df[column].quantile(0.25)
Q75 = df[column].quantile(0.75)
IQR = Q75 - Q25
lower_bound = Q25 - 1.5 * IQR
upper_bound = Q75 + 1.5 * IQR
df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
return df
#Mean of Average Total Salary
mean_total = salaries['Average Total Salary'].mean()
print("Mean of Average Total Salary: ")
print(mean_total)
print("Mean of Average Total Salary without outliers: ")
no_outliers_total = remove_outliers(salaries, 'Average Total Salary')
print(no_outliers_total['Average Total Salary'].mean())
print("\n")
#Mean of Average State Salary
mean_state = salaries['Average State Salary'].mean()
print("Mean of Average State Salary: ")
print(mean_state)
print("Mean of Average State Salary without outliers: ")
no_outliers_state = remove_outliers(salaries, 'Average State Salary')
print(no_outliers_state['Average State Salary'].mean())
print("\n")
#Mean of Average Local Salary
mean_local = salaries['Average Local Salary'].mean()
print("Mean of Average Local Salary: ")
print(mean_local)
print("Mean of Average Local Salary without outliers: ")
no_outliers_local = remove_outliers(salaries, 'Average Local Salary')
print(no_outliers_local['Average Local Salary'].mean())
print("\n")
#Mean of Average Years of Experience
mean_exp = salaries['Average Years of Experience'].mean()
print("Mean of Average Years of Experience: ")
print(mean_exp)
print("Mean of Average Years of Experience without outliers: ")
no_outliers_exp = remove_outliers(salaries, 'Average Years of Experience')
print(no_outliers_exp['Average Years of Experience'].mean())
print("\n")
#Mean of Average Years of Age
mean_age = salaries['Average Years of Age'].mean()
print("Mean of Average Years of Age: ")
print(mean_age)
print("Mean of Average Years of Age without outliers: ")
no_outliers_age = remove_outliers(salaries, 'Average Years of Age')
print(no_outliers_age['Average Years of Age'].mean())
Mean of Average Total Salary: 61888.90926900167 Mean of Average Total Salary without outliers: 59646.94134719754 Mean of Average State Salary: 39408.56654012103 Mean of Average State Salary without outliers: 38795.52753210234 Mean of Average Local Salary: 23875.800026490906 Mean of Average Local Salary without outliers: 21474.23220435494 Mean of Average Years of Experience: 14.356937207033573 Mean of Average Years of Experience without outliers: 14.16898855228535 Mean of Average Years of Age: 44.64913032098413 Mean of Average Years of Age without outliers: 44.544781217416315
What is the median of each quantitative column with/without outliers?
#Median of Average Total Salary
median_total = salaries['Average Total Salary'].median()
print("Median of Average Total Salary: ")
print(median_total)
print("Median of Average Total Salary without outliers: ")
print(no_outliers_total['Average Total Salary'].median())
print("\n")
#Median of Average State Salary
median_state = salaries['Average State Salary'].median()
print("Median of Average State Salary: ")
print(median_state)
print("Median of Average State Salary without outliers: ")
print(no_outliers_state['Average State Salary'].median())
print("\n")
#Median of Average Local Salary
median_local = salaries['Average Local Salary'].median()
print("Median of Average Local Salary: ")
print(median_local)
print("Median of Average Local Salary without outliers: ")
print(no_outliers_local['Average Local Salary'].median())
print("\n")
#Median of Average Years of Experience
median_exp = salaries['Average Years of Experience'].median()
print("Median of Average Years of Experience: ")
print(median_exp)
print("Median of Average Years of Experience without outliers: ")
print(no_outliers_exp['Average Years of Experience'].median())
print("\n")
#Median of Average Years of Age
median_age = salaries['Average Years of Age'].median()
print("Median of Average Years of Age: ")
print(median_age)
print("Median of Average Years of Age without outliers: ")
print(no_outliers_age['Average Years of Age'].median())
Median of Average Total Salary: 58078.255 Median of Average Total Salary without outliers: 57059.08 Median of Average State Salary: 38071.67 Median of Average State Salary without outliers: 37766.03999999999 Median of Average Local Salary: 19985.254999999997 Median of Average Local Salary without outliers: 19137.82 Median of Average Years of Experience: 14.0 Median of Average Years of Experience without outliers: 14.0 Median of Average Years of Age: 45.0 Median of Average Years of Age without outliers: 45.0
#Number of rows from each district
districts = salaries['District'].value_counts()
print("Number of rows from each district: ")
print(districts)
Number of rows from each district: District Christina School District 32210 Red Clay Consolidated School District 30311 Brandywine School District 20579 Colonial School District 19747 Appoquinimink School District 19426 Indian River School District 16503 Capital School District 15618 Caesar Rodney School District 14592 Cape Henlopen School District 11805 Smyrna School District 10115 State of Delaware 9055 Seaford School District 8654 Lake Forest School District 7768 New Castle County Vocational-Technical School District 7507 Milford School District 7417 Woodbridge School District 5723 Laurel School District 5011 POLYTECH School District 2840 Sussex Technical School District 2744 Delmar School District 2569 Odyssey Charter School 316 Newark Charter School 292 Las Americas Aspira Academy 230 Providence Creek Academy Charter School 204 Academia Antonia Alonso 204 East Side Charter School 150 Consortium Discipline Alternative Programs 56 Academy of Dover Charter School 54 Charter School of New Castle 50 Edison (Thomas A.) Charter School 46 Sussex Academy 44 First State Montessori Academy 40 Kuumba Academy Charter School 32 Delaware Academy of Public Safety and Security 32 Gateway Lab School 28 Campus Community Charter School 28 Early College High School at Del State 24 Sussex Academy of Arts & Sciences 20 Freire Charter School Wilmington 16 Campus Community School 16 MOT Charter School 16 Sussex Montessori School 12 Family Foundations Academy 10 Moyer (Maurice J.) Academy 8 Reach Academy for Girls 8 First State Military Academy 8 Gateway Charter School 8 Great Oaks Charter School 8 Positive Outcomes Charter School 8 Name: count, dtype: int64
#Bar graph of races of educators
sns.countplot(y = 'Race', data = salaries)
plt.title('Race of Educators')
plt.show()
#Bar graph of genders of educators
sns.countplot(x = 'Gender', data = salaries)
plt.title('Gender of Educators')
plt.show()
#Overall type of school
sns.countplot(x = 'SpecialDemo', data = salaries)
plt.title('Overall Type of School')
plt.show()
#Specific type of school
subgroup = salaries['SubGroup'].value_counts()
print("Number of rows from each subgroup: ")
print(subgroup)
Number of rows from each subgroup: SubGroup White/Female/Regular School 86961 White/Male/Regular School 48223 African American/Female/Regular School 38551 African American/Male/Regular School 26715 Hispanic/Latino/Female/Regular School 16581 Hispanic/Latino/Male/Regular School 7790 Multi-Racial/Female/Regular School 6957 Asian American/Female/Regular School 6914 Asian American/Male/Regular School 3306 Multi-Racial/Male/Regular School 3000 Native American/Female/Regular School 2172 Native American/Male/Regular School 1315 White/Female/Charter School 1085 Native Hawaiian/Pacific Islander/Female/Regular School 587 African American/Female/Charter School 492 Hispanic/Latino/Female/Charter School 427 White/Male/Charter School 401 African American/Male/Charter School 191 Native Hawaiian/Pacific Islander/Male/Regular School 181 Hispanic/Latino/Male/Charter School 131 Asian American/Female/Charter School 96 White/Male/Service School 48 White/Female/Service School 36 Multi-Racial/Female/Charter School 2 Name: count, dtype: int64
#Graph of type of staff types
sns.countplot(y = 'Staff Type', data = salaries)
plt.title('Staff Type')
plt.show()
#Types of staff categories
staff_cat = salaries['Staff Category'].value_counts()
print("Number of rows from each staff category: ")
print(staff_cat)
Number of rows from each staff category: Staff Category Skilled & Service Workers 91151 Classroom Teacher 73516 Official/Administrative 31952 Pupil Support 28821 Instructional Support 26722 Name: count, dtype: int64
#Graph of Average Total Salary
sns.histplot(data=salaries, x='Average Total Salary', kde=True)
plt.title('Bell Curve of Average Total Salary')
plt.xlabel('Average Total Salary')
plt.ylabel('Frequency')
plt.xlim(0, 200000) # Set the x-axis limits
plt.show()
#Graph of Average State Salary
sns.histplot(data=salaries, x='Average State Salary', kde=True)
plt.title('Bell Curve of Average State Salary')
plt.xlabel('Average State Salary')
plt.ylabel('Frequency')
plt.xlim(0, 110000) # Set the x-axis limits
plt.show()
#Graph of Average Local Salary
sns.histplot(data=salaries, x='Average Local Salary', kde=True)
plt.title('Bell Curve of Average Local Salary')
plt.xlabel('Average Local Salary')
plt.ylabel('Frequency')
plt.xlim(0, 110000) # Set the x-axis limits
plt.show()
#Bar Graph of Average Years of Experience
sns.histplot(data=salaries, x='Average Years of Experience', bins=50)
plt.title('Bar Graph of Average Years of Experience')
plt.xlabel('Average Years of Experience')
plt.ylabel('Count')
plt.xticks(np.arange(0, 55, 5)) # Increment x-axis by 5
plt.xlim(0, 50) # Set the x-axis limits
plt.show()
#Bar Graph of Average Years of Age
sns.histplot(data=salaries, x='Average Years of Age', bins=50)
plt.title('Bar Graph of Average Years of Age')
plt.xlabel('Average Years of Age')
plt.ylabel('Count')
plt.xticks(np.arange(0, 85, 5)) # Increment x-axis by 5
plt.xlim(15, 85) # Set the x-axis limits
plt.show()
Correlations between quantative columns:¶
The following types of correlation are determined by referencing tastylive.com
Average Years of Age vs Average Years of Experience (without outliers)
As expressed below, there is a strong positive correlation between Average Years of Age and Average Years of Experience.
#What is the correlation between the Average Years of Experience and the Average Years of Age without outliers?
correlation = no_outliers_exp['Average Years of Experience'].corr(no_outliers_age['Average Years of Age'])
correlation
0.6459053973070046
#Graph correlation between Average Years of Experience and Average Years of Age
sns.scatterplot(data=no_outliers_exp, x='Average Years of Experience', y='Average Years of Age')
plt.title('Correlation between Average Years of Experience and Average Years of Age')
plt.xlabel('Average Years of Experience')
plt.ylabel('Average Years of Age')
plt.show()
Average Years of Experience vs Average State Salary (without outliers)
As expressed below, there is a very weak positive correlation between Average Years of Experience and Average State Salary.
#What is the correlation between the Average State Salary and the Average Years of Experience?
correlation = no_outliers_state['Average State Salary'].corr(no_outliers_exp['Average Years of Experience'])
correlation
0.30746354921640684
#Graph correlation between Average State Salary and Average Years of Experience
sns.relplot(data=no_outliers_state, x='Average State Salary', y='Average Years of Experience', kind = 'scatter')
plt.title('Correlation between Average State Salary and Average Years of Experience')
plt.xlabel('Average State Salary')
plt.ylabel('Average Years of Experience')
plt.show()
Average Years of Age vs Average State Salary (without outliers)
As expressed below, there is no correlation between Average Years of Age and Average State Salary.
#What is the correlation between the Average Years of Age and the Average State Salary?
correlation = no_outliers_age['Average Years of Age'].corr(no_outliers_state['Average State Salary'])
correlation
0.08432497989189496
#Graph correlation between Average Years of Age and Average State Salary
sns.relplot(data=no_outliers_age, x='Average Years of Age', y='Average State Salary', kind = 'scatter')
plt.title('Correlation between Average Years of Age and Average State Salary')
plt.xlabel('Average Years of Age')
plt.ylabel('Average State Salary')
plt.ylim(0, 150000)
plt.show()