#prepare tools for analysis
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_56975/1477624756.py:2: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd

# Import the data and prepare it for analysis.
# The file is read with header=None and explicit names: with the default
# header handling pandas consumes the first record as a header row, and
# assigning the column names afterwards silently discards that record.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income',
]
adult_raw = pd.read_csv('adult.data', header=None, names=column_names)
num_rows, num_columns = adult_raw.shape
print("Number of rows:", num_rows)
print("Number of columns:", num_columns)
adult_raw.head()

Number of rows: 32560
Number of columns: 15

# Remove rows with missing values.
# A plain dropna() leaves the row count unchanged here, so missing entries are
# evidently not stored as NaN. The UCI Adult data marks them with '?'
# (possibly padded with whitespace), so convert those markers to NaN first.
# NOTE(review): confirm the marker against the raw file.
adult_raw.replace(r'^\s*\?\s*$', np.nan, regex=True, inplace=True)
adult_raw.dropna(inplace=True)
# Remove columns that are not useful or not interesting for analysis.
# .copy() makes `adult` an independent frame, so later column assignments do
# not trigger SettingWithCopyWarning or depend on adult_raw.
adult = adult_raw[['age', 'workclass', 'education', 'education_num', 'occupation', 'capital_gain','capital_loss','hours_per_week', 'native_country', 'income']].copy()
print(adult.shape)
adult.head()

(32560, 10)

# Coerce the quantitative columns to numeric dtypes
quant_columns = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
adult_quant = adult[quant_columns].apply(pd.to_numeric)
# Perform outlier detection
def is_outlier(x, whisker=1.5):
    """Flag the values of *x* that fall outside its interquartile fences.

    Args:
        x: A pandas Series of numeric values.
        whisker: Multiple of the interquartile range added above the 75th
            percentile and subtracted below the 25th percentile to form the
            fences. Defaults to 1.5.

    Returns:
        A boolean Series aligned with *x*; True marks a value strictly below
        the lower fence or strictly above the upper fence.
    """
    q25, q75 = x.quantile([.25, .75])
    iqr = q75 - q25
    return (x < q25 - whisker * iqr) | (x > q75 + whisker * iqr)
# Flag outliers column by column, then count the flagged values per column
outliers = adult_quant.apply(is_outlier)
outliers_sum = outliers.sum()
print(outliers_sum)
# hours_per_week has the most flagged values; a boxplot of its distribution helps show why
hours_ax = sns.boxplot(x=adult_quant['hours_per_week'])
hours_ax.set_xlabel('hours_per_week')
hours_ax.set_title('Distribution of hours_per_week')
plt.show()

age                143
education_num     1198
capital_gain      2711
capital_loss      1519
hours_per_week    9008
dtype: int64

/Users/driscoll/mambaforge/envs/219/lib/python3.11/site-packages/seaborn/categorical.py:640: FutureWarning: SeriesGroupBy.grouper is deprecated and will be removed in a future version of pandas.
  positions = grouped.grouper.result_index.to_numpy(dtype=float)

# Numerical summary: descriptive statistics plus the quartiles of the
# quantitative columns
quartile_levels = [0.25, 0.5, 0.75]
summary = adult.describe()
quantiles = adult_quant.quantile(quartile_levels)
print(summary)
print('\n')
print(quantiles, '\n')

# Graphical summary: histograms of the columns in `adult`
hist_size = (10, 10)
adult.hist(figsize=hist_size)
plt.show()

                age  education_num  capital_gain  capital_loss  hours_per_week
count  32560.000000   32560.000000  32560.000000  32560.000000    32560.000000
mean      38.581634      10.080590   1077.615172     87.306511       40.437469
std       13.640642       2.572709   7385.402999    402.966116       12.347618
min       17.000000       1.000000      0.000000      0.000000        1.000000
25%       28.000000       9.000000      0.000000      0.000000       40.000000
50%       37.000000      10.000000      0.000000      0.000000       40.000000
75%       48.000000      12.000000      0.000000      0.000000       45.000000
max       90.000000      16.000000  99999.000000   4356.000000       99.000000


       age  education_num  capital_gain  capital_loss  hours_per_week
0.25  28.0            9.0           0.0           0.0            40.0
0.50  37.0           10.0           0.0           0.0            40.0
0.75  48.0           12.0           0.0           0.0            45.0

# Explore correlations between quantitative columns, numerically (printed
# matrix) and graphically (annotated heatmap on a fixed -1..1 colour scale)
correlation = adult_quant.corr()
print(correlation)
sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
# Show the figure explicitly, as the other plotting cells do, so the heatmap
# is also displayed when the code runs outside a notebook.
plt.show()

                     age  education_num  capital_gain  capital_loss  \
age             1.000000       0.036527      0.077674      0.057775   
education_num   0.036527       1.000000      0.122627      0.079932   
capital_gain    0.077674       0.122627      1.000000     -0.031614   
capital_loss    0.057775       0.079932     -0.031614      1.000000   
hours_per_week  0.068756       0.148127      0.078409      0.054256   

                hours_per_week  
age                   0.068756  
education_num         0.148127  
capital_gain          0.078409  
capital_loss          0.054256  
hours_per_week        1.000000

	age	workclass	fnlwgt	education	education_num	marital_status	occupation	relationship	race	sex	hours_per_week	native_country	income
0	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	13	United-States	<=50K
1	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	40	United-States	<=50K
2	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	40	United-States	<=50K
3	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	40	Cuba	<=50K
4	37	Private	284582	Masters	14	Married-civ-spouse	Exec-managerial	Wife	White	Female	40	United-States	<=50K

	age	workclass	education	education_num	occupation	hours_per_week	native_country	income
0	50	Self-emp-not-inc	Bachelors	13	Exec-managerial	13	United-States	<=50K
1	38	Private	HS-grad	9	Handlers-cleaners	40	United-States	<=50K
2	53	Private	11th	7	Handlers-cleaners	40	United-States	<=50K
3	28	Private	Bachelors	13	Prof-specialty	40	Cuba	<=50K
4	37	Private	Masters	14	Exec-managerial	40	United-States	<=50K

1.Introduction¶

2.Preprocessing¶

3.Summary Data Analysis¶

4.Discussion¶

1.Prediction of a categorical outcome¶

2.Prediction of a quantitative outcome¶