import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pointbiserialr, spearmanr
from sklearn.preprocessing import LabelEncoder
# Read the raw diabetes dataset into a DataFrame. The path is relative, so it
# is resolved against the current working directory.
# NOTE(review): the file name contains a space before ".csv" — confirm this
# matches the file on disk.
dataset_path = "Dataset of Diabetes .csv"
df = pd.read_csv(dataset_path)

/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_48129/3095353859.py:2: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd

# Remove the columns that are not used in the rest of this analysis.
unused_columns = ['HDL', 'LDL', 'VLDL', 'HbA1c', 'ID', "No_Pation"]
df = df.drop(columns=unused_columns)

# Normalise the categorical labels (surrounding whitespace removed, upper
# case) so that differently formatted spellings collapse into one category.
for label_column in ('CLASS', 'Gender'):
    df[label_column] = df[label_column].str.strip().str.upper()

# Encode CLASS as integers. The fitted encoder is kept in `label_encoder`
# so the integer codes can be mapped back to the original labels later.
label_encoder = LabelEncoder()
df['CLASS_encoded'] = label_encoder.fit_transform(df['CLASS'])

# Count the missing entries in each column (isna is the same check as isnull).
missing_values = df.isna().sum()
print("Missing values:\n", missing_values)

Missing values:
 Gender           0
AGE              0
Urea             0
Cr               0
Chol             0
TG               0
BMI              0
CLASS            0
CLASS_encoded    0
dtype: int64

# There are no missing values, so we can move on to outlier analysis.
# Outlier analysis on Urea using the 1.5 * IQR rule: a value is flagged when
# it lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
column = df['Urea']
Q1 = column.quantile(0.25)
Q3 = column.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
urea_outliers = column[(column < lower_bound) | (column > upper_bound)]
# sep="\n" puts the Series on its own line. With the default " " separator,
# print() inserted a space after the header's newline and indented the first
# row of the listing by one character.
print("The outliers are:\nIndex  Value", urea_outliers, sep="\n")

The outliers are:
Index  Value
 20     13.5
86     10.0
91      9.6
95     22.0
151    17.1
       ... 
977    10.3
983     8.8
985    10.3
994    10.3
995    11.0
Name: Urea, Length: 65, dtype: float64

# Outlier analysis on Cr using the 1.5 * IQR rule: a value is flagged when
# it lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
column = df['Cr']
Q1 = column.quantile(0.25)
Q3 = column.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
cr_outliers = column[(column < lower_bound) | (column > upper_bound)]
# sep="\n" puts the Series on its own line. With the default " " separator,
# print() inserted a space after the header's newline and indented the first
# row of the listing by one character.
print("The outliers are:\nIndex  Value", cr_outliers, sep="\n")

The outliers are:
Index  Value
 20     175
69       6
87     123
91     203
94     132
95     159
151    344
208    370
212    370
273    800
283    800
303    146
309    159
310    146
316    159
323    136
326    315
331    136
336    315
406    111
407    111
502    243
505    179
516    120
521    130
533    145
589    401
590    401
592    401
602    112
648    139
649    139
650    139
682    120
709    230
806    126
807    327
846    800
855    198
860    800
892    228
917    111
944    132
951    168
959    114
961    194
969    150
972    113
974    185
977    185
985    113
994    185
Name: Cr, dtype: int64

# Outlier analysis on Chol using the 1.5 * IQR rule: a value is flagged when
# it lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
column = df['Chol']
Q1 = column.quantile(0.25)
Q3 = column.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
chol_outliers = column[(column < lower_bound) | (column > upper_bound)]
# sep="\n" puts the Series on its own line. With the default " " separator,
# print() inserted a space after the header's newline and indented the first
# row of the listing by one character.
print("The outliers are:\nIndex  Value", chol_outliers, sep="\n")

The outliers are:
Index  Value
 20      0.5
41      9.5
48      9.5
99      0.0
176     9.5
177     9.5
342     0.6
345     0.6
412     8.5
415     8.5
431     8.8
523     9.9
524     9.8
526     9.3
621     8.8
662    10.3
667     9.7
709     9.1
724     1.2
742     8.4
776     9.7
821     9.2
826     8.1
830     8.0
831     8.0
921     9.8
970     8.6
Name: Chol, dtype: float64

# Outlier analysis on TG using the 1.5 * IQR rule: a value is flagged when
# it lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
column = df['TG']
Q1 = column.quantile(0.25)
Q3 = column.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
tg_outliers = column[(column < lower_bound) | (column > upper_bound)]
# sep="\n" puts the Series on its own line. With the default " " separator,
# print() inserted a space after the header's newline and indented the first
# row of the listing by one character.
print("The outliers are:\nIndex  Value", tg_outliers, sep="\n")

The outliers are:
Index  Value
 64      5.9
91      5.9
92      5.3
98      5.3
108     5.3
149     5.3
208     6.0
212     6.0
302     5.1
315     6.8
319     6.8
426     5.1
436     5.3
438     5.1
439     5.1
482     5.1
509     5.1
521     5.4
526     5.1
528     5.3
579     5.8
583     6.7
595     5.1
604     5.1
609     5.1
612     5.1
626     5.1
628     5.1
629     5.1
632     7.0
633     5.1
635     7.0
637     5.9
639     7.0
667    13.8
687     5.1
697     5.1
700     5.1
723     7.2
742     6.3
756     7.7
768     5.5
771     5.5
773     5.5
776    12.7
796     5.8
802     7.7
807     8.7
811     8.5
838     6.8
882     6.7
896    11.6
933     7.2
969     5.3
975     5.7
Name: TG, dtype: float64

# Outlier analysis on BMI using the 1.5 * IQR rule: a value is flagged when
# it lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
column = df['BMI']
Q1 = column.quantile(0.25)
Q3 = column.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
bmi_outliers = column[(column < lower_bound) | (column > upper_bound)]
# sep="\n" puts the Series on its own line. With the default " " separator,
# print() inserted a space after the header's newline and indented the first
# row of the listing by one character.
print("The outliers are:\nIndex  Value", bmi_outliers, sep="\n")

The outliers are:
Index  Value
 183    47.00
188    47.00
698    47.75
Name: BMI, dtype: float64

# Report how many values were flagged as outliers in each analysed column.
print("Below is a summary of the number of outliers in each column\n")
outlier_sets = (
    ("Urea", urea_outliers),
    ("Cr", cr_outliers),
    ("Chol", chol_outliers),
    ("TG", tg_outliers),
    ("BMI", bmi_outliers),
)
for label, flagged in outlier_sets:
    print(f"{label} Outliers:", len(flagged))

Below is a summary of the number of outliers in each column

Urea Outliers: 65
Cr Outliers: 52
Chol Outliers: 27
TG Outliers: 55
BMI Outliers: 3

# Frequency table for every categorical (object-dtype) column. The column
# list is computed once and shared by the text summary and the plots.
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    print("Summary for", column)
    print(df[column].value_counts())
    print()

# Graphical representation
plt.figure(figsize=(12, 8))

# Two plots per row. At least two rows are kept, so the layout is unchanged
# for up to four categorical columns; further rows are added on demand,
# because with a fixed 2x2 grid plt.subplot raises ValueError for a fifth plot.
n_rows = max(2, -(-len(categorical_columns) // 2))  # -(-n // 2) == ceil(n / 2)

# Bar plot for each categorical column
for i, column in enumerate(categorical_columns):
    plt.subplot(n_rows, 2, i+1)
    sns.countplot(data=df, x=column)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

Summary for Gender
Gender
M    565
F    435
Name: count, dtype: int64

Summary for CLASS
CLASS
Y    844
N    103
P     53
Name: count, dtype: int64

# Statistical summary (the encoded class column is left out of the table)
summary = df.drop(columns=['CLASS_encoded']).describe()
print("Statistical Summary:\n", summary)

columns_to_include = ['AGE', "Urea", 'Cr', 'Chol', 'TG', 'BMI']

# Boxplots: one separate figure per selected column
for column in columns_to_include:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.boxplot(df[column])
    ax.set_title(f'Boxplot of {column}')
    ax.set_ylabel('Values')
    plt.show()

# Histograms: one separate figure per selected column
for column in columns_to_include:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(df[column], bins=10)  # Adjust the number of bins as needed
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()

Statistical Summary:
                AGE         Urea           Cr         Chol           TG  \
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000   
mean     53.528000     5.124743    68.943000     4.862820     2.349610   
std       8.799241     2.935165    59.984747     1.301738     1.401176   
min      20.000000     0.500000     6.000000     0.000000     0.300000   
25%      51.000000     3.700000    48.000000     4.000000     1.500000   
50%      55.000000     4.600000    60.000000     4.800000     2.000000   
75%      59.000000     5.700000    73.000000     5.600000     2.900000   
max      79.000000    38.900000   800.000000    10.300000    13.800000   

               BMI  
count  1000.000000  
mean     29.578020  
std       4.962388  
min      19.000000  
25%      26.000000  
50%      30.000000  
75%      33.000000  
max      47.750000

# Correlation between cholesterol and diabetes class.
# CLASS_encoded values: 0 = Non-Diabetic, 1 = Predicted Diabetic, 2 = Diabetic.
# The point-biserial coefficient is only defined when one variable is binary.
# CLASS_encoded has three ordered levels, so a rank-based (Spearman)
# correlation is used instead.
correlation_coefficient, p_value = spearmanr(df['CLASS_encoded'], df['Chol'])
print("Spearman rank correlation coefficient:", correlation_coefficient)
print("p-value:", p_value)

# Correlation between age and cholesterol.
column1 = 'AGE'
column2 = 'Chol'
# np.corrcoef returns the 2x2 correlation matrix; [0, 1] is the
# off-diagonal entry, i.e. the correlation between the two inputs.
correlation_coefficient = np.corrcoef(df[column1], df[column2])[0, 1]
print("Pearson correlation coefficient:", correlation_coefficient)

# Correlation between urea and creatinine.
column1 = 'Urea'
column2 = 'Cr'
correlation_coefficient = np.corrcoef(df[column1], df[column2])[0, 1]
print("Pearson correlation coefficient:", correlation_coefficient)

Point-biserial correlation coefficient: 0.16737461648529126
p-value: 1.0160572504528455e-07
Pearson correlation coefficient: 0.0366491842236855
Pearson correlation coefficient: 0.62413401439751

Math219 Project 1

Introduction

Preprocessing

Summary Data Analysis

Discussion