import pandas as pd
# Download the Excel workbook from data.world and load it into a DataFrame.
df = pd.read_excel('https://query.data.world/s/h5xr3mwavgls7t2g65zgtpgy2yvz4y?dws=00000')

# Echo the loaded table for a first look at its rows and columns.
df

/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_57586/299688920.py:1: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd

# Imports

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Keep only the identifier, the short description and the nutrient columns
# that the rest of the analysis uses.
columns = ["NDB_No", "Shrt_Desc", "Water_(g)", "Energ_Kcal", "Protein_(g)", "Lipid_Tot_(g)", "Carbohydrt_(g)", "Fiber_TD_(g)"]
# .copy() makes df an independent frame instead of a slice of the full table,
# so the in-place dropna and the new-column assignment further down operate
# on df itself and no longer raise SettingWithCopyWarning.
df = df[columns].copy()
# Echo (rows, columns) to confirm the selection.
df.shape

(8790, 8)

# Drop every row that has a missing value in any of the remaining columns,
# modifying df in place.
df.dropna(inplace=True)
# Echo (rows, columns) to see how many rows survived.
df.shape

/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_57586/2053667463.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)

(8195, 8)

# Boolean flag: True where the row's protein amount is strictly positive.
df["has_Protein"] = df["Protein_(g)"] > 0
# Echo the frame to check the new column.
df

/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_57586/2696937032.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["has_Protein"] = df["Protein_(g)"] > 0

def is_outlier(x, k=1.5):
    """Flag values that lie outside the interquartile-range fences.

    Args:
        x: A pandas Series of numeric values.
        k: Multiplier applied to the interquartile range when placing the
            fences. The default of 1.5 keeps the previously hard-coded rule.

    Returns:
        A boolean Series aligned with ``x`` that is True where the value is
        below ``Q1 - k * IQR`` or above ``Q3 + k * IQR``.
    """
    q1, q3 = x.quantile([0.25, 0.75])
    iqr = q3 - q1
    # Strict comparisons: a value sitting exactly on a fence is not flagged.
    return (x < q1 - k * iqr) | (x > q3 + k * iqr)

# Boolean mask of water values flagged by is_outlier.
outliers_water = is_outlier(df["Water_(g)"])
# Summing the mask counts the True entries, i.e. the flagged rows.
outliers_water.sum()

0

# Boolean mask of energy values flagged by is_outlier.
outliers_energy = is_outlier(df["Energ_Kcal"])
# Summing the mask counts the True entries, i.e. the flagged rows.
outliers_energy.sum()

136

# Boolean mask of protein values flagged by is_outlier.
outliers_protein = is_outlier(df["Protein_(g)"])
# Summing the mask counts the True entries, i.e. the flagged rows.
outliers_protein.sum()

41

# Boolean mask of total-lipid values flagged by is_outlier.
outliers_lipid = is_outlier(df["Lipid_Tot_(g)"])
# Summing the mask counts the True entries, i.e. the flagged rows.
outliers_lipid.sum()

454

# Boolean mask of carbohydrate values flagged by is_outlier.
outliers_carb = is_outlier(df["Carbohydrt_(g)"])
# Summing the mask counts the True entries, i.e. the flagged rows.
outliers_carb.sum()

25

# Boolean mask of fiber values flagged by is_outlier.
outliers_fiber = is_outlier(df["Fiber_TD_(g)"])
# Summing the mask counts the True entries, i.e. the flagged rows.
outliers_fiber.sum()

734

# Water: summary statistics (count, mean, std, min, quartiles, max)
df["Water_(g)"].describe()

count    8195.000000
mean       53.626142
std        30.808801
min         0.000000
25%        28.255000
50%        62.670000
75%        77.200000
max       100.000000
Name: Water_(g), dtype: float64

# Empirical CDF of the water column. The trailing semicolon keeps the
# notebook from echoing the object that displot returns.
sns.displot(df["Water_(g)"], kind="ecdf");

# Histogram of the same column.
sns.displot(df["Water_(g)"], kind="hist");

# Energy: summary statistics (count, mean, std, min, quartiles, max)
df["Energ_Kcal"].describe()

count    8195.000000
mean      228.613179
std       169.683531
min         0.000000
25%        95.000000
50%       193.000000
75%       341.000000
max       902.000000
Name: Energ_Kcal, dtype: float64

# Empirical CDF of the energy column.
sns.displot(df["Energ_Kcal"], kind="ecdf");

# Histogram of the same column.
sns.displot(df["Energ_Kcal"], kind="hist");

# Protein: summary statistics (count, mean, std, min, quartiles, max)
df["Protein_(g)"].describe()

count    8195.000000
mean       11.419850
std        10.516845
min         0.000000
25%         2.400000
50%         8.180000
75%        20.050000
max        88.320000
Name: Protein_(g), dtype: float64

# Empirical CDF of the protein column.
sns.displot(df["Protein_(g)"], kind="ecdf");

# Histogram of the same column.
sns.displot(df["Protein_(g)"], kind="hist");

# Total lipid: summary statistics (count, mean, std, min, quartiles, max)
df["Lipid_Tot_(g)"].describe()

count    8195.000000
mean       10.660617
std        15.803175
min         0.000000
25%         1.050000
50%         5.330000
75%        13.875000
max       100.000000
Name: Lipid_Tot_(g), dtype: float64

# Empirical CDF of the total-lipid column.
sns.displot(df["Lipid_Tot_(g)"], kind="ecdf");

# Histogram of the same column.
sns.displot(df["Lipid_Tot_(g)"], kind="hist");

# Carbohydrates: summary statistics (count, mean, std, min, quartiles, max)
df["Carbohydrt_(g)"].describe()

count    8195.000000
mean       22.435128
std        27.494874
min         0.000000
25%         0.030000
50%         9.570000
75%        37.640000
max       100.000000
Name: Carbohydrt_(g), dtype: float64

# Empirical CDF of the carbohydrate column.
sns.displot(df["Carbohydrt_(g)"], kind="ecdf");

# Histogram of the same column.
sns.displot(df["Carbohydrt_(g)"], kind="hist");

# Fiber: summary statistics (count, mean, std, min, quartiles, max)
df["Fiber_TD_(g)"].describe()

count    8195.000000
mean        2.187126
std         4.383311
min         0.000000
25%         0.000000
50%         0.700000
75%         2.600000
max        79.000000
Name: Fiber_TD_(g), dtype: float64

# Empirical CDF of the fiber column.
sns.displot(df["Fiber_TD_(g)"], kind="ecdf");

# Histogram of the same column.
sns.displot(df["Fiber_TD_(g)"], kind="hist");

# First pair of variables to compare: protein and water.
columns1 = ["Protein_(g)", "Water_(g)"]
# Pairwise plot grid restricted to those two columns.
sns.pairplot(data=df[columns1])

<seaborn.axisgrid.PairGrid at 0x127b6e690>

# Correlation matrix for the pair using the pandas default method (Pearson).
df[columns1].corr()

# Spearman rank correlation matrix for the same pair.
df[columns1].corr("spearman")

# Second pair of variables to compare: energy and carbohydrates.
columns2 = ["Energ_Kcal", "Carbohydrt_(g)"]
# Pairwise plot grid restricted to those two columns.
sns.pairplot(data=df[columns2])

<seaborn.axisgrid.PairGrid at 0x127a02a90>

# Correlation matrix for the pair using the pandas default method (Pearson).
df[columns2].corr()

# Spearman rank correlation matrix for the same pair.
df[columns2].corr("spearman")

# Third pair of variables to compare: protein and fiber.
columns3 = ["Protein_(g)", "Fiber_TD_(g)"]
# Pairwise plot grid restricted to those two columns.
sns.pairplot(data=df[columns3])

<seaborn.axisgrid.PairGrid at 0x13018ab90>

# Correlation matrix for the pair using the pandas default method (Pearson).
df[columns3].corr()

# Spearman rank correlation matrix for the same pair.
df[columns3].corr("spearman")

	NDB_No	Shrt_Desc	Water_(g)	Energ_Kcal	Protein_(g)	Lipid_Tot_(g)	Ash_(g)	Carbohydrt_(g)	Fiber_TD_(g)	Sugar_Tot_(g)	...	Vit_K_(µg)	FA_Sat_(g)	FA_Mono_(g)	FA_Poly_(g)	Cholestrl_(mg)	GmWt_1	GmWt_Desc1	GmWt_2	GmWt_Desc2	Refuse_Pct
0	1001	BUTTER,WITH SALT	15.87	717	0.85	81.11	2.11	0.06	0.0	0.06	...	7.0	51.368	21.021	3.043	215.0	5.00	1 pat, (1" sq, 1/3" high)	14.2	1 tbsp	0.0
1	1002	BUTTER,WHIPPED,W/ SALT	16.72	718	0.49	78.30	1.62	2.87	0.0	0.06	...	4.6	45.390	19.874	3.331	225.0	3.80	1 pat, (1" sq, 1/3" high)	9.4	1 tbsp	0.0
2	1003	BUTTER OIL,ANHYDROUS	0.24	876	0.28	99.48	0.00	0.00	0.0	0.00	...	8.6	61.924	28.732	3.694	256.0	12.80	1 tbsp	205.0	1 cup	0.0
3	1004	CHEESE,BLUE	42.41	353	21.40	28.74	5.11	2.34	0.0	0.50	...	2.4	18.669	7.778	0.800	75.0	28.35	1 oz	17.0	1 cubic inch	0.0
4	1005	CHEESE,BRICK	41.11	371	23.24	29.68	3.18	2.79	0.0	0.51	...	2.5	18.764	8.598	0.784	94.0	132.00	1 cup, diced	113.0	1 cup, shredded	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
8785	83110	MACKEREL,SALTED	43.00	305	18.50	25.10	13.40	0.00	0.0	0.00	...	7.8	7.148	8.320	6.210	95.0	80.00	1 piece, (5-1/2" x 1-1/2" x 1/2")	17.0	1 cubic inch, boneless	0.0
8786	90240	SCALLOP,(BAY&SEA),CKD,STMD	70.25	111	20.54	0.84	2.97	5.41	0.0	0.00	...	0.0	0.218	0.082	0.222	41.0	85.00	3 oz	NaN	NaN	0.0
8787	90480	SYRUP,CANE	26.00	269	0.00	0.00	0.86	73.14	0.0	73.20	...	0.0	0.000	0.000	0.000	0.0	21.00	1 serving	NaN	NaN	0.0
8788	90560	SNAIL,RAW	79.20	90	16.10	1.40	1.30	2.00	0.0	0.00	...	0.1	0.361	0.259	0.252	50.0	85.00	3 oz	NaN	NaN	0.0
8789	93600	TURTLE,GREEN,RAW	78.50	89	19.80	0.50	1.20	0.00	0.0	0.00	...	0.1	0.127	0.088	0.170	50.0	85.00	3 oz	NaN	NaN	0.0

	NDB_No	Shrt_Desc	Water_(g)	Energ_Kcal	Protein_(g)	Lipid_Tot_(g)	Carbohydrt_(g)	Fiber_TD_(g)	has_Protein
0	1001	BUTTER,WITH SALT	15.87	717	0.85	81.11	0.06	0.0	True
1	1002	BUTTER,WHIPPED,W/ SALT	16.72	718	0.49	78.30	2.87	0.0	True
2	1003	BUTTER OIL,ANHYDROUS	0.24	876	0.28	99.48	0.00	0.0	True
3	1004	CHEESE,BLUE	42.41	353	21.40	28.74	2.34	0.0	True
4	1005	CHEESE,BRICK	41.11	371	23.24	29.68	2.79	0.0	True
...	...	...	...	...	...	...	...	...	...
8785	83110	MACKEREL,SALTED	43.00	305	18.50	25.10	0.00	0.0	True
8786	90240	SCALLOP,(BAY&SEA),CKD,STMD	70.25	111	20.54	0.84	5.41	0.0	True
8787	90480	SYRUP,CANE	26.00	269	0.00	0.00	73.14	0.0	False
8788	90560	SNAIL,RAW	79.20	90	16.10	1.40	2.00	0.0	True
8789	93600	TURTLE,GREEN,RAW	78.50	89	19.80	0.50	0.00	0.0	True

Introduction

Preprocessing

Summary Data Analysis

Correlations

Discussion

	Protein_(g)	Water_(g)
Protein_(g)	1.000000	-0.089458
Water_(g)	-0.089458	1.000000

	Protein_(g)	Water_(g)
Protein_(g)	1.000000	-0.285706
Water_(g)	-0.285706	1.000000

	Energ_Kcal	Carbohydrt_(g)
Energ_Kcal	1.000000	0.494455
Carbohydrt_(g)	0.494455	1.000000

	Energ_Kcal	Carbohydrt_(g)
Energ_Kcal	1.000000	0.373511
Carbohydrt_(g)	0.373511	1.000000