import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

##Importing file from local directory
file_path = 'projectfile.csv'
data = pd.read_csv(file_path)
data.head()

# Convert price_date to datetime
data['price_date'] = pd.to_datetime(data['price_date'])

# Dropping columns not meaningful for analysis
columns_to_drop = ['ISO3', 'country', 'geo_id','lat', 'lon', 'year', 'month', 'data_coverage', 'data_coverage_recent', 'index_confidence_score','spatially_interpolated',]
data_cleaned = data.drop(columns=columns_to_drop)

# Detect missing values
missing_values = data_cleaned.isnull().sum()

# Simple strategy: drop rows with any missing values
data_no_missing = data_cleaned.dropna()

# Outlier detection is descriptive here, using .describe()
quantitative_columns = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
outlier_analysis = data_cleaned[quantitative_columns].describe()

outlier_analysis

# Counting outliers in each quantitative column
outlier_count = {}

for column in quantitative_columns:
    # Calculating the interquartile range (IQR)
    Q1 = outlier_analysis[column]['25%']
    Q3 = outlier_analysis[column]['75%']
    IQR = Q3 - Q1
    
    # Determining the upper and lower bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Counting the number of outliers
    num_outliers = len(data_cleaned[(data_cleaned[column] < lower_bound) | (data_cleaned[column] > upper_bound)])
    
    # Storing the count in the dictionary
    outlier_count[column] = num_outliers

outlier_count

{'bread': 53,
 'cassava_meal': 94,
 'cowpeas': 34,
 'gari': 56,
 'groundnuts': 40,
 'maize': 0,
 'millet': 44,
 'rice': 3,
 'sorghum': 46,
 'o_bread': 1008,
 'h_bread': 1025,
 'l_bread': 1012,
 'c_bread': 1012,
 'inflation_bread': 1971,
 'trust_bread': 2168,
 'o_cassava_meal': 688,
 'h_cassava_meal': 634,
 'l_cassava_meal': 730,
 'c_cassava_meal': 698,
 'inflation_cassava_meal': 900,
 'trust_cassava_meal': 2099,
 'o_cowpeas': 333,
 'h_cowpeas': 365,
 'l_cowpeas': 300,
 'c_cowpeas': 346,
 'inflation_cowpeas': 345,
 'trust_cowpeas': 2081,
 'o_gari': 727,
 'h_gari': 666,
 'l_gari': 763,
 'c_gari': 739,
 'inflation_gari': 878,
 'trust_gari': 2171,
 'o_groundnuts': 306,
 'h_groundnuts': 315,
 'l_groundnuts': 302,
 'c_groundnuts': 306,
 'inflation_groundnuts': 796,
 'trust_groundnuts': 2177,
 'o_maize': 40,
 'h_maize': 30,
 'l_maize': 58,
 'c_maize': 27,
 'inflation_maize': 1158,
 'trust_maize': 2177,
 'o_millet': 120,
 'h_millet': 120,
 'l_millet': 117,
 'c_millet': 116,
 'inflation_millet': 459,
 'trust_millet': 2108,
 'o_rice': 1033,
 'h_rice': 991,
 'l_rice': 1064,
 'c_rice': 1034,
 'inflation_rice': 823,
 'trust_rice': 1991,
 'o_sorghum': 77,
 'h_sorghum': 64,
 'l_sorghum': 102,
 'c_sorghum': 69,
 'inflation_sorghum': 1392,
 'trust_sorghum': 2177,
 'o_food_price_index': 720,
 'h_food_price_index': 686,
 'l_food_price_index': 714,
 'c_food_price_index': 709,
 'inflation_food_price_index': 641,
 'trust_food_price_index': 2152}

# For non-quantitative columns
non_quantitative_summary = data_cleaned.describe(include=['object', 'datetime64'])

non_quantitative_summary

# For quantitative columns
quantitative_summary = data_cleaned.describe()

quantitative_summary

# Selecting only the quantitative variables
quantitative_data = data_cleaned.select_dtypes(include=['float64', 'int64'])
##relplot of the first 15 quantitative columns
sns.relplot(data=data_cleaned.iloc[:, :15])
# Creating scatter plots for each quantitative variable
for column in quantitative_data.columns:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=column, y='o_food_price_index', data=data_cleaned)
    plt.title(f'Scatter Plot of {column} vs Overall Food price Index')
    plt.show()

# Creating boxplots for each quantitative variable
for column in quantitative_data.columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=quantitative_data[column])
    plt.title(f'Boxplot of {column}')
    plt.show()

# Select columns to analyze
selected_columns = ['cassava_meal', 'bread', 'l_sorghum', 'c_sorghum', 'o_food_price_index', 'h_food_price_index']
correlation_matrix = data_cleaned[selected_columns].corr(method='spearman')


# Plotting
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Analysis of Selected Quantitative Columns')
plt.show()

	ISO3	country	adm1_name	adm2_name	mkt_name	lat	lon	geo_id	price_date	year	...	l_sorghum	c_sorghum	inflation_sorghum	trust_sorghum	o_food_price_index	h_food_price_index	l_food_price_index	c_food_price_index	inflation_food_price_index	trust_food_price_index
0	NGA	Nigeria	Abia	Oboma Ngwa	Aba	5.14764	7.35672	gid_051476073567	2007-01-01	2007	...	3234.85	3473.81	NaN	7.6	0.59	0.59	0.57	0.59	NaN	9.2
1	NGA	Nigeria	Abia	Oboma Ngwa	Aba	5.14764	7.35672	gid_051476073567	2007-02-01	2007	...	2690.75	2690.75	NaN	7.6	0.58	0.60	0.57	0.57	NaN	9.2
2	NGA	Nigeria	Abia	Oboma Ngwa	Aba	5.14764	7.35672	gid_051476073567	2007-03-01	2007	...	2250.03	2656.49	NaN	7.6	0.56	0.57	0.55	0.56	NaN	9.2
3	NGA	Nigeria	Abia	Oboma Ngwa	Aba	5.14764	7.35672	gid_051476073567	2007-04-01	2007	...	2380.63	2501.11	NaN	7.6	0.56	0.57	0.55	0.56	NaN	9.2
4	NGA	Nigeria	Abia	Oboma Ngwa	Aba	5.14764	7.35672	gid_051476073567	2007-05-01	2007	...	2327.24	2498.09	NaN	7.6	0.56	0.57	0.55	0.56	NaN	9.2

	bread	cassava_meal	cowpeas	gari	groundnuts	maize	millet	rice	sorghum	o_bread	...	l_sorghum	c_sorghum	inflation_sorghum	trust_sorghum	o_food_price_index	h_food_price_index	l_food_price_index	c_food_price_index	inflation_food_price_index	trust_food_price_index
count	1332.000000	1231.000000	1487.000000	1428.000000	1446.000000	1692.000000	1591.000000	1427.000000	1669.000000	10971.000000	...	10971.000000	10971.000000	10335.000000	10971.000000	10971.000000	10971.000000	10971.000000	10971.000000	10335.000000	10971.000000
mean	257.409827	18746.527774	26093.394210	15887.914671	32816.223672	12214.501531	14456.949485	18712.062957	13537.006477	249.271163	...	9544.469711	10108.030084	22.041332	8.000055	0.982352	1.002703	0.962405	0.982617	8.668432	9.338684
std	113.073320	10216.739859	14093.784599	8274.731001	15937.954470	6189.131596	8260.859959	8122.121105	8660.917417	50.637673	...	6285.769032	6571.308763	54.477319	0.846585	0.370157	0.378391	0.362189	0.371007	19.617331	0.291878
min	96.530000	4400.000000	7472.000000	4000.000000	7646.000000	2350.000000	2235.000000	6600.000000	2295.000000	95.920000	...	1879.040000	2033.390000	-56.390000	7.600000	0.490000	0.500000	0.480000	0.500000	-34.200000	9.200000
25%	180.000000	11400.000000	14612.500000	9274.250000	21600.000000	7000.000000	8150.000000	13131.000000	7000.000000	228.425000	...	5172.030000	5520.775000	-10.315000	7.600000	0.730000	0.740000	0.710000	0.730000	-4.255000	9.200000
50%	247.520000	16240.000000	22483.870000	14577.420000	26612.500000	10425.000000	12777.500000	16500.000000	11550.000000	244.380000	...	6530.150000	6835.050000	3.680000	7.600000	0.840000	0.850000	0.820000	0.840000	4.380000	9.200000
75%	307.690000	22266.000000	34729.305000	20000.000000	40900.000000	17245.890000	19280.335000	25208.335000	18000.000000	263.895000	...	13788.490000	14868.200000	32.205000	7.600000	1.120000	1.150000	1.090000	1.125000	16.750000	9.200000
max	711.390000	58000.000000	74150.000000	44916.130000	96774.190000	27641.940000	48130.670000	44800.000000	51482.670000	703.000000	...	60108.680000	63171.690000	376.000000	10.000000	3.000000	3.050000	2.820000	2.920000	111.010000	10.000000

	adm1_name	adm2_name	mkt_name	price_date	currency	components	start_dense_data	last_survey_point
count	10971	10971	10971	10971	10971	10971	10971	10971
unique	14	35	53	NaN	1	1	1	1
top	Borno	Konduga	Aba	NaN	NGN	bread (1 Unit, Index Weight = 1), cassava_meal...	Jan 2007	Jan 2023
freq	4968	1656	207	NaN	10971	10971	10971	10971
mean	NaN	NaN	NaN	2015-08-01 11:21:44.347826176	NaN	NaN	NaN	NaN
min	NaN	NaN	NaN	2007-01-01 00:00:00	NaN	NaN	NaN	NaN
25%	NaN	NaN	NaN	2011-04-01 00:00:00	NaN	NaN	NaN	NaN
50%	NaN	NaN	NaN	2015-08-01 00:00:00	NaN	NaN	NaN	NaN
75%	NaN	NaN	NaN	2019-12-01 00:00:00	NaN	NaN	NaN	NaN
max	NaN	NaN	NaN	2024-03-01 00:00:00	NaN	NaN	NaN	NaN

	price_date	bread	cassava_meal	cowpeas	gari	groundnuts	maize	millet	rice	sorghum	...	l_sorghum	c_sorghum	inflation_sorghum	trust_sorghum	o_food_price_index	h_food_price_index	l_food_price_index	c_food_price_index	inflation_food_price_index	trust_food_price_index
count	10971	1332.000000	1231.000000	1487.000000	1428.000000	1446.000000	1692.000000	1591.000000	1427.000000	1669.000000	...	10971.000000	10971.000000	10335.000000	10971.000000	10971.000000	10971.000000	10971.000000	10971.000000	10335.000000	10971.000000
mean	2015-08-01 11:21:44.347826176	257.409827	18746.527774	26093.394210	15887.914671	32816.223672	12214.501531	14456.949485	18712.062957	13537.006477	...	9544.469711	10108.030084	22.041332	8.000055	0.982352	1.002703	0.962405	0.982617	8.668432	9.338684
min	2007-01-01 00:00:00	96.530000	4400.000000	7472.000000	4000.000000	7646.000000	2350.000000	2235.000000	6600.000000	2295.000000	...	1879.040000	2033.390000	-56.390000	7.600000	0.490000	0.500000	0.480000	0.500000	-34.200000	9.200000
25%	2011-04-01 00:00:00	180.000000	11400.000000	14612.500000	9274.250000	21600.000000	7000.000000	8150.000000	13131.000000	7000.000000	...	5172.030000	5520.775000	-10.315000	7.600000	0.730000	0.740000	0.710000	0.730000	-4.255000	9.200000
50%	2015-08-01 00:00:00	247.520000	16240.000000	22483.870000	14577.420000	26612.500000	10425.000000	12777.500000	16500.000000	11550.000000	...	6530.150000	6835.050000	3.680000	7.600000	0.840000	0.850000	0.820000	0.840000	4.380000	9.200000
75%	2019-12-01 00:00:00	307.690000	22266.000000	34729.305000	20000.000000	40900.000000	17245.890000	19280.335000	25208.335000	18000.000000	...	13788.490000	14868.200000	32.205000	7.600000	1.120000	1.150000	1.090000	1.125000	16.750000	9.200000
max	2024-03-01 00:00:00	711.390000	58000.000000	74150.000000	44916.130000	96774.190000	27641.940000	48130.670000	44800.000000	51482.670000	...	60108.680000	63171.690000	376.000000	10.000000	3.000000	3.050000	2.820000	2.920000	111.010000	10.000000
std	NaN	113.073320	10216.739859	14093.784599	8274.731001	15937.954470	6189.131596	8260.859959	8122.121105	8660.917417	...	6285.769032	6571.308763	54.477319	0.846585	0.370157	0.378391	0.362189	0.371007	19.617331	0.291878

1.¶

2.¶

3.¶

4.¶

5.¶

6.¶

7.¶

8.¶