import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

/var/folders/gc/0752xrm56pnf0r0dsrn5370c0000gr/T/ipykernel_24214/2660915427.py:2: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd

# Load the raw export line by line. The original .csv must have been renamed
# to .txt before this cell is run.
with open('district_Reported_Offenders_Number.txt', 'r') as source_file:
    lines = source_file.readlines()

# Leave the first line untouched. Every later line gets its 1-based line
# number as a new leading field and loses everything after its last comma.
# (Per the author's note, the trailing value in the raw data is a running
# line counter that ended up in the wrong position.)
lines[1:] = [
    f"{row_number},{row.rpartition(',')[0]}\n"
    for row_number, row in enumerate(lines[1:], start=1)
]

# Persist the repaired text so it can be parsed as CSV.
with open('modified_file.txt', 'w') as repaired_file:
    repaired_file.writelines(lines)

# Parse the repaired text and save it again under its final .csv name.
df = pd.read_csv('modified_file.txt')
df.to_csv('district_crimes.csv', index=False)

# Load the cleaned dataset produced in the previous step.
crimes = pd.read_csv("district_crimes.csv")

# cM is the number of rows and cN the number of columns.
cM, cN = crimes.shape
print((cM, cN))

# Preview the first rows (rendered by the notebook as the cell result).
crimes.head()

(25020, 93)

# Identifying columns plus the drug-related offence counts; everything else
# in `crimes` is dropped.
columns_to_keep1 = [
    'District',
    'Month Year',
    'Age',
    'Sex',
    'Drug Offences',
    'Trafficking Drugs',
    'Possess Drugs',
    'Produce Drugs',
    'Sell Supply Drugs',
    'Other Drug Offences',
]

# Work on an independent copy so the full `crimes` frame is left untouched.
drug_crimes = crimes.loc[:, columns_to_keep1].copy()

# Keep only rows with a positive 'Drug Offences' value. According to the
# author's note that column is the total of the other drug columns, so a
# zero there means the row has no drug-related offences at all.
has_drug_offence = drug_crimes['Drug Offences'].gt(0)
drug_crimes = drug_crimes[has_drug_offence]

# Bounds of the filtered dataset: dcN is the row count, dcM the column count.
dcN, dcM = drug_crimes.shape
print((dcN, dcM))

drug_crimes.head()

(16531, 10)

# List every column of drug_crimes together with its dtype, one per line.
for column_name, column_dtype in drug_crimes.dtypes.items():
    print(column_name + ": " + str(column_dtype))

District: object
Month Year: object
Age: object
Sex: object
Drug Offences: int64
Trafficking Drugs: int64
Possess Drugs: int64
Produce Drugs: int64
Sell Supply Drugs: int64
Other Drug Offences: int64

# Per-column flag: True when the column contains at least one missing value.
missing_values = drug_crimes.isna().any()
print("Is there any missing values in all columns: ")
print(missing_values)
print("")

# Per-column flag for the count columns (positions 4 through 9):
# True when the column contains at least one value below zero.
negative_values = drug_crimes.iloc[:, 4:10].lt(0).any()
print("Is there any negative values in the quanitative columns: ")
print(negative_values)
print("")

Is there any missing values in all columns: 
District               False
Month Year             False
Age                    False
Sex                    False
Drug Offences          False
Trafficking Drugs      False
Possess Drugs          False
Produce Drugs          False
Sell Supply Drugs      False
Other Drug Offences    False
dtype: bool

Is there any negative values in the quanitative columns: 
Drug Offences          False
Trafficking Drugs      False
Possess Drugs          False
Produce Drugs          False
Sell Supply Drugs      False
Other Drug Offences    False
dtype: bool

# Categorical columns whose number of distinct values is reported.
categorical_columns = ['District', 'Month Year', 'Age', 'Sex']

# One distinct-value count per categorical column, in the order listed above.
unique_counts = [drug_crimes[name].nunique() for name in categorical_columns]

# Report each column next to its count.
print("Number of unique values for each column:")
for name, distinct in zip(categorical_columns, unique_counts):
    print(f"{name}: {distinct}")

Number of unique values for each column:
District: 15
Month Year: 278
Age: 2
Sex: 3

# Count the rows whose fourth column (position 3) holds the placeholder
# "Not Stated".
fourth_column = drug_crimes.iloc[:, 3]
count_not_stated = fourth_column.eq("Not Stated").sum()
print("Number of times 'Not Stated' appears in the column:", count_not_stated)

Number of times 'Not Stated' appears in the column: 282

# Discard every row whose 'Sex' is recorded as "Not Stated".
sex_is_stated = drug_crimes["Sex"].ne("Not Stated")
drug_crimes = drug_crimes[sex_is_stated]

# Shape after filtering: fN rows, fM columns.
fN, fM = drug_crimes.shape

# Number of rows removed by the filter (dcN is the row count before it).
difference = dcN - fN

#print(difference) difference is 282

# Take 'Month Year' out of the frame (it is not needed afterwards) and keep
# only its last two characters as a new 'Year' column. This assumes the
# column holds strings.
month_year = drug_crimes.pop('Month Year')
drug_crimes['Year'] = month_year.str[-2:]

# Categorical columns after 'Month Year' has been replaced by 'Year'.
categorical_columns = ['District', 'Year', 'Age', 'Sex']

# One distinct-value count per categorical column, in the order listed above.
unique_counts = [drug_crimes[name].nunique() for name in categorical_columns]

# Report each column next to its count.
print("Number of unique values for each column:")
for name, distinct in zip(categorical_columns, unique_counts):
    print(f"{name}: {distinct}")

drug_crimes.head()

Number of unique values for each column:
District: 15
Year: 24
Age: 2
Sex: 2

# Select the numeric offence-count columns to screen for outliers.
quantitative_columns = drug_crimes[[
    'Drug Offences',
    'Trafficking Drugs',
    'Possess Drugs',
    'Produce Drugs',
    'Sell Supply Drugs',
    'Other Drug Offences',
]]

# Column-wise z-scores: (value - column mean) / column standard deviation.
# The cell previously called `zscore`, which was never imported and raised a
# NameError. Computing it directly with pandas needs no extra dependency;
# ddof=0 (population standard deviation) matches scipy.stats.zscore's default.
column_means = quantitative_columns.mean()
column_stds = quantitative_columns.std(ddof=0)
z_scores = (quantitative_columns - column_means) / column_stds

# A value is flagged as an outlier when it lies more than three standard
# deviations from its column mean, in either direction.
outliers = z_scores.abs() > 3

# Print the count of outliers for each column.
print("Out of " + str(len(drug_crimes)) + " total recordings,")
print("The number of outliers in each quantitative column is listed below:")
print(outliers.sum())

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 5
      2 quantitative_columns = drug_crimes[['Drug Offences', 'Trafficking Drugs', 'Possess Drugs', 'Produce Drugs', 'Sell Supply Drugs', 'Other Drug Offences']]
      4 # Calculate z-scores for each data point in the selected columns
----> 5 z_scores = quantitative_columns.apply(zscore)
      7 # Define a threshold for identifying outliers (e.g., z-score > 3 or < -3)
      8 outliers = (z_scores > 3) | (z_scores < -3)

NameError: name 'zscore' is not defined

# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of drug_crimes, printed as a table.
summary = drug_crimes.describe()
print(summary)

       Drug Offences  Trafficking Drugs  Possess Drugs  Produce Drugs  \
count   16249.000000       16249.000000   16249.000000   16249.000000   
mean       75.867007           0.500892      35.193243       2.302849   
std       102.185848           1.357915      49.305527       3.848647   
min         1.000000           0.000000       0.000000       0.000000   
25%        10.000000           0.000000       5.000000       0.000000   
50%        32.000000           0.000000      15.000000       1.000000   
75%       103.000000           0.000000      46.000000       3.000000   
max       886.000000          24.000000     457.000000      33.000000   

       Sell Supply Drugs  Other Drug Offences  
count       16249.000000         16249.000000  
mean            2.376700            35.493323  
std             4.825648            46.833235  
min             0.000000             0.000000  
25%             0.000000             5.000000  
50%             1.000000            15.000000  
75%             3.000000            50.000000  
max           185.000000           401.000000

# Offence-type columns compared against each other.
cols = [
    'Trafficking Drugs',
    'Possess Drugs',
    'Produce Drugs',
    'Sell Supply Drugs',
    'Other Drug Offences',
]

# Pairwise correlation matrix of the selected columns.
offence_counts = drug_crimes.loc[:, cols]
corr = offence_counts.corr()

# Draw the matrix as an annotated heatmap.
sns.heatmap(corr, annot=True, cmap='plasma')
plt.title('Correlation Matrix')
plt.show()

# Create a figure holding a 3x2 grid of axes for the scatter plots drawn below.
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(20, 15))

def annotate_correlation(x, y, ax):
    """Write the correlation coefficient of *x* and *y* onto *ax*.

    The coefficient is read from the off-diagonal entry of
    ``np.corrcoef(x, y)`` and rendered with two decimals as
    ``Correlation: <r>``. The text is centred horizontally near the top of
    the axes (axes-fraction coordinates 0.5, 0.9).

    Parameters:
        x, y: equal-length sequences of numbers accepted by ``np.corrcoef``.
        ax: object exposing an ``annotate`` method (e.g. a matplotlib Axes).
    """
    coefficient_matrix = np.corrcoef(x, y)
    r_value = coefficient_matrix[0][1]
    label = f'Correlation: {r_value:.2f}'
    ax.annotate(label, xy=(0.5, 0.9), xycoords='axes fraction', ha='center')

# (x column, y column) pairs to plot, in the order they fill the grid
# row by row: [0, 0], [0, 1], [1, 0], [1, 1], [2, 0].
scatter_pairs = [
    ('Other Drug Offences', 'Possess Drugs'),
    ('Possess Drugs', 'Sell Supply Drugs'),
    ('Possess Drugs', 'Produce Drugs'),
    ('Other Drug Offences', 'Produce Drugs'),
    ('Other Drug Offences', 'Sell Supply Drugs'),
]

# Flatten the 2-D grid of axes so it can be walked in lockstep with the pairs.
flat_axes = axs.ravel()

# One scatter plot per pair, each titled, labelled and annotated with its
# correlation coefficient.
for ax, (x_column, y_column) in zip(flat_axes, scatter_pairs):
    ax.scatter(drug_crimes[x_column], drug_crimes[y_column])
    ax.set_title(f'{x_column} vs {y_column}')
    ax.set_xlabel(x_column)
    ax.set_ylabel(y_column)
    annotate_correlation(drug_crimes[x_column], drug_crimes[y_column], ax)

# The grid has more axes than there are pairs; switch the leftover ones off
# so no empty frame is drawn.
for unused_ax in flat_axes[len(scatter_pairs):]:
    unused_ax.set_axis_off()

# Adjust layout
plt.tight_layout()

# Show the plots
plt.show()

# Offence categories to compare. 'Drug Offences' is left out because, per the
# author's note, it is just the total of the other five.
quantitative_columns = [
    'Trafficking Drugs',
    'Possess Drugs',
    'Produce Drugs',
    'Sell Supply Drugs',
    'Other Drug Offences',
]

# For each age group, count the rows in which each offence type is non-zero.
# This records whether an offence type occurred in a row, not how many
# offences the row reports.
offences_by_age = (
    drug_crimes
    .groupby('Age')[quantitative_columns]
    .apply(lambda group: group.ne(0).sum())
)

# Grouped bar chart: one cluster per age group, one bar per offence type.
ax = offences_by_age.plot.bar()
ax.set_title('Distribution of Total Number of Offences by Age and Offence Type (From JAN01-FEB24)')
ax.set_xlabel('Age')
ax.set_ylabel('Total Number of Offences')
plt.xticks(rotation=0)

# Anchor the legend just outside the right edge of the axes, aligned to the top.
plt.legend(title='Offence Type', bbox_to_anchor=(1.02, 1), loc='upper left')

plt.show()

# Requires the prepared 'drug_crimes' DataFrame from the earlier cells.

# Offence categories to compare.
quantitative_columns = [
    'Trafficking Drugs',
    'Possess Drugs',
    'Produce Drugs',
    'Sell Supply Drugs',
    'Other Drug Offences',
]

# For each sex, count the rows in which each offence type is non-zero.
# NOTE: the result is grouped by 'Sex' even though it is stored under the
# name `offences_by_age`, replacing the per-age table from the previous cell.
offences_by_age = (
    drug_crimes
    .groupby('Sex')[quantitative_columns]
    .apply(lambda group: group.ne(0).sum())
)

# Grouped bar chart: one cluster per sex, one bar per offence type.
offences_by_age.plot.bar()
plt.title('Distribution of Total Number of Offences by Sex and Offence Type (From JAN01-FEB24)')
plt.xlabel('Sex')
plt.ylabel('Total Number of Offences')
plt.xticks(rotation=0)

# Anchor the legend just outside the right edge of the axes, aligned to the top.
plt.legend(title='Offence Type', bbox_to_anchor=(1.02, 1), loc='upper left')

plt.show()

import matplotlib.pyplot as plt

# Offence categories shown in this chart.
quantitative_columns = ['Trafficking Drugs', 'Possess Drugs', 'Produce Drugs', 'Sell Supply Drugs']

# For every (Sex, Age) combination, count the rows in which each offence type
# is non-zero.
offences_by_age_sex = (
    drug_crimes
    .groupby(['Sex', 'Age'])[quantitative_columns]
    .apply(lambda group: group.ne(0).sum())
)

# Move 'Age' from the row index into the columns, so the x-axis carries one
# cluster per sex and each bar is an (offence type, age group) pair.
offences_by_age_sex.unstack('Age').plot(kind='bar', figsize=(12, 8), width=0.8)
plt.title('Number of Drug Offences Divided by Sex and Compared Between Age Groups (From JAN01-FEB24)')
# The x-axis holds the 'Sex' index. An earlier plt.xlabel('Age Group') call
# was removed because it was immediately overwritten by this label.
plt.xlabel('Sex')
plt.ylabel('Number of Drug Offences (From JAN01-FEB24)')
plt.xticks(rotation=0)
# Anchor the legend just outside the right edge of the axes, aligned to the top.
plt.legend(title='Offence Type', bbox_to_anchor=(1.02, 1), loc='upper left')
plt.show()

	District	Month Year	Age	Sex	...	Fare Evasion	Public Nuisance	Traffic and Related Offences	Dangerous Operation of a Vehicle	Drink Driving	Disqualified Driving	Interfere with Mechanism of Motor Vehicle	Miscellaneous Offences	Other Offences
0	Capricornia	JAN01	Adult	Female	...	0	16	11	0	9	2	0	0	70
1	Capricornia	JAN01	Adult	Male	...	2	70	126	6	98	21	1	1	415
2	Capricornia	JAN01	Adult	Not Stated	...	0	0	0	0	0	0	0	0	0
3	Capricornia	JAN01	Juvenile	Female	...	0	1	0	0	0	0	0	3	11
4	Capricornia	JAN01	Juvenile	Male	...	0	12	0	0	0	0	0	1	41