Introduction¶

Imports¶

Below are a few important packages that may be used to analyze, manipulate, and visualize the data.

In [1]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
plt.rcParams['figure.figsize'] = [10, 5]

Dataset¶

This dataset contains data on all police-recorded motor vehicle collisions in New York City. Each row represents a reported vehicle collision with information about where the incident occurred, the circumstances surrounding the incident, and how many people were injured or killed.

This dataset is provided by the New York Police Department (NYPD) through the official NYC OpenData website and has been updated frequently since 2014. Because the original dataset contains over 2 million entries, a sample of 100,000 crashes from 2021 and 2022 has been imported from the official website.

Below is code importing the CSV from the NYC OpenData web API.

In [2]:
#Reading in the data from the NYC Open Data API
#Note: the $limit parameter is set to 100,000 to avoid performance and memory issues
crashes = pd.read_csv('https://data.cityofnewyork.us/resource/h9gi-nx95.csv?$limit=100000')

display(crashes.head(5))
crash_date crash_time borough zip_code latitude longitude location on_street_name off_street_name cross_street_name number_of_persons_injured number_of_persons_killed number_of_pedestrians_injured number_of_pedestrians_killed number_of_cyclist_injured number_of_cyclist_killed number_of_motorist_injured number_of_motorist_killed contributing_factor_vehicle_1 contributing_factor_vehicle_2 contributing_factor_vehicle_3 contributing_factor_vehicle_4 contributing_factor_vehicle_5 collision_id vehicle_type_code1 vehicle_type_code2 vehicle_type_code_3 vehicle_type_code_4 vehicle_type_code_5
0 2021-09-11T00:00:00.000 2:39 NaN NaN NaN NaN NaN WHITESTONE EXPRESSWAY 20 AVENUE NaN 2 0 0 0 0 0 2 0 Aggressive Driving/Road Rage Unspecified NaN NaN NaN 4455765 Sedan Sedan NaN NaN NaN
1 2022-03-26T00:00:00.000 11:45 NaN NaN NaN NaN NaN QUEENSBORO BRIDGE UPPER NaN NaN 1 0 0 0 0 0 1 0 Pavement Slippery NaN NaN NaN NaN 4513547 Sedan NaN NaN NaN NaN
2 2022-06-29T00:00:00.000 6:55 NaN NaN NaN NaN NaN THROGS NECK BRIDGE NaN NaN 0 0 0 0 0 0 0 0 Following Too Closely Unspecified NaN NaN NaN 4541903 Sedan Pick-up Truck NaN NaN NaN
3 2021-09-11T00:00:00.000 9:35 BROOKLYN 11208.0 40.667202 -73.866500 \n, \n(40.667202, -73.8665) NaN NaN 1211 LORING AVENUE 0 0 0 0 0 0 0 0 Unspecified NaN NaN NaN NaN 4456314 Sedan NaN NaN NaN NaN
4 2021-12-14T00:00:00.000 8:13 BROOKLYN 11233.0 40.683304 -73.917274 \n, \n(40.683304, -73.917274) SARATOGA AVENUE DECATUR STREET NaN 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN 4486609 NaN NaN NaN NaN NaN

Preprocessing¶

There are a few important groups of data that are found in the dataset:

  • Time Data: The precise time and date of the event are provided for every crash in the 'crash_date' and 'crash_time' columns
  • Location Data: The location of the crash is recorded in 'borough', 'zip_code', 'on_street_name', etc.
  • Fatality and Injury Data: Data on the number of injuries and fatalities is in 'number_of_(civilian type)_(injured/killed)'
  • Factors and Vehicles: Information on presumed reasons for the crash and the makes of the cars involved is found in the 'contributing_factors' and 'vehicle_type_codes' columns

Below, all 29 of the column names are printed.

In [3]:
#Printing the column names
display(pd.DataFrame(crashes.columns).transpose()) 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
0 crash_date crash_time borough zip_code latitude longitude location on_street_name off_street_name cross_street_name number_of_persons_injured number_of_persons_killed number_of_pedestrians_injured number_of_pedestrians_killed number_of_cyclist_injured number_of_cyclist_killed number_of_motorist_injured number_of_motorist_killed contributing_factor_vehicle_1 contributing_factor_vehicle_2 contributing_factor_vehicle_3 contributing_factor_vehicle_4 contributing_factor_vehicle_5 collision_id vehicle_type_code1 vehicle_type_code2 vehicle_type_code_3 vehicle_type_code_4 vehicle_type_code_5

Column Deletion¶

Below, data on the exact latitude and longitude is removed from the dataframe. Because we already have the borough and street name where the crash occurred, the exact GPS location will not be helpful in analyzing this data. The 'cross_street_name' and 'off_street_name' columns were deleted as well because they are very sparse and have an ambiguous meaning as detailed by the NYPD site, and 'zip_code' is dropped since 'borough' and 'on_street_name' already locate the crash. Finally, the 'collision_id' column is removed because it does not contain any important data on a crash: it is a key generated to uniquely identify each crash in the NYPD database, and will not be useful to this analysis.

In [4]:
#Deleting GPS location and collision_id columns
crashes.drop(['latitude', 'longitude', 'location', 'collision_id', 'zip_code', 'cross_street_name', 'off_street_name'], axis=1, inplace=True)

This dataset has 5 columns each for contributing factors and vehicle types. However, very few crashes actually record data in all of these columns; many entries are listed as NaN or Unspecified. Below is a bar plot showing what percentage of crashes contain data in each contributing factor and vehicle type column.

In [5]:
cfs = crashes[['contributing_factor_vehicle_1', 'contributing_factor_vehicle_2', 'contributing_factor_vehicle_3', 'contributing_factor_vehicle_4', 'contributing_factor_vehicle_5']]
vtc = crashes[['vehicle_type_code1', 'vehicle_type_code2', 'vehicle_type_code_3', 'vehicle_type_code_4', 'vehicle_type_code_5']]

#Counting the number of crashes that have a specified contributing factor and vehicle code for each #
cfsUNDEFINED = (cfs == 'Unspecified')
cfsNANs = cfs.isna()
cfsNullCounts = (cfsUNDEFINED | cfsNANs).sum()
cfsDataCounts = cfs.shape[0] - cfsNullCounts
cfsDataPct = cfsDataCounts / cfs.shape[0] * 100

vtcNANs = vtc.isna()
vtcNullCounts = vtcNANs.sum()
vtcDataCounts = vtc.shape[0] - vtcNullCounts
vtcDataPct = vtcDataCounts / vtc.shape[0] * 100

DataPct = pd.concat([cfsDataPct, vtcDataPct], axis=1)

#Plotting the percentage of crashes with a specified contributing factor for each #
DataPct.index = ['Factor 1', 'Factor 2', 'Factor 3', 'Factor 4', 'Factor 5', 'Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4', 'Vehicle 5']
DataPct.plot(kind='bar')
plt.title('Percentage of Crashes with a Specified Contributing Factor/Vehicle Type')
plt.xlabel('Factor/Vehicle #')
plt.ylabel('Percentage')
plt.xticks(rotation=0)
plt.legend(['Contributing Factor', 'Vehicle Type'], loc='upper left')
plt.gca().set_yticks(np.arange(0, 101, 10))
plt.gca().set_yticklabels(['{:.0f}%'.format(x) for x in plt.gca().get_yticks()])
for i, v in enumerate(DataPct[0][0:5]):
    plt.text(i-0.35, v+2, '{:.1f}%'.format(v))
for i, v in enumerate(DataPct[1][5:10]):
    plt.text(i+4.9, v+1.5, '{:.1f}%'.format(v))

plt.show()

This graph shows that most rows contain data for at least two vehicles, and many record up to two contributing factors; however, very few use all five of either. All columns that contain data in less than 0.5% of entries (fewer than 500 of the 100,000 rows) will be deleted. This includes 'contributing_factor_vehicle_4' and 'contributing_factor_vehicle_5'.

In [6]:
crashes.drop(['contributing_factor_vehicle_4', 'contributing_factor_vehicle_5'], axis=1, inplace=True)
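The same threshold could also be applied programmatically instead of naming columns by hand. Below is a minimal sketch on a toy DataFrame; the `drop_sparse` helper and the 5% threshold are illustrative, not part of this notebook's pipeline:

```python
import pandas as pd
import numpy as np

#Toy frame: column 'c' has real data in only 1% of rows, so it falls under a 5% threshold
df = pd.DataFrame({
    'a': ['x'] * 100,
    'b': ['x'] * 50 + ['Unspecified'] * 50,
    'c': ['x'] + [np.nan] * 99,
})

def drop_sparse(df, threshold=0.05):
    #Fraction of entries per column that are neither NaN nor 'Unspecified'
    filled = df.notna() & (df != 'Unspecified')
    filled = filled.mean()
    return df.drop(columns=filled[filled < threshold].index)

df = drop_sparse(df)
print(list(df.columns))  # column 'c' is dropped
```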

Column Addition and Null Conversion¶

For all qualitative columns, any NaN or 'Unspecified' entries will be changed to 'None'. This applies to all contributing factors, vehicle types, boroughs, and street names.

In [7]:
crashes = crashes.fillna('None')
crashes = crashes.replace('Unspecified', 'None')

As it is now, the 'crash_date' column is hard to parse. Instead, it will be split into 'year', 'month', and 'day' columns for easier access. A 'timeofday' column will also be derived from the crash time as a categorical simplification: a crash can take place during the Night (12AM to 6AM), Morning (6AM to 12PM), Afternoon (12PM to 6PM), or Evening (6PM to 12AM).

The dataset will also be sorted by date for easier viewing, and the 'crash_time' column will be reformatted to an integer minute of the day (0 to 1439, i.e. 12:00 AM to 11:59 PM). Finally, a toTime function will be defined to convert the time in minutes back to a readable string when necessary.

In [8]:
def toTime(x):
    #Converts a time in minutes (0-1439) to an HH:MM AM/PM string
    hour = int(x // 60) % 12
    hour = 12 if hour == 0 else hour
    return '{}:{:02d} {}'.format(hour, int(x % 60), 'AM' if x < 720 else 'PM')

crashes['crash_date'] = pd.to_datetime(crashes['crash_date'])
crashes['year'] = crashes['crash_date'].dt.year
crashes['day'] = crashes['crash_date'].dt.day_name()
crashes['month'] = crashes['crash_date'].dt.month
crashes['month'] = crashes['month'].replace({1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'})
crashes.sort_values(by='crash_date', inplace=True)
crashes.reset_index(drop=True, inplace=True)
crashes['crash_time'] = pd.to_datetime(crashes['crash_time'], format='%H:%M')
crashes['crash_time'] = crashes['crash_time'].dt.hour * 60 + crashes['crash_time'].dt.minute
crashes['timeofday'] = pd.cut(crashes['crash_time'], bins=[-1, 360, 720, 1080, 1440], labels=['Night', 'Morning', 'Afternoon', 'Evening'])

display(crashes.head(3))
crash_date crash_time borough on_street_name number_of_persons_injured number_of_persons_killed number_of_pedestrians_injured number_of_pedestrians_killed number_of_cyclist_injured number_of_cyclist_killed number_of_motorist_injured number_of_motorist_killed contributing_factor_vehicle_1 contributing_factor_vehicle_2 contributing_factor_vehicle_3 vehicle_type_code1 vehicle_type_code2 vehicle_type_code_3 vehicle_type_code_4 vehicle_type_code_5 year day month timeofday
0 2012-07-27 1253 BROOKLYN RALPH AVENUE 0 0 0 0 0 0 0 0 Failure to Yield Right-of-Way None None Station Wagon/Sport Utility Vehicle E-Scooter None None None 2012 Friday July Evening
1 2012-08-01 622 BROOKLYN PITKIN AVENUE 1 0 0 0 1 0 0 0 None None None Station Wagon/Sport Utility Vehicle Bike None None None 2012 Wednesday August Morning
2 2012-09-25 756 QUEENS WEIRFIELD STREET 0 0 0 0 0 0 0 0 Prescription Medication None None Station Wagon/Sport Utility Vehicle Station Wagon/Sport Utility Vehicle None None None 2012 Tuesday September Afternoon

A 'vehicles_involved' column will be added to track how many vehicles were involved in the crash, computed as the number of 'vehicle_type_code' columns that are not 'None'. Finally, the columns will be renamed and reordered for easier viewing and use. Below, the new set of columns is printed.

In [9]:
crashes['vehicles_involved'] = (crashes[['vehicle_type_code1', 'vehicle_type_code2', 'vehicle_type_code_3', 'vehicle_type_code_4', 'vehicle_type_code_5']] != 'None').sum(axis=1)

crashes.rename(
    columns={
        'crash_date': 'date',
        'crash_time': 'time',
        'on_street_name': 'street',
        'number_of_persons_injured': 'injured',
        'number_of_persons_killed': 'killed',
        'number_of_pedestrians_injured': 'pedestrians_injured',
        'number_of_pedestrians_killed': 'pedestrians_killed',
        'number_of_cyclist_injured': 'cyclists_injured',
        'number_of_cyclist_killed': 'cyclists_killed',
        'number_of_motorist_injured': 'motorists_injured',
        'number_of_motorist_killed': 'motorists_killed',
        'contributing_factor_vehicle_1': 'factor1',
        'contributing_factor_vehicle_2': 'factor2',
        'contributing_factor_vehicle_3': 'factor3',
        'vehicle_type_code1': 'vehicle1',
        'vehicle_type_code2': 'vehicle2',
        'vehicle_type_code_3': 'vehicle3',
        'vehicle_type_code_4': 'vehicle4',
        'vehicle_type_code_5': 'vehicle5'

    },
    inplace=True
)
cols = ['date', 'year', 'month','day', 'time', 'timeofday', 'borough', 'street', 'injured', 'killed', 'pedestrians_injured', 'pedestrians_killed', 'cyclists_injured', 'cyclists_killed', 'motorists_injured', 'motorists_killed', 'factor1', 'factor2', 'factor3', 'vehicle1', 'vehicle2', 'vehicle3', 'vehicle4', 'vehicle5', 'vehicles_involved']
crashes = crashes[cols]
display(pd.DataFrame(crashes.columns).transpose())
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
0 date year month day time timeofday borough street injured killed pedestrians_injured pedestrians_killed cyclists_injured cyclists_killed motorists_injured motorists_killed factor1 factor2 factor3 vehicle1 vehicle2 vehicle3 vehicle4 vehicle5 vehicles_involved

Outlier Analysis¶

The box and whisker plot below shows how many outliers are in each injury and fatality category. The number printed on the left of each box is the mean of the column, and the number on the right is the count of outliers in the column. It can be seen that most accidents don't lead to deaths or injuries, so in most columns the outlier rule flags any injury or fatality at all. This is especially prevalent in the 'motorists_injured' column, where every nonzero value is treated as an outlier, resulting in a huge number of outliers.
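The outlier counts here use the standard IQR fence: a value is flagged when it exceeds Q3 + 1.5 * (Q3 - Q1). A minimal sketch of the rule on a toy series that is mostly zeros, mirroring the injury columns:

```python
import pandas as pd

#Toy data resembling an injury column: mostly zeros with a couple of nonzero values
s = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 1, 4])
q1, q3 = s.quantile(0.25), s.quantile(0.75)
upper_fence = q3 + 1.5 * (q3 - q1)

#When Q1 == Q3 == 0, the fence is 0, so every nonzero value is an outlier
outliers = (s > upper_fence).sum()
print(upper_fence, outliers)
```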

In [10]:
#Outlier Analysis
#Creating a boxplot for the number of injured persons
plt.figure(figsize=(10, 5))
quantitative = crashes[['injured', 'killed', 'pedestrians_injured', 'pedestrians_killed', 'cyclists_injured', 'cyclists_killed', 'motorists_injured', 'motorists_killed']]
numOutliers = (quantitative > quantitative.quantile(0.75) + 1.5*(quantitative.quantile(0.75) - quantitative.quantile(0.25))).sum()
sns.boxplot(quantitative, orient='h', palette='Set2')
for i, v in enumerate(quantitative.mean()):
    plt.text(0.1, i, '{:.2f}'.format(v), va='center')
for i, v in enumerate(numOutliers):
    plt.text(19.8, i, v, ha='right', va='center')
plt.title('Boxplot of the Number of Injured or Killed Persons')
plt.xlabel('Number of Injured Persons')
plt.xticks(np.arange(0, 21, 5))
plt.show()

Below, the same analysis is applied to the 'vehicles_involved' column. This column only contains values in the range [0, 5], and the outlier rule flags any value above 3. Strangely, there are a few entries with 0 vehicles involved; these could be traffic events that didn't involve vehicles, or even a failure to record the data on the NYPD's part. For our use of this data, 0 will also be treated as an outlier value for this column.

In [11]:
#vehicles involved outliers
plt.figure(figsize=(10, 1))
vehicles = crashes['vehicles_involved']
vehiclesOutliers = (vehicles > vehicles.quantile(0.75) + 1.5*(vehicles.quantile(0.75) - vehicles.quantile(0.25))).sum()
sns.boxplot(vehicles, orient='h', palette='Set2')
plt.text(0.1, -0.15, 'μ = {:.2f}'.format(vehicles.mean()), va='center')
plt.text(5.8, 0, vehiclesOutliers, ha='right', va='center')
plt.title('Boxplot of the Number of Vehicles Involved')
plt.xlabel('Number of Vehicles Involved')
plt.xticks(np.arange(0, 7, 1))
plt.show()

#Creating a bar graph for num vehicles
vehiclesCounts = vehicles.value_counts()
vehiclesCounts = vehiclesCounts.sort_index()
vehiclesCounts.plot(kind='bar', edgecolor='black')
plt.title('Number of Vehicles Involved in Crashes')
plt.xlabel('Number of Vehicles')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=0)
for i, v in enumerate(vehiclesCounts):
    plt.text(i-0.16, v+400, v)
plt.show()
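Treating 0 as an outlier value can be expressed as a simple boolean filter. A minimal sketch on a toy frame; the column name mirrors the notebook's 'vehicles_involved', but the data is made up:

```python
import pandas as pd

#Toy frame with a couple of zero-vehicle rows
df = pd.DataFrame({'vehicles_involved': [2, 0, 1, 3, 0, 2]})

#Flag rows where no vehicle was recorded, then drop them
zero_rows = df['vehicles_involved'] == 0
print(zero_rows.sum())  # number of zero-vehicle rows
filtered = df[~zero_rows].reset_index(drop=True)
print(len(filtered))    # rows remaining after the filter
```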

Now that the columns are organized and unnecessary data has been deleted, the dataset is ready to be analyzed in detail!

Summary Data Analysis¶

As stated before, the data in this set is divided cleanly into time data, location data, fatality/injury data, factors, and vehicles. Each of these categories is further divided into columns. This portion of the analysis will examine patterns within these categories and how they relate to each other.

In [12]:
display(crashes.head(5))
date year month day time timeofday borough street injured killed pedestrians_injured pedestrians_killed cyclists_injured cyclists_killed motorists_injured motorists_killed factor1 factor2 factor3 vehicle1 vehicle2 vehicle3 vehicle4 vehicle5 vehicles_involved
0 2012-07-27 2012 July Friday 1253 Evening BROOKLYN RALPH AVENUE 0 0 0 0 0 0 0 0 Failure to Yield Right-of-Way None None Station Wagon/Sport Utility Vehicle E-Scooter None None None 2
1 2012-08-01 2012 August Wednesday 622 Morning BROOKLYN PITKIN AVENUE 1 0 0 0 1 0 0 0 None None None Station Wagon/Sport Utility Vehicle Bike None None None 2
2 2012-09-25 2012 September Tuesday 756 Afternoon QUEENS WEIRFIELD STREET 0 0 0 0 0 0 0 0 Prescription Medication None None Station Wagon/Sport Utility Vehicle Station Wagon/Sport Utility Vehicle None None None 2
3 2012-10-22 2012 October Monday 1038 Afternoon None BELT PARKWAY 0 0 0 0 0 0 0 0 Unsafe Speed Other Vehicular Other Vehicular Sedan Sedan Sedan None None 3
4 2016-04-16 2016 April Saturday 860 Afternoon BROOKLYN WEST 17 STREET 0 0 0 0 0 0 0 0 Driver Inattention/Distraction None None Sedan Station Wagon/Sport Utility Vehicle None None None 2

Time Data¶

This portion of the dataset contains all information about when an accident occurred. It includes the 'year', 'month', 'day', 'date', and 'time' columns. Across these columns a trend starts to form: more crashes seem to happen at times when more cars are on the road.

Year¶

The 'year' column contains the year in which the accident occurred. Below is a graph that shows the distribution of crashes by year. It can be seen that the vast majority of the crashes are from 2021 and 2022: over 99%, in fact. This is largely because the 100,000-entry sample was drawn from crashes in 2021 and 2022.

In [13]:
#Creating a new dataframe that contains the total number of crashes for each year
crash_years = pd.DataFrame(crashes['year'].value_counts().sort_index()).reset_index()
crash_years.columns = ['Year', 'Crashes']
crash_pct = crashes['year'].value_counts(normalize=True).sort_index()
crash_years.plot(kind='bar', x='Year', y='Crashes')
plt.title('Total Number of Crashes by Year')
plt.xlabel('Year')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=0)
for i, v in enumerate(crash_years['Crashes']):
    plt.text(i-0.22, v+500, v)
for i, v in enumerate(crash_pct):
    plt.text(i-0.24, 5000, '{:.1f}%'.format(v*100))
plt.show()

Month¶

The 'month' column contains the month when each crash took place (January, April, etc.). Below is a bar graph of the total number of crashes that took place in each month. It can be seen that the majority of crashes take place during warmer months, while the crash rate dips during the winter. This could indicate that people drive more often during the summer and early autumn than during the winter, and therefore get into more crashes during that time.

In [14]:
crash_months = pd.DataFrame(crashes['month'].value_counts())
crash_months.columns = ['Crashes']
crash_months = crash_months.reindex(['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'])
crash_months.plot(kind='bar', color='orange', edgecolor='black')
plt.title('Total Number of Crashes by Month')
plt.xlabel('Month')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=45)
plt.axhline(crash_months['Crashes'].mean(), color='red', linestyle='dashdot', label='Average')
plt.text(11.6, crash_months['Crashes'].mean(), 'Average: {:.0f}'.format(crash_months['Crashes'].mean()))
plt.legend(loc = 'lower right')
for i, v in enumerate(crash_months['Crashes']):
    plt.text(i-0.25, v+100, v)

plt.show()

Day¶

The 'day' column contains the day of the week when the crash took place. Below is a similar bar graph showing the number of crashes that occur on each day. The distribution is generally uniform, with a noticeable peak on Friday. This makes sense, as Friday is generally when people are out driving the most, either celebrating the end of the work or school week or leaving the office for the weekend. The minimum is on Sunday, which also makes sense: most people don't have work and aren't going out on Sundays, so there should be fewer cars on the road.

In [15]:
crash_days = pd.DataFrame(crashes['day'].value_counts())
crash_days.columns = ['Crashes']
crash_days = crash_days.reindex(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
crash_days.plot(kind='bar', color='green', edgecolor='black')
plt.title('Total Number of Crashes by Day')
plt.xlabel('Day')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=0)
plt.axhline(crash_days['Crashes'].mean(), color='lightgreen', linestyle='dashdot', label='Average')
plt.text(6.6, crash_days['Crashes'].mean(), 'Average: {:.0f}'.format(crash_days['Crashes'].mean()))
plt.legend( loc='lower right')
for i, v in enumerate(crash_days['Crashes']):
    plt.text(i-0.19, v+150, v)

plt.show()

Time¶

The 'time' column gives the approximate minute of the day when an accident occurred (in ET). Below is a histogram of the number of crashes that happened in each 30-minute interval of the day. The red line is a 5th-degree least-squares trend curve that estimates how the rate of crashes evolves continuously throughout the day: it helps us visualize what a continuous analysis would look like.

It can be seen that the most crashes happen between 5 and 6 PM, while the fewest happen between 3 and 4 AM. This fits the idea, established by the other time-based columns, that more crashes happen when more cars are on the road. Strangely, a huge number of crashes are recorded at exactly 12:00 AM. It seems plausible that 12:00 AM is used as a default time when no time is entered into the NYPD system, which would explain the strange peak.

In [16]:
#line graph crashes by time
plt.figure(figsize=(10, 5))
crashes['time'].plot(kind='hist', bins=48, edgecolor='black')

#least squares 5th-degree polynomial fit
x = np.arange(0, 1440, 30)
y = crashes['time'].value_counts(bins = 48).sort_index()
coeffs = np.polyfit(x, y, 5)
poly = np.poly1d(coeffs)
plt.plot(x, poly(x), color='red', linestyle='solid', label='Least Squares Fit', linewidth=2)

plt.title('Total Number of Crashes by Time')
plt.xlabel('Time')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=20)
plt.gca().set_xticks(np.arange(0, 1441, 120))
plt.gca().set_xticklabels([toTime(x) for x in plt.gca().get_xticks()])
plt.legend(loc='upper right')
plt.show()
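One way to probe the 'default time' idea is to compare the count at exactly minute 0 against nearby minutes. The sketch below uses synthetic data with an artificial midnight spike; a real check would use crashes['time'] instead:

```python
import pandas as pd
import numpy as np

#Synthetic minute-of-day data: uniform noise plus an artificial spike at minute 0
rng = np.random.default_rng(0)
times = pd.Series(np.concatenate([rng.integers(0, 1440, 5000),
                                  np.zeros(500, dtype=int)]))

counts = times.value_counts()
at_midnight = counts.get(0, 0)
#Average count over the minutes adjacent to midnight (just before and after)
neighbors = counts.reindex([1, 2, 3, 1437, 1438, 1439], fill_value=0).mean()

#A large ratio suggests 12:00 AM is over-represented relative to nearby minutes
print(at_midnight, round(at_midnight / neighbors, 1))
```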

Holidays¶

To further test the hypothesis that crashes happen more often when more cars are on the road, it makes sense to check which dates have the most incidents. If the hypothesis is correct, then busy dates such as major holidays should see more crashes than average. This seems to be the case, as many of the busiest holidays have more crashes than average. Notably, more than double the average number of crashes happened on Halloween 2021. For reference, the day with the most crashes is plotted as the rightmost bar.

In [17]:
datecounts = crashes['date'].value_counts()
average = datecounts.mean()
maxdate = datecounts.idxmax()
holidaycounts = pd.DataFrame(datecounts[[ '2021-12-25', '2021-12-24', '2021-10-31', '2021-12-31', '2021-07-04', str(maxdate)]])
holidaycounts.columns = ['Crashes']
holidaycounts.index = ['Christmas', 'Christmas Eve', 'Halloween', 'New Year\'s Eve', '4th of July', 'Most Crashes']
holidaycounts.plot(kind='bar', color='purple', edgecolor='black')
plt.title('Total Number of Crashes on Holidays')
plt.xlabel('Date')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=45)
plt.axhline(average, color='red', linestyle='dashdot', label='Average')
plt.text(5.6, average, 'Average: {:.0f}'.format(average))
plt.legend(loc='upper left')
for i, v in enumerate(holidaycounts['Crashes']):
    plt.text(i-0.1, v+4, v)
plt.show()

Location Data¶

This section of the dataset contains information about where in the city an incident occurred. It contains the 'borough' and 'street' columns.

Borough¶

The 'borough' column details the borough in which the incident took place. About a third of the incidents do not have a recorded borough (indicated by the None entry). A majority of the crashes took place in Brooklyn and Queens, while less than 3% took place in Staten Island. The ordering of the crash counts seems to strongly reflect the populations of the boroughs.

In [18]:
boroughs = pd.DataFrame(crashes['borough'].value_counts())
boroughs.columns = ['Crashes']
boroughs.plot(kind='bar', color='pink', edgecolor='black')
boroughpct = crashes['borough'].value_counts(normalize=True)
plt.title('Total Number of Crashes by Borough')
plt.xlabel('Borough')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=45)
plt.axhline(boroughs['Crashes'].mean(), color='red', linestyle='dashdot', label='Average')
plt.text(4.6, boroughs['Crashes'].mean()+500, 'Average: {:.0f}'.format(boroughs['Crashes'].mean()))
plt.legend(loc='upper right')
for i, v in enumerate(boroughs['Crashes']):
    plt.text(i-0.15, v+100, v)

for i, v in enumerate(boroughpct):
    plt.text(i-0.15, 600, '{:.1f}%'.format(v*100))
plt.show()

Street¶

The 'street' column contains the name of the street where each incident took place. The entire dataset contains 4,359 unique street names, while 27% of entries (27,081 entries) do not have a specified street name.

Below is a bar graph describing the number of streets that appear a specified number of times. Many streets only appear a few times in the entire dataset, but a good number of them appear over 10 times. Note that the 'More than' bars are cumulative: each one is a subset of every 'More than' bar to its left, while only the 'Less than 10' bar is disjoint from the rest.

In [19]:
streetcounts = crashes['street'].value_counts()

numStreets = streetcounts.shape[0]


print('Number of crashes with no specified street: {}'.format(streetcounts['None']))
print('Total number of unique streets: {}'.format(numStreets))

lowlim = [10, 50, 100, 200, 500, 750, 1000]

streetseries = pd.Series([streetcounts[streetcounts > x].shape[0]-1 for x in lowlim], index=['More than 10', 'More than 50', 'More than 100', 'More than 200', 'More than 500', 'More than 750', 'More than 1000'])
streetseries['Less than 10'] = streetcounts[streetcounts <= 10].shape[0]
streetseries = streetseries[['Less than 10','More than 10', 'More than 50', 'More than 100', 'More than 200', 'More than 500', 'More than 750', 'More than 1000']]
streetseries.plot(kind='bar', color='cyan', edgecolor='black')
streetpct = streetseries / numStreets * 100
plt.title('Number of Streets that have a Specified Number of Crashes')
plt.xlabel('Number of Crashes')
plt.ylabel('Number of Streets')
plt.xticks(rotation=45)
for i, v in enumerate(streetseries):
    plt.text(i-0.2, v+20, v)
for i, v in enumerate(streetpct):
    plt.text(i-0.22, 500, '{:.1f}%'.format(v))
plt.show()
Number of crashes with no specified street: 27081
Total number of unique streets: 4359

Fatality/Injury Data¶

This category covers the deaths and injuries that occur in crashes. It includes all columns with '...killed' and '...injured' in the name. It's important to note that all of these columns are quantitative.

Earlier in the report, a box and whisker plot was made to analyze how many outliers exist in each injury and fatality column. This plot is shown again below. As before, the mean for each column is given on the left of its boxplot, and the number of outliers is given on the right. It shows that any given collision usually results in one injury or none, and almost never results in death. It also seems that there are fewer instances of cyclists and pedestrians being involved in accidents than motorists.

In [20]:
#Outlier Analysis
#Creating a boxplot for the number of injured persons
plt.figure(figsize=(10, 5))
quantitative = crashes[['injured', 'killed', 'pedestrians_injured', 'pedestrians_killed', 'cyclists_injured', 'cyclists_killed', 'motorists_injured', 'motorists_killed']]
numOutliers = (quantitative > quantitative.quantile(0.75) + 1.5*(quantitative.quantile(0.75) - quantitative.quantile(0.25))).sum()
sns.boxplot(quantitative, orient='h', palette='Set2')
for i, v in enumerate(quantitative.mean()):
    plt.text(0.1, i, '{:.2f}'.format(v), va='center')
for i, v in enumerate(numOutliers):
    plt.text(19.8, i, v, ha='right', va='center')
plt.title('Boxplot of the Number of Injured or Killed Persons')
plt.xlabel('Number of Injured Persons')
plt.xticks(np.arange(0, 21, 5))
plt.show()

Total Injuries¶

The 'injured' column contains information on how many people were injured in total during an incident. It is generally just the sum of the pedestrian, cyclist, and motorist 'injured' columns. The bar graph below shows the number of crashes that resulted in a specified number of injuries. It can be seen that more than half of the incidents resulted in no injuries, and a significant number resulted in only one injury. Only a few resulted in more than 3. This could mean that most crashes in New York are smaller collisions.
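The claim that 'injured' is just the sum of the subcategory columns can be spot-checked directly. Below is a minimal sketch of such a check on a toy sample; the column names follow the renaming used throughout this report, and the data here are hypothetical stand-ins for the real frame.

```python
import pandas as pd

# Toy rows mimicking this report's renamed columns (hypothetical data).
crashes = pd.DataFrame({
    'injured': [0, 1, 3],
    'pedestrians_injured': [0, 1, 0],
    'cyclists_injured': [0, 0, 1],
    'motorists_injured': [0, 0, 2],
})

# Sum the subcategory columns and count rows where the total disagrees.
subtotal = (crashes['pedestrians_injured']
            + crashes['cyclists_injured']
            + crashes['motorists_injured'])
mismatches = int((crashes['injured'] != subtotal).sum())
print('Rows where injured != subtotal:', mismatches)  # 0 for this toy sample
```

On the real dataset a handful of mismatches would indicate entry errors rather than a flaw in the check.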

In [21]:
# Count the occurrences of each unique value in the 'injured' column
injured_counts = crashes['injured'].value_counts().sort_index()

# Define the bins for the bar graph
bins = [-1, 0, 1, 2, 4, 6, 8, 10, 20]

# Group the counts into the defined bins
grouped_counts = injured_counts.groupby(pd.cut(injured_counts.index, bins)).sum()

grouped_counts.index = ['0', '1', '2', '3-4', '5-6', '7-8', '9-10', '11-20']

# Plot the bar graph
grouped_counts.plot(kind='bar', color='lightblue', edgecolor='black')

injured_pct = grouped_counts / grouped_counts.sum() * 100

for i, v in enumerate(grouped_counts):
    plt.text(i-0.2, v+500, v)
for i, v in enumerate(injured_pct):
    plt.text(i-0.22, 10000, '{:.1f}%'.format(v))

# Set the labels and title
plt.title('Number of Crashes by Number of Injuries')
plt.xlabel('Number of Injuries')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=0)
# Show the plot
plt.show()

Total Fatalities¶

The 'killed' column is similar to the 'injured' column in that it contains the total number of persons killed during a given incident. The bar graph below shows a similar trend: most incidents resulted in no fatalities, while only a few resulted in more. This is a bit lower than I expected, which is a nice surprise. It also suggests that a majority of incidents in the NYPD system are not serious crashes, but small collisions that are perhaps reported for insurance reasons.

In [22]:
# Calculate the counts of killed
killed_counts = crashes['killed'].value_counts().sort_index()

# Calculate the percentage of killed
killed_pct = (killed_counts / len(crashes)) * 100

# Create a DataFrame for killed counts and percentages
killed_data = pd.DataFrame({'Counts': killed_counts, 'Percentage': killed_pct})

# Plot the counts of killed
plt.figure(figsize=(10, 5))
sns.barplot(x=killed_data.index, y=killed_data['Counts'], color='red', edgecolor='black')
plt.title('Counts of Killed in Crashes')
plt.xlabel('Number of Killed')
plt.ylabel('Counts')

# Enumerate the percentages as text on the graph
for i, count in enumerate(killed_data['Counts']):
    plt.text(i, count, killed_data["Counts"][i], ha='center', va='bottom')
for i, count in enumerate(killed_data['Counts']):
    plt.text(i, 20000, f'{killed_data["Percentage"][i]:.2f}%', ha='center', va='bottom')

plt.show()

Pedestrian Statistics¶

The 'pedestrians_injured' and 'pedestrians_killed' columns are subsets of the 'injured' and 'killed' columns that contain data only on pedestrians injured or killed in a given incident. The distribution in the graphs below is similar to the previous graphs. Note that no accident resulted in more than 2 pedestrian deaths or 6 pedestrian injuries. This could imply that in most New York crashes the cars stay on the road, as a pedestrian being injured or killed would imply that a car ran onto the sidewalk or that the pedestrian was in a crosswalk.

In [23]:
# Create a bar graph for pedestrians injured and pedestrians killed
killed_counts = crashes['pedestrians_killed'].value_counts().sort_index()
injured_counts = crashes['pedestrians_injured'].value_counts().sort_index()
killed_counts = killed_counts.reindex(injured_counts.index, fill_value=0)

plt.figure(figsize=(10, 5))
killed_counts.plot(kind='bar', color='red', edgecolor='black', position=0, width=0.4, label='Pedestrians Killed')
injured_counts.plot(kind='bar', color='blue', edgecolor='black', position=1, width=0.4, label='Pedestrians Injured')
plt.title('Number of Crashes by Pedestrians Injured and Killed')
plt.xlabel('Number of Pedestrians')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=0)
plt.legend(loc='upper right')
for i, v in enumerate(killed_counts):
    plt.text(i+0.05, v+800, v, color = 'red')

for i, v in enumerate(injured_counts):
    plt.text(i-.38, v+880, v, color = 'blue')

plt.show()

Motorist and Cyclist Statistics¶

The 'motorist' and 'cyclist' injury and death columns are also subsets of the 'injured' and 'killed' columns. Below are two bar graphs that display the number of cyclists and motorists injured.

There are many crashes with a high 'motorists_injured' value. Some even reach over 15. For one incident to have so many motorists involved at once, it must not be a "basic" crash. Could NY's infrastructure have failed, or perhaps a dangerous vehicle was at play? The hypothesis that strange circumstances surrounding a crash correlate with high injury and death counts will be explored in the Discussion section.

In [24]:
motor_killed = crashes['motorists_killed'].value_counts().sort_index()
motor_injured = crashes['motorists_injured'].value_counts().sort_index()
cycle_killed = crashes['cyclists_killed'].value_counts().sort_index()
cycle_injured = crashes['cyclists_injured'].value_counts().sort_index()

motor_killed = motor_killed.reindex(motor_injured.index, fill_value=0)
cycle_killed = cycle_killed.reindex(cycle_injured.index, fill_value=0)

fig, ax = plt.subplots(1, 2, figsize=(20, 5))

motor_killed.plot(kind='bar', color='red', edgecolor='black', position=0, width=0.4, label='Motorists Killed', ax=ax[0])
motor_injured.plot(kind='bar', color='blue', edgecolor='black', position=1, width=0.4, label='Motorists Injured', ax=ax[0])
ax[0].set_title('Number of Crashes by Motorists Injured and Killed')
ax[0].set_xlabel('Number of Motorists')
ax[0].set_ylabel('Number of Crashes')
ax[0].legend(loc='upper right')
for i, v in enumerate(motor_killed):
    ax[0].text(i+0.05, v+800, v, color = 'red', rotation=45)
for i, v in enumerate(motor_injured):
    ax[0].text(i-.38, v+880, v, color = 'blue', rotation=45)

cycle_killed.plot(kind='bar', color='red', edgecolor='black', position=0, width=0.4, label='Cyclists Killed', ax=ax[1])
cycle_injured.plot(kind='bar', color='blue', edgecolor='black', position=1, width=0.4, label='Cyclists Injured', ax=ax[1])
ax[1].set_title('Number of Crashes by Cyclists Injured and Killed')
ax[1].set_xlabel('Number of Cyclists')
ax[1].set_ylabel('Number of Crashes')
ax[1].legend(loc='upper right')
for i, v in enumerate(cycle_killed):
    ax[1].text(i+0.05, v+800, v, color = 'red')
for i, v in enumerate(cycle_injured):
    ax[1].text(i-.38, v+880, v, color = 'blue')

plt.show()

Factors and Vehicles¶

This section is concerned with the circumstances surrounding a crash. It details the factors that contributed to the crash, as inferred by the reporting officer, and the models of the vehicles involved.

Contributing Factors¶

The 'factor' columns have data on the factors that may have contributed to an incident. Below is a bar graph showing the most common factors in a crash and a dataframe showing every contributing factor sorted by how often it appears in the dataset. The 'factor1' column is used in the analysis because it is present in the most entries: the 'factor2' and 'factor3' columns follow a distribution similar to 'factor1', just with many more None entries. There are a total of 55 different contributing factors across the entire dataset. The most common factor by far is Driver Inattention, making up over 23% of all incidents.

In [25]:
commonFactors = crashes['factor1'].value_counts().head(10)
commonFactors.plot(kind='bar', color='purple', edgecolor='black')
plt.title('Top 10 Most Common Contributing Factors')
plt.xlabel('Contributing Factor')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=45, ha='right')
plt.axhline(crashes['factor1'].value_counts().mean(), color='red', linestyle='dashdot', label='Average')
plt.text(9.6, crashes['factor1'].value_counts().mean()-200, 'Average: {:.0f}'.format(crashes['factor1'].value_counts().mean()))
plt.legend(loc='upper right')
for i, v in enumerate(commonFactors):
    plt.text(i-0.23, v+300, v)

plt.show()
display( pd.DataFrame(crashes['factor1'].value_counts()).transpose())
factor1 None Driver Inattention/Distraction Failure to Yield Right-of-Way Following Too Closely Passing or Lane Usage Improper Passing Too Closely Unsafe Speed Backing Unsafely Traffic Control Disregarded Other Vehicular Turning Improperly Unsafe Lane Changing Driver Inexperience Alcohol Involvement Reaction to Uninvolved Vehicle Pedestrian/Bicyclist/Other Pedestrian Error/Confusion View Obstructed/Limited Pavement Slippery Aggressive Driving/Road Rage Fell Asleep Brakes Defective Oversized Vehicle Steering Failure Passenger Distraction Outside Car Distraction Obstruction/Debris Lost Consciousness Tire Failure/Inadequate Illnes Pavement Defective Glare Fatigued/Drowsy Failure to Keep Right Driverless/Runaway Vehicle Drugs (illegal) Animals Action Accelerator Defective Cell Phone (hand-Held) Traffic Control Device Improper/Non-Working Physical Disability Tinted Windows Lane Marking Improper/Inadequate Prescription Medication Using On Board Navigation Device Vehicle Vandalism Other Electronic Device Other Lighting Defects Headlights Defective Tow Hitch Defective Eating or Drinking Cell Phone (hands-free) Texting Shoulders Defective/Improper Listening/Using Headphones Windshield Inadequate
count 25226 23864 6905 6617 4564 3831 3666 3116 2894 2709 2290 2078 1991 1580 1363 962 850 820 753 441 412 411 268 246 209 205 195 188 173 142 136 130 110 103 90 88 81 48 44 42 26 25 17 14 12 10 10 9 8 8 6 5 5 2 2

Vehicle Models¶

The 'vehicles' column contains information on the models of the vehicles involved in the crash. Below is a bar graph of the top 10 most common vehicles and a series containing every recorded vehicle type. Most crashes involve a Sedan or an SUV, which makes sense considering those are very common cars. There are also a good number of crashes that involve Bikes and Motorcycles: it could be worthwhile to view subsets of these groups alongside the 'cyclists_injured' and 'motorists_injured' columns.

It's important to note that these values are almost certainly handwritten into the dataset by the reporting NYPD officer: the many misspelled entries like Ambulace, Ambulane, and Fire Engin make that clear. Therefore, the counts here are not completely reliable. Some of the entries are also very strange. For example, there are three entries in the dataset involving a Tank, and one even involves a Freight Train.
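One way to partially mitigate the misspellings is an explicit correction map applied before counting. Below is a minimal sketch; the mapping is a hypothetical example seeded with a few misspellings visible in the output above, and a real cleanup pass would need a much larger, curated map.

```python
import pandas as pd

# Hypothetical correction map for a few misspellings seen in the data;
# keys are raw entries, values are the intended vehicle type.
corrections = {
    'Ambulace': 'Ambulance',
    'Ambulane': 'Ambulance',
    'Ambulence': 'Ambulance',
    'Fire engin': 'Fire engine',
}

# Toy series standing in for a 'vehicle' column (hypothetical data).
vehicles = pd.Series(['Ambulace', 'Sedan', 'Ambulence'])
cleaned = vehicles.replace(corrections)
print(cleaned.tolist())  # ['Ambulance', 'Sedan', 'Ambulance']
```

After such a pass, `value_counts()` would consolidate the corrected spellings into a single count per vehicle type.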

In [26]:
#Summing vehicle counts across all five vehicle columns, keyed to the vehicle1 index
vehiclecounts = crashes['vehicle1'].value_counts()
for col in ['vehicle2', 'vehicle3', 'vehicle4', 'vehicle5']:
    vehiclecounts += crashes[col].value_counts().reindex(vehiclecounts.index, fill_value=0)
vehiclecounts.index = vehiclecounts.index.str.capitalize()
vehiclecounts = vehiclecounts.groupby(level=0).sum()
vehiclecounts = vehiclecounts.sort_values(ascending=False).drop('None')

crashes.replace('Station Wagon/Sport Utility Vehicle', 'SUV', inplace=True)

commonVehicles = vehiclecounts.head(10)
commonVehicles['Other'] = vehiclecounts[10:].sum()
commonVehicles.rename(index={'Station wagon/sport utility vehicle': 'SUV'}, inplace=True)

commonVehicles.plot(kind='bar', color='orange', edgecolor='black')
plt.title('Top 10 Most Common Vehicle Types')
plt.xlabel('Vehicle Type')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=45, ha='right')
plt.legend(loc='upper right')
for i, v in enumerate(commonVehicles):
    plt.text(i-0.23, v+300, v)
plt.show()

display(pd.DataFrame(vehiclecounts).transpose())
vehicle1 Sedan Station wagon/sport utility vehicle Bike Pick-up truck Box truck Taxi Bus E-bike Motorcycle Tractor truck diesel E-scooter Van Ambulance Moped Dump Pk Flat bed Garbage or refuse Convertible Carry all Motorscooter Tow truck / wrecker Motorbike Tractor truck gasoline Chassis cab 4 dr sedan Tanker Fire truck 3-door Trailer Limo Refrigerated van Concrete mixer Armored truck School bus Flat rack Scooter Multi-wheeled vehicle Firetruck Beverage truck Unknown Tow truck Unk Open body Lift boom Truck Stake or rack Pedicab Snow plow Minibike Forklift Bulk agriculture Commercial Ambu Fdny ambul Fdny truck Dump truck Pick up Com Lunch wagon Garbage tr Mta bus Minicycle Fdny 2 dr sedan Van camper Pallet Usps truck Motor scoo Amb Utility Sprinter v Hopper Fdny fire Delivery t Electric s Rv Passenger Pas Power shov Delivery Pickup Util Self insur Delv Pick up tr Usps Nys ambula Fdny engin Street swe Fire Pickup with mounted camper Glass rack Tank Pc Golf cart Suburban Road sweep Mack Nyc sanita Refg Fire engin Fork lift Enclosed body - nonremovable enclosure Ford van Ford Dirt bike Escooter s F550 Nypd van Motor home Ec3 E scooter Motorized Motorized home Self Ups Semi trail Usps mail Skateboard Utility ve Box Boom lift Pick-up tr Van/truck Pickup tru Vms Ambulane Sanitation Work van Econoline Mopad Gas scoote Food cart Mini van Mailtruck Ems Mail truck Mack truck Tl Tr Flatbed Street cle Van wh School bu Subn Verzion va Tk Shuttle bu Suv Sanmen cou Ram White van Work truck Rgr Sw/van Revel scoo Yamaha Street Scooter ga Utility va Utility tr Skywatch Truck comm Tf Uhal Semi Unk box tr Smyellscho Unmarked v Tlr Sedona Us mail tr Us postal Tlc Uspcs Usps posta Usps small Usps vehic Tcn ''lime mope Pumper Citywide Commerical Con ed tru Constructi Crane boom D2 Dent and s Department Dodge Dodge ram Dumpster t E bike uni Ems bus Emt ambula Enclosed body - removable enclosure Engine sp0 Esu rep Excavator Cmix City owned Pump City 12 passage 50cc scoot 994 Ambulace 
Ambulence Ambulette App Asphalt ro Backhoe Camper van Cargo van Carriage Carrier Cat forkli Cater Cement tru Cherv Fdny firet Fdny ladde Fire appar Flatbed pi Motorscoot Moving tru Mta Mtr h Nonmotords Nypd tow t Pas (4dr s Pass Pick wh Pick-up Pkup Police rep Post offic Postal ser Postal tru Pro master Psd Mopd . Mini bus Grumman ll Freight Freight tr Frt Garbage Gas mo ped Gas powere Golf car Horse carr M2 Horse trai Hwh Kick scoot Ladder co Lift Livestock rack Locomotive �mbu
count 82786 60662 4944 3876 3840 3718 3054 2733 1717 1433 1380 1203 1050 686 629 427 361 354 342 261 239 233 218 195 135 97 94 86 59 56 56 53 53 47 46 43 42 40 37 36 36 34 34 32 32 29 26 25 22 21 17 15 15 15 14 14 12 11 11 10 10 9 9 8 8 8 8 8 8 8 8 7 7 7 6 6 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

Number of Vehicles Involved¶

The custom-made 'vehicles_involved' column gives the number of vehicles involved in a crash. Specifically, it counts how many 'vehicle' entries are not None. It can be seen that most incidents involve only 1 or 2 vehicles, but a significant number involve more than that.

Strangely, there are some incidents that involve 0 vehicles. These entries could be errors in the NYPD system, entries that involved road conditions, or perhaps even incidents that were reported after the vehicles had left the scene. They will be investigated in the discussion portion of this report.
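The construction of 'vehicles_involved' is not shown in this section; below is a minimal sketch of how such a column could be derived. The five 'vehicle' column names match this report's renaming, the toy data are hypothetical, and the literal string 'None' is treated as missing to match the description above.

```python
import pandas as pd

# Toy frame with this report's assumed vehicle columns (hypothetical data).
crashes = pd.DataFrame({
    'vehicle1': ['Sedan', 'SUV', None],
    'vehicle2': ['Taxi', None, None],
    'vehicle3': [None, None, None],
    'vehicle4': [None, None, None],
    'vehicle5': [None, None, None],
})

cols = ['vehicle1', 'vehicle2', 'vehicle3', 'vehicle4', 'vehicle5']
# Count entries per row that are neither NaN nor the string 'None'.
crashes['vehicles_involved'] = (
    crashes[cols].notna() & crashes[cols].ne('None')
).sum(axis=1)
print(crashes['vehicles_involved'].tolist())  # [2, 1, 0]
```

The third toy row reproduces the zero-vehicle case discussed above: every vehicle entry is missing, so the count is 0.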

In [27]:
numvehicles = crashes['vehicles_involved'].value_counts()
numvehicles.sort_index(inplace=True)
numvehicles.plot(kind='bar', color='lightgreen', edgecolor='black')
plt.title('Number of Crashes by Number of Vehicles Involved')
plt.xlabel('Number of Vehicles')
plt.ylabel('Number of Crashes')
plt.xticks(rotation=0)
plt.axhline(numvehicles.mean(), color='red', linestyle='dashdot', label='Average')
plt.text(4.6, numvehicles.mean(), 'Average: {:.0f}'.format(numvehicles.mean()))
plt.legend(loc='upper right')
for i, v in enumerate(numvehicles):
    plt.text(i-0.15, v+500, v)
plt.show()

Interesting Correlations¶

The following chart breaks down, by borough, how many crashes have a specified number of injuries. The percentages tell us what percent of crashes in that borough had that many injuries.

While the distributions are generally the same across each borough, it's important to note the slight differences between them. For example, Staten Island has the highest ratio of non-injury crashes, while there are more crashes in Brooklyn with one or two injuries than in other boroughs. Notice as well that crashes with more than 10 injuries happened only in the Bronx and Staten Island. Finally, for some reason, crashes with no specified borough tend to have more injuries than those with boroughs: the percentage of 2-injury crashes in None is 6.55%, which is significantly higher than in any borough.

In [28]:
boroughinjuries = crashes.groupby('borough')['injured'].value_counts(bins= [-1, 0, 1, 2, 4, 6,8, 10, 20], sort=False).fillna(0)
plt.subplots(2, 3, figsize=(20, 10))


for i, b in enumerate(crashes['borough'].unique()):
    totalforborough = crashes[crashes['borough'] == b].shape[0]
    plt.subplot(2, 3, i+1)
    boroughinjuries[b].plot(kind='bar', color='lightblue', edgecolor='black', label='Injured')
    plt.title('Number of Crashes by Number of Injuries in {}'.format(b))
    plt.xlabel('Number of Injuries')
    plt.ylabel('Number of Crashes')
    plt.xticks(rotation=0)
    plt.gca().set_xticklabels(['0', '1', '2', '3-4', '5-6', '7-8', '9-10', '11-20'])
    plt.legend(loc='upper right')
    # plotting total crashes per borough
    plt.text(4.8, totalforborough*0.3, 'Total Crashes: {}'.format(totalforborough))
    for j, v in enumerate(boroughinjuries[b]):
        plt.text(j-0.25, v+80 - (60 if b == "STATEN ISLAND" else 0), '{:.2f}%'.format(v/totalforborough*100))

plt.show()

The next chart compares how many injuries happen at different times of day. It divides the dataset into Night (12 AM - 6 AM), Morning (6 AM - 12 PM), Afternoon (12 PM - 6 PM), and Evening (6 PM - 12 AM) and plots them individually, using the same scheme as the borough chart above.

Notice that many more crashes with zero injuries happen in the morning than during the evening or at night. It seems like the evening is the most dangerous time of day to get into an accident, while the morning is the least likely to result in injury. However, there are significantly fewer total crashes at night. This is probably because there are fewer cars on the road in the middle of the night than during rush hour in the morning or the evening.
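The derivation of the 'timeofday' column is not shown here; one way to build it from the crash hour is with `pd.cut` over the four six-hour windows listed above. This is a sketch on hypothetical toy hours, and the column name matches this report's convention.

```python
import pandas as pd

# Toy crash hours (hypothetical); the real values would be parsed
# from the 'crash_time' column.
hours = pd.Series([2, 9, 14, 21])

# Bin into Night (12AM-6AM), Morning (6AM-12PM),
# Afternoon (12PM-6PM), and Evening (6PM-12AM).
timeofday = pd.cut(
    hours,
    bins=[-1, 5, 11, 17, 23],
    labels=['Night', 'Morning', 'Afternoon', 'Evening'],
)
print(timeofday.tolist())  # ['Night', 'Morning', 'Afternoon', 'Evening']
```

Because `pd.cut` bins are right-inclusive, the -1 lower edge lets hour 0 fall into Night.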

In [29]:
timeofdayinjuries = crashes.groupby('timeofday')['injured'].value_counts(bins= [-1, 0, 1, 2, 4, 6,8, 10, 20], sort=False).fillna(0)
plt.subplots(2, 2, figsize=(20, 10))

for i, t in enumerate(crashes['timeofday'].unique()):
    totalfortime = crashes[crashes['timeofday'] == t].shape[0]
    plt.subplot(2, 2, i+1)
    timeofdayinjuries[t].plot(kind='bar', color='lightgreen', edgecolor='black', label='Injured')
    plt.title('Number of Crashes by Number of Injuries in the {}'.format(t))
    plt.xlabel('Number of Injuries')
    plt.ylabel('Number of Crashes')
    plt.xticks(rotation=0)
    plt.gca().set_xticklabels(['0', '1', '2', '3-4', '5-6', '7-8', '9-10', '11-20'])
    plt.legend(loc='upper right')
    plt.text(5.8, totalfortime*0.3, 'Total Crashes: {}'.format(totalfortime), ha = 'left')
    for j, v in enumerate(timeofdayinjuries[t]):
        plt.text(j-0.25, v+80, '{:.2f}%'.format(v/totalfortime*100))
plt.show()

Discussion¶

Question 1:¶

Do the contributing factors, vehicles, and number of vehicles involved in an incident allow us to predict the number of injuries and deaths in that entry?

Essentially, this question asks whether, if a crash involves a dangerous vehicle, has an uncommon set of contributing factors, or involves more vehicles than average, the injury and death counts will reflect that. This question aims to predict a quantitative value.

Hypothesis: I expect to see that crashes with strange vehicles, unusual contributing factors, and high vehicle numbers correlate with higher injury and death counts. For one, if a crash involves more vehicles, then there should be a higher chance of drivers or pedestrians getting injured. Specifically, I expect that incidents involving trucks, motorcycles, and other "dangerous" vehicles will result in more motorist injuries.

Graphical analysis¶

The following code creates small, 1,000-entry samples of crashes involving "Dangerous" vehicles and "Safe" vehicles, then plots the average number of injuries and fatalities in each sample. The "Dangerous" vehicle samples include trucks, e-bikes, and motorcycles: these are generally thought to be more dangerous to ride than most vehicles. The "Safe" vehicle samples include sedans, SUVs, and taxis.

This graph shows us that, on average, crashes involving motorcycles and e-bikes have more injuries and deaths than those involving normal vehicles like sedans and SUVs. The ratio of deaths to injuries in the motorcycle sample is significantly higher than in the other samples as well.

In [30]:
#getting samples of data involving "Dangerous" and "Safe" vehicles
#a helper mask avoids repeating str.contains across all five vehicle columns
#na=False treats any missing vehicle entries as non-matches
def involves(keyword):
    mask = pd.Series(False, index=crashes.index)
    for c in ['vehicle1', 'vehicle2', 'vehicle3', 'vehicle4', 'vehicle5']:
        mask |= crashes[c].str.contains(keyword, na=False)
    return mask

#"Dangerous" vehicles: trucks, e-bikes, and motorcycles
truckssample = crashes[involves('Truck')].sample(1000)
ebikesample = crashes[involves('E-Bike')].sample(1000)
motorcyclesample = crashes[involves('Motorcycle')].sample(1000)

#"Safe" vehicles: sedans, SUVs, and taxis
sedansample = crashes[involves('Sedan')].sample(1000)
suvsample = crashes[involves('SUV')].sample(1000)
taxisample = crashes[involves('Taxi')].sample(1000)


#plotting average injury and killed counts for each vehicle type, dividing based on safety
plt.figure(figsize=(10, 5))
plt.bar('Truck', truckssample['injured'].mean(), color='blue', label='Dangerous-Injured')
plt.bar('Truck', truckssample['killed'].mean(), color='red', label='Dangerous-Killed')
plt.bar('E-Bike', ebikesample['injured'].mean(), color='blue')
plt.bar('E-Bike', ebikesample['killed'].mean(), color='red')
plt.bar('Motorcycle', motorcyclesample['injured'].mean(), color='blue')
plt.bar('Motorcycle', motorcyclesample['killed'].mean(), color='red')
plt.bar('Sedan', sedansample['injured'].mean(), color='green', label = 'Safe-Injured')
plt.bar('Sedan', sedansample['killed'].mean(), color='purple', label = 'Safe-Killed')
plt.bar('SUV', suvsample['injured'].mean(), color='green')
plt.bar('SUV', suvsample['killed'].mean(), color='purple')
plt.bar('Taxi', taxisample['injured'].mean(), color='green')
plt.bar('Taxi', taxisample['killed'].mean(), color='purple')
plt.title('Average Number of Injuries and Deaths by Vehicle Type')
plt.xlabel('Vehicle Type')
plt.ylabel('Number of Persons')
plt.xticks(rotation=0)
plt.legend(loc='upper right')
plt.show()
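A side note on the plotting approach above: the repeated `plt.bar` calls draw the injured and killed bars at the same x position, relying on the much smaller killed bar remaining visible on top of the larger injured bar. A more explicit alternative is a true stacked bar using the `bottom=` parameter. A minimal sketch, with made-up averages purely for illustration (not computed from the dataset):

```python
import matplotlib.pyplot as plt

# Hypothetical average counts, for illustration only (not taken from the data)
labels = ['Truck', 'E-Bike', 'Motorcycle']
injured = [0.35, 0.90, 0.80]
killed = [0.004, 0.003, 0.010]

plt.figure(figsize=(10, 5))
# Draw the injured bars first, then stack the killed bars on top via bottom=
plt.bar(labels, injured, color='blue', label='Injured')
plt.bar(labels, killed, bottom=injured, color='red', label='Killed')
plt.legend(loc='upper right')
plt.show()
```

With stacking, the killed segment is always visible regardless of its size relative to the injured segment.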

This next graph looks at the average injury and death rates in samples of crashes with different contributing factors. Crashes involving falling asleep at the wheel or alcohol are marked as "Dangerous", while those involving driver inattention or backing incorrectly are marked as "Safe" by comparison. This graph follows the same trend as the last: the more dangerous factors correspond to higher injury counts and, in the case of alcohol involvement, higher death counts.

In [31]:
#helper: sample crashes where any of the three factor columns matches a keyword
#na=False treats missing factor entries as non-matches instead of propagating NaN
def sample_by_factor(keyword, n=300):
    mask = pd.Series(False, index=crashes.index)
    for col in ['factor1', 'factor2', 'factor3']:
        mask |= crashes[col].str.contains(keyword, na=False)
    return crashes[mask].sample(n)

#"Dangerous" factors: asleep at the wheel and alcohol involvement
sleepsample = sample_by_factor('Asleep')
alcoholsample = sample_by_factor('Alcohol')

#"Safe" factors by comparison: driver inattention and backing incorrectly
inattentionsample = sample_by_factor('Inattention')
backingsample = sample_by_factor('Backing')

#plotting average injury and killed counts for each factor, dividing based on safety
plt.figure(figsize=(10, 5))
plt.bar('Sleeping', sleepsample['injured'].mean(), color='blue', label='Dangerous-Injured')
plt.bar('Sleeping', sleepsample['killed'].mean(), color='red', label='Dangerous-Killed')
plt.bar('Alcohol', alcoholsample['injured'].mean(), color='blue')
plt.bar('Alcohol', alcoholsample['killed'].mean(), color='red')
plt.bar('Inattention', inattentionsample['injured'].mean(), color='green', label='Safe-Injured')
plt.bar('Inattention', inattentionsample['killed'].mean(), color='purple', label='Safe-Killed')
plt.bar('Backing', backingsample['injured'].mean(), color='green')
plt.bar('Backing', backingsample['killed'].mean(), color='purple')
plt.title('Average Number of Injuries and Deaths by Contributing Factor')
plt.xlabel('Contributing Factor')
plt.ylabel('Number of Persons')
plt.xticks(rotation=0)
plt.legend(loc='upper right')
plt.show()

This graph does the same thing as the last, but divides the samples by how many vehicles were involved. Strangely, some entries record zero vehicles, even though this shouldn't be possible. Even stranger, the zero-vehicle sample has a higher average injury count than the one- and two-vehicle samples. Apart from that, the average injury count appears to scale roughly linearly with the number of vehicles: more vehicles involved corresponds to more injuries on average.
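The 'vehicles_involved' column is not part of the raw API export, so it was presumably derived earlier in the notebook. One plausible construction (an assumption, not necessarily the one used here) is to count the non-missing vehicle-type columns per row, which would also explain the zero-vehicle rows: a crash report with all five vehicle fields left blank would count as zero.

```python
import pandas as pd

def add_vehicles_involved(df):
    # Count how many of the five vehicle-type columns are filled in on each row.
    # This is a hypothetical reconstruction of how 'vehicles_involved' may be derived.
    cols = ['vehicle1', 'vehicle2', 'vehicle3', 'vehicle4', 'vehicle5']
    df['vehicles_involved'] = df[cols].notna().sum(axis=1)
    return df

# Tiny illustrative frame: one two-vehicle crash, one all-blank report, one single vehicle
sample = pd.DataFrame({
    'vehicle1': ['Sedan', None, 'Taxi'],
    'vehicle2': ['SUV', None, None],
    'vehicle3': [None, None, None],
    'vehicle4': [None, None, None],
    'vehicle5': [None, None, None],
})
print(add_vehicles_involved(sample)['vehicles_involved'])  # counts 2, 0, 1
```

Under this construction, a "zero-vehicle" crash is likely a data-entry artifact rather than a physically impossible event.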

In [32]:
#getting samples of different vehicles involved values
zerovehicles = crashes[crashes['vehicles_involved'] == 0].sample(800)
onevehicle = crashes[crashes['vehicles_involved'] == 1].sample(800)
twovehicles = crashes[crashes['vehicles_involved'] == 2].sample(800)
threevehicles = crashes[crashes['vehicles_involved'] == 3].sample(800)
fourvehicles = crashes[crashes['vehicles_involved'] == 4].sample(800)
fivevehicles = crashes[crashes['vehicles_involved'] == 5].sample(800)

#plotting average injury and killed counts for each number of vehicles involved
plt.figure(figsize=(10, 5))
plt.bar('0', zerovehicles['injured'].mean(), color='blue', label='Injured')
plt.bar('0', zerovehicles['killed'].mean(), color='red', label='Killed')
plt.bar('1', onevehicle['injured'].mean(), color='blue')
plt.bar('1', onevehicle['killed'].mean(), color='red')
plt.bar('2', twovehicles['injured'].mean(), color='blue')
plt.bar('2', twovehicles['killed'].mean(), color='red')
plt.bar('3', threevehicles['injured'].mean(), color='blue')
plt.bar('3', threevehicles['killed'].mean(), color='red')
plt.bar('4', fourvehicles['injured'].mean(), color='blue')
plt.bar('4', fourvehicles['killed'].mean(), color='red')
plt.bar('5', fivevehicles['injured'].mean(), color='blue')
plt.bar('5', fivevehicles['killed'].mean(), color='red')
plt.title('Average Number of Injuries and Deaths by Number of Vehicles Involved')
plt.xlabel('Number of Vehicles')
plt.ylabel('Number of Persons')
plt.xticks(rotation=0)
plt.legend(loc='upper right')
plt.show()

kNN Model Test¶

To fully explore the possibility of a correlation between the circumstances surrounding a crash and its injury count, I trained a kNN model to predict the injury count using only the contributing factors and vehicle types of the crash. The model's accuracy is just under 70%, which isn't perfect, but is high enough to suggest a meaningful relationship between these columns (though it should be weighed against a majority-class baseline, since most crashes involve zero injuries).

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Step 1: Prepare the data
features = crashes[['factor1', 'factor2', 'factor3', 'vehicle1', 'vehicle2', 'vehicle3', 'vehicle4','vehicle5', 'vehicles_involved']]
target = crashes['injured']  # the raw injury count for each crash

# Step 2: Encode categorical variables
encoder = OneHotEncoder()
features_encoded = encoder.fit_transform(features)

# Step 3: Split the data
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2, random_state=6602)

# Step 4: Train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
Accuracy: 0.68625
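An accuracy near 69% is hard to interpret on its own: if most crashes involve zero injuries, always predicting the majority class may already score close to that. The comparison can be sketched with scikit-learn's DummyClassifier. The class distribution below is synthetic and purely illustrative, not taken from the dataset:

```python
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
# Synthetic target mimicking a skewed injury-count distribution (illustrative only)
y = rng.choice([0, 1, 2], size=1000, p=[0.7, 0.25, 0.05])
X = np.zeros((1000, 1))  # features are ignored by the 'most_frequent' strategy

# Baseline that always predicts the most common class in the training data
baseline = DummyClassifier(strategy='most_frequent').fit(X, y)
print('Majority-class baseline accuracy:', accuracy_score(y, baseline.predict(X)))
```

If the real baseline on this dataset were close to the kNN's 69%, the model would be adding little beyond "predict zero injuries"; the gap between the two is the meaningful quantity.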

Conclusion:¶

Overall, this data supports the hypothesis that certain contributing factors and vehicle types allow a reasonable inference about the number of injuries in a given crash. Specifically, the evidence suggests that incidents involving dangerous factors, unsafe vehicle types, and more vehicles tend to result in more injuries.

Question 2:¶

Does the number of injuries, deaths, and how many vehicles were involved in an incident allow us to predict the time that a crash occurred?

"Predicting the time a crash occurred" here means predicting the time of day of the crash, i.e. the entry in the categorical 'timeofday' column. The earlier time analysis section found a correlation between more cars being on the road and more crashes occurring. This question takes that observation further and asks whether higher numbers of injuries and deaths occur at different times.
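The 'timeofday' column is not in the raw API export, so it was presumably created earlier in the notebook from 'crash_time'. One way such a column might be built is shown below; the bin edges (Night 0-5, Morning 6-11, Afternoon 12-17, Evening 18-23) and the `add_timeofday` helper name are assumptions for illustration, not necessarily the ones used in this analysis.

```python
import pandas as pd

def add_timeofday(df):
    # Parse the hour out of crash_time strings like "14:35" (assumed format)
    hours = pd.to_datetime(df['crash_time'], format='%H:%M').dt.hour
    # Assumed bins: Night 0-5, Morning 6-11, Afternoon 12-17, Evening 18-23
    df['timeofday'] = pd.cut(hours, bins=[-1, 5, 11, 17, 23],
                             labels=['Night', 'Morning', 'Afternoon', 'Evening'])
    return df

sample = pd.DataFrame({'crash_time': ['2:15', '8:40', '13:05', '21:30']})
print(add_timeofday(sample))  # Night, Morning, Afternoon, Evening
```

`pd.cut` with explicit edges keeps the mapping deterministic, and the resulting column is categorical, which is what a classifier over four time-of-day groups needs.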

Hypothesis: I expect crashes with more vehicles involved and higher injury rates to correlate with times when more cars are on the road. Specifically, I think the morning and afternoon rush hours will have higher rates of injuries and deaths. Additionally, I expect incidents at night or in the early morning to involve only one or two cars and to have much lower injury and death rates than other times.

Graphical Analysis¶

Below is a matrix of pie charts representing the time-of-day distribution of crashes with specified numbers of injuries. In general, incidents happen more often in the afternoon than at any other time of day. However, the way the distribution shifts as the injury count rises reveals interesting trends. For example, low-injury crashes happen more often in the morning than high-injury crashes do. Also, the most dangerous crashes happen disproportionately in the evening, a period that accounts for a much smaller share of the zero-injury crashes.

In [34]:
#Making pie charts based on injury data and time of day
samples = {
    '0 Injuries': crashes[crashes['injured'] == 0].sample(800),
    '1 Injury': crashes[crashes['injured'] == 1].sample(800),
    '2-3 Injuries': crashes[crashes['injured'].isin([2, 3])].sample(800),
    '4 or More Injuries': crashes[crashes['injured'] >= 4].sample(800),
}

plt.figure(figsize=(10, 10))
for i, (label, sample) in enumerate(samples.items(), start=1):
    plt.subplot(2, 2, i)
    sample['timeofday'].value_counts().sort_index().plot(
        kind='pie', autopct='%1.1f%%',
        colors=['lightblue', 'lightgreen', 'lightcoral', 'lightyellow'])
    plt.title(f'Time of Day for Crashes with {label}')
    plt.ylabel('')
    if i == 3:
        plt.legend(loc='lower left')
plt.show()

kNN Model Test¶

This kNN model is a final test of whether the correlation is strong enough to reliably predict the time of day from the injury and death counts alone. Sadly, the model performed poorly, with an accuracy of about 24%. That is no better than randomly guessing among the four groups, which would yield 25% on average. So although the graphs above show some relationship between injury count and time of day, the link is not strong enough to predict one from the other.

In [35]:
features = crashes[['injured', 'motorists_injured', 'killed','motorists_killed', 'vehicles_involved']]
target = crashes['timeofday']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
Accuracy: 0.2356

Conclusion:¶

Although high-injury crashes clearly happen more often in the evening than in the morning, the connection between injury count and time of day does not appear strong enough to support reliable prediction.

Conclusion¶

This dataset revealed many interesting trends and correlations that seemed strange at first but made sense after further investigation. The proportions of car crashes occurring at different times of day and on different days were measured and hypothesized about. It was also found that certain circumstances surrounding a crash correlate strongly with how many injuries occur in it. Overall, this project has opened my eyes to the countless nuances of the car crashes that happen across the boroughs of New York City.

Dataset : https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95/about_data