import pandas as pd
from IPython.display import display, HTML
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
import datapane as dp
import altair as alt
import numpy as np
from datetime import date, time

Where is the deadliest place for cyclist in NYC?? And Why??

Big picture of collision incidents on the streets.

All the datasets came from NYC Open datawebsite

df_collision_crashes = pd.read_csv('Motor_Vehicle_Collisions_-_Crashes.csv',low_memory=False)
display(df_collision_crashes.head())

display(df_collision_crashes.shape)

(1818612, 29)

display(df_collision_crashes.info(show_counts=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1818612 entries, 0 to 1818611
Data columns (total 29 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   CRASH DATE                     1818612 non-null  object 
 1   CRASH TIME                     1818612 non-null  object 
 2   BOROUGH                        1257468 non-null  object 
 3   ZIP CODE                       1257252 non-null  object 
 4   LATITUDE                       1605282 non-null  float64
 5   LONGITUDE                      1605282 non-null  float64
 6   LOCATION                       1605282 non-null  object 
 7   ON STREET NAME                 1448206 non-null  object 
 8   CROSS STREET NAME              1168335 non-null  object 
 9   OFF STREET NAME                278037 non-null   object 
 10  NUMBER OF PERSONS INJURED      1818594 non-null  float64
 11  NUMBER OF PERSONS KILLED       1818581 non-null  float64
 12  NUMBER OF PEDESTRIANS INJURED  1818612 non-null  int64  
 13  NUMBER OF PEDESTRIANS KILLED   1818612 non-null  int64  
 14  NUMBER OF CYCLIST INJURED      1818612 non-null  int64  
 15  NUMBER OF CYCLIST KILLED       1818612 non-null  int64  
 16  NUMBER OF MOTORIST INJURED     1818612 non-null  int64  
 17  NUMBER OF MOTORIST KILLED      1818612 non-null  int64  
 18  CONTRIBUTING FACTOR VEHICLE 1  1813434 non-null  object 
 19  CONTRIBUTING FACTOR VEHICLE 2  1556895 non-null  object 
 20  CONTRIBUTING FACTOR VEHICLE 3  123965 non-null   object 
 21  CONTRIBUTING FACTOR VEHICLE 4  27014 non-null    object 
 22  CONTRIBUTING FACTOR VEHICLE 5  7134 non-null     object 
 23  COLLISION_ID                   1818612 non-null  int64  
 24  VEHICLE TYPE CODE 1            1808784 non-null  object 
 25  VEHICLE TYPE CODE 2            1508860 non-null  object 
 26  VEHICLE TYPE CODE 3            120369 non-null   object 
 27  VEHICLE TYPE CODE 4            26216 non-null    object 
 28  VEHICLE TYPE CODE 5            6943 non-null     object 
dtypes: float64(4), int64(7), object(18)
memory usage: 402.4+ MB

None

There are no any na value in CRASH DATE

df_collision_crashes['CRASH DATE'].isna().sum()

0

Let's transform CRASH DATA column into datetime format in order to easily manipulate data.

df_collision_crashes['CRASH DATE'] = pd.to_datetime(df_collision_crashes['CRASH DATE'])
df_collision_crashes.head()

Observe overall crashes accidents over these years

Add 'year' column

df_collision_crashes['year'] = df_collision_crashes['CRASH DATE'].dt.year
df_collision_crashes.head()

From the plot below, it seems like crash incidents increase over time, and there was a drastic surge between 2012 to 2013. Unsurprisingly, crash incidents suddenly decrease, it might be due to the COVID-19 pandemic.

ax = sns.countplot(data=df_collision_crashes, x='year')
ax.set(ylabel='crash counts')

[Text(0, 0.5, 'crash counts')]

Delve into the crashes that have injuries/fatalities of cyclists

Read another datasets

#collapse_show
df_bike_counts = pd.read_csv('Bicycle_Counts.csv',low_memory=False)
display(df_bike_counts.head())
display(df_bike_counts.shape)
df_bike_counters = pd.read_csv('Bicycle_Counters.csv',low_memory=False)
display(df_bike_counters.head())
display(df_bike_counters.shape)

(3950246, 5)

(23, 9)

Given overall number of cyclists also increase, it makes sense that crash incidents increased as well. However, it is interesting to observer that the number of overall cyclists did not decrease proportionally between 2019 to 2020 after the breakout of COVID-19, it is a point worth another investigation.

df_bike_counts['date'] = pd.to_datetime(df_bike_counts['date'])
df_bike_counts['year'] = df_bike_counts['date'].dt.year
df_bike_totalcounts = df_bike_counts.groupby(by=['year']).sum().reset_index()[['counts', 'year']]
df_bike_totalcounts['year'] = df_bike_totalcounts['year'].map(str)
ax = sns.barplot(data=df_bike_totalcounts, x='year', y='counts')
ax.set(ylabel='Bike counts')

ax = sns.barplot(data=df_bike_totalcounts, x='year', y='counts')
ax.set(ylabel='Bike counts')

[Text(0, 0.5, 'Bike counts')]

Observe how many percentage of cyclists suffer injuries and fatalities over all of victims

df_bike_counts.head()

casualties_plus_year_cols = ['NUMBER OF CYCLIST INJURED',
                              'NUMBER OF CYCLIST KILLED',
                             'NUMBER OF PERSONS INJURED',
                              'NUMBER OF PERSONS KILLED', 
                              'NUMBER OF PEDESTRIANS INJURED',
                             'NUMBER OF PEDESTRIANS KILLED',
                             'NUMBER OF MOTORIST INJURED',
                             'NUMBER OF MOTORIST KILLED'
                            ]

df_casualties = df_collision_crashes.groupby(by=[df_collision_crashes.year]).sum()[casualties_plus_year_cols]
df_casualties['year'] = df_casualties.index.map(str)
df_casualties = df_casualties.reset_index(drop=True)
df_casualties['NUMBER of TOTAL CASUALTIES'] = df_casualties[['NUMBER OF PERSONS INJURED',	'NUMBER OF PERSONS KILLED']].sum(axis=1)
df_casualties['NUMBER of TOTAL CYCLISTS CASUALTIES'] = df_casualties[['NUMBER OF CYCLIST INJURED',	'NUMBER OF CYCLIST KILLED']].sum(axis=1)
df_casualties['NUMBER of TOTAL PEDESTRIANS CASUALTIES'] = df_casualties[['NUMBER OF PEDESTRIANS INJURED',	'NUMBER OF PEDESTRIANS KILLED']].sum(axis=1)
df_casualties['NUMBER of TOTAL MOTORIST CASUALTIES'] = df_casualties[['NUMBER OF MOTORIST INJURED',	'NUMBER OF MOTORIST KILLED']].sum(axis=1)


# df_casualties.drop(columns=casualties_plus_year_cols[2:], inplace=True)
df_casualties.head()

df_bike_totalcounts.head()

# df_casualties merge with df_bike_totalcounts
df_casualties = df_casualties.merge(df_bike_totalcounts, on='year', how='right')
df_casualties

2019 is clearly the deadliest year for cyclists. However, given growing number of injured cyclists, it is still not clear why there is drastic change of fatal accidents in both 2018 and 2019. And even though 2020 had less fatal incidents than 2019, more injuries actually arose in 2020.

f, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))
sns.barplot(data=df_casualties, x='year', y='NUMBER OF CYCLIST INJURED', ax=axes[0])
sns.barplot(data=df_casualties, x='year', y='NUMBER OF CYCLIST KILLED', ax=axes[1])
plt.show()
# ax.set(ylabel='Bike counts')

The figure below show how many percentage of people are cyclists among all the injuries and fatalities.

from pdb import set_trace
def annotate_percentage(axis, df):
    for i, p in enumerate(axis.patches):
        if i > 9 and i < 20:
            percentage = '{:.1f}%'.format(100 * df['NUMBER of TOTAL CYCLISTS CASUALTIES'][i - 10] / df['NUMBER of TOTAL CASUALTIES'][i - 10])
            x = p.get_width()
            y = p.get_y() + p.get_height() - 0.2
            axis.annotate(percentage, (x, y), size = 200)
        else:
            continue
        
    
sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(130, 80))

# Plot the total crashes
sns.set_color_codes("pastel")
sns.barplot(x="NUMBER of TOTAL CASUALTIES", y="year", data=df_casualties,
            label="Total Casualties", color="b")
# Plot the crashes where cyclists were victims
sns.set_color_codes("muted")
sns.barplot(x="NUMBER of TOTAL CYCLISTS CASUALTIES", y="year", data=df_casualties,
            label="Total Cyclists Casualties", color="b")
annotate_percentage(ax, df_casualties)

# Add a legend and informative axis label
ax.legend(ncol=2, loc="upper right", frameon=True, fontsize='120')
ax.set(xlim=(0, 65000), ylabel="years",
       xlabel="Number of Casualties")
ax.set_ylabel("years", fontsize=200)
ax.set_xlabel("Number of Casualties", fontsize=200)
ax.tick_params(axis='x', labelsize=80)
ax.tick_params(axis='y', labelsize=80)


sns.despine(left=True, bottom=True)

Comparison of change rate between bike counts and injuries/fatalities of cyclists over these years

In the plot below, we ignore data from 2012 and 2021 because this dataset is not yet completely collected at the time of data collection.

Conclusion: Casualties trend roughly follow the change of bike counts all around new york city. Essentially, the more new yorkers biking on the street more casualties of cyclists produced. And the most peculiar thing is that the number of cyclists casualties still rise even when the bike counts decrease in 2019. That is how the issue of cyclists safety in new york city get noticed by news coverage and city government even consider the idea of licensing of biking in the city.

plt.figure(figsize=(15, 10))
sns.set_theme(style="whitegrid")
ax = sns.barplot(data=df_casualties[(df_casualties.year != '2012') & (df_casualties.year != '2021')], x="year", y='counts', color='g')
ax.set_ylabel('Bike Counts', color='g', fontsize=20)
ax.set_xlabel('year', fontsize=20)


ax2 = plt.twinx()
sns.lineplot(data=df_casualties[(df_casualties.year != '2012') & (df_casualties.year != '2021')], x="year", y='NUMBER of TOTAL CYCLISTS CASUALTIES', color='r', ax=ax2)
ax2.set_ylabel('Number of Cyclists Casualties', color='r', fontsize=20)
ax2.spines['right'].set_color('r')
ax2.spines['right'].set_linewidth(3)
ax2.spines['left'].set_color('g')
ax2.spines['left'].set_linewidth(3)


# sns.despine(left=True, bottom=True)

Breaking down which borough have highest casualties over these years

print(f'There are roughly {(100 * df_collision_crashes.BOROUGH.isna().sum() / len(df_collision_crashes.BOROUGH)):.2f}% missing instances in BOUROUGH column')
display(df_collision_crashes.groupby(by=['BOROUGH']).sum()[casualties_plus_year_cols].style.set_caption("Sum of Casualties by Bourough from 2012 to 2021").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('font-size', '16px')
    ]
}]))
print('======================================================================')
display(df_collision_crashes.groupby(by=['BOROUGH', 'year']).sum()[casualties_plus_year_cols].style.set_caption("Sum of Casualties by Bourough and Years(2012 to 2021)").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('font-size', '16px')
    ]
}]))

There are roughly 30.86% missing instances in BOUROUGH column

======================================================================

# df_cyclists_borough_yearly = df_collision_crashes.groupby(by=['BOROUGH', 'year']).sum()[casualties_plus_year_cols].reset_index()[['NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED', 'year', 'BOROUGH']]
df_cyclists_borough_yearly = df_collision_crashes.groupby(by=['BOROUGH', 'CRASH DATE']).sum()[casualties_plus_year_cols].reset_index()[['NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED', 'CRASH DATE', 'BOROUGH']]
df_cyclists_borough_yearly['Total Casualties of Cyclists'] = df_cyclists_borough_yearly[['NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED']].sum(axis=1)
df_cyclists_borough_yearly.head()

Overall, BROOKLYN has the highest casualities

alt.data_transformers.disable_max_rows()
plot1 = alt.Chart(df_cyclists_borough_yearly).mark_bar().encode(
    x='year(CRASH DATE):O',
    y='sum(Total Casualties of Cyclists):Q',
    color='BOROUGH:N',
).facet(
    facet='BOROUGH:N',
    columns=3
)

dp.Report(
    dp.Plot(plot1),
).upload(name="Plot1")

HTML('<iframe src="https://datapane.com/u/jason0/reports/M7bQOm3/plot1/embed/" width="60%" height="540px" style="border: none;">IFrame not supported</iframe>')

plot2 = alt.Chart(df_cyclists_borough_yearly).mark_bar().encode(
    x='year(CRASH DATE):O',
    y='sum(NUMBER OF CYCLIST KILLED):Q',
    color='BOROUGH:N',
).facet(
    facet='BOROUGH:N',
    columns=3
)

dp.Report(
    dp.Plot(plot2),
).upload(name="Plot2")

HTML('<iframe src="https://datapane.com/u/jason0/reports/q34OzGA/plot2/embed/" width="60%" height="540px" style="border: none;">IFrame not supported</iframe>')

In conclusion, Brooklyn take up a significant proportion of casualties both in injuries and deaths. All of boroughs slightly show peaks between 2014 to 2017 and 2019 to 2020.

Breaking down at which month/day/time casualties concentrate

death_cols = ['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5',
       'year', 'ZIP CODE']
df_casualties_with_time = df_collision_crashes[df_collision_crashes[['NUMBER OF CYCLIST KILLED', 'NUMBER OF CYCLIST INJURED']].sum(axis=1) != 0][death_cols]#.reset_index(drop=True)
df_casualties_with_time['CRASH DATE'] = pd.to_datetime(df_casualties_with_time[['CRASH DATE', 'CRASH TIME']].apply(lambda x: ' '.join(x.values.astype(str)), axis="columns"))
df_casualties_with_time_location = df_casualties_with_time.copy()
df_casualties_with_time.drop(columns=['CRASH TIME', 'LOCATION', 'LATITUDE', 'LONGITUDE'], inplace=True)
df_casualties_with_time.head()

Month versus Year distribution of Injured Cyclists

First, analyze the most dangerous borough to cycling, that is Brooklyn.

plot3 = alt.Chart(df_casualties_with_time).mark_circle().encode(
    alt.X('month(CRASH DATE):O'),
    alt.Y('year(CRASH DATE):O'),
    size=alt.Size('sum(NUMBER OF CYCLIST INJURED):Q', legend=alt.Legend(title='Sum of Injured Cyclists')),
    color='BOROUGH:N',
#     column='BOROUGH:N'
    facet=alt.Facet('BOROUGH:N', columns=2)
).properties(
    title='Month versus Year distribution of Injured Cyclists')

dp.Report(
    dp.Plot(plot3),
).upload(name="Plot3")

HTML('<iframe src="https://datapane.com/u/jason0/reports/dA68E47/plot3/embed/" width="60%" height="540px" style="border: none;">IFrame not supported</iframe>')

Month versus Year distribution of Killed Cyclists

plot4 = alt.Chart(df_casualties_with_time).mark_circle().encode(
    alt.X('month(CRASH DATE):O'),
    alt.Y('year(CRASH DATE):O'),
    size=alt.Size('sum(NUMBER OF CYCLIST KILLED):Q', legend=alt.Legend(title='Sum of Killed Cyclists')),
    color='BOROUGH:N',
).facet(
        facet='BOROUGH:N',
        columns=2,
).properties(
    title='Month versus Year distribution of Killed Cyclists')

dp.Report(
    dp.Plot(plot4),
).upload(name="Plot4")

HTML('<iframe src="https://datapane.com/u/jason0/reports/vAqwOm7/plot4/embed/" width="50%" height="450px" style="border: none;">IFrame not supported</iframe>')

Time versus Day distribution of Injured Cyclists

plot5 = alt.Chart(df_casualties_with_time).mark_circle().encode(
    alt.X('hours(CRASH DATE):O'),
    alt.Y('day(CRASH DATE):O'),
    size=alt.Size('sum(NUMBER OF CYCLIST INJURED):Q', legend=alt.Legend(title='Sum of Injured Cyclists')),
    color='BOROUGH:N',
).facet(
        facet='BOROUGH:N',
        columns=2,
).properties(
    title='Time versus Day distribution of Injured Cyclists')
dp.Report(
    dp.Plot(plot5),
).upload(name="Plot5")

HTML('<iframe src="https://datapane.com/u/jason0/reports/9AxprGA/plot5/embed/" width="80%" height="540px" style="border: none;">IFrame not supported</iframe>')

Time versus Day distribution of Killed Cyclists

plot6 = alt.Chart(df_casualties_with_time).mark_circle().encode(
    alt.X('hours(CRASH DATE):O'),
    alt.Y('day(CRASH DATE):O'),
    size=alt.Size('sum(NUMBER OF CYCLIST KILLED):Q', legend=alt.Legend(title='Sum of Killed Cyclists')),
    color='BOROUGH:N',
).facet(
        facet='BOROUGH:N',
        columns=2,
).properties(
    title='Time versus Day distribution of Killed Cyclists')

dp.Report(
    dp.Plot(plot6),
).upload(name="Plot6")

HTML('<iframe src="https://datapane.com/u/jason0/reports/wAw2ylk/plot6/embed/" width="80%" height="540px" style="border: none;">IFrame not supported</iframe>')

Spatial distribution of cyclists-involved crashes

print(f'Totoally, there are {(100 * df_casualties_with_time_location[~((df_casualties_with_time_location.LONGITUDE.between(-74.259, -73.59) & (df_casualties_with_time_location.LATITUDE.between(40.49, 40.93))))].shape[0] / df_casualties_with_time_location.shape[0]):.2f} % rows of data are anomalous(null or unusual value) Location data')

Totoally, there are 7.69 % rows of data are anomalous(null or unusual value) Location data

We define location column to be False when LOGITUDE and LATITUDE information is not available.

df_casualties_with_time_location['location'] = False
# Limit all of location infos to legitimate range of New York City
df_casualties_with_time_location.loc[(df_casualties_with_time_location.LONGITUDE.between(-74.259, -73.59) & df_casualties_with_time_location.LATITUDE.between(40.49, 40.93)), 'location'] = True
df_casualties_with_time_location.location.value_counts()

True     39400
False     3280
Name: location, dtype: int64

To quickly examine the distribution of these small part of unspecified LOCATION datasets, we plot 4 charts below with respect to time(year, month, day, hours)/Borough/casualties), and we observe:

In terms of time-related columns, these small parts of dataset did not look like to be skewing into any part of values.
For those datapoints which are null value in BOROUGH column, they incline to have unspecified LOCATION as well.

base = alt.Chart(df_casualties_with_time_location)

# xscale = alt.Scale(domain=(4.0, 8.0))
# yscale = alt.Scale(domain=(1.9, 4.55))

area_args = {'opacity': .3, 'interpolate': 'step'}

points = base.mark_circle().encode(
    alt.X('year:O'),
    alt.Y('hours(CRASH DATE):T'),
    color='location'
)

top_hist = base.mark_area(**area_args).encode(
    alt.X('year:O',
          # when using bins, the axis scale is set through
          # the bin extent, so we do not specify the scale here
          # (which would be ignored anyway)
          stack=None,
          title=''
         ),
    alt.Y('count()', stack=None, title=''),
    alt.Color('location:N'),
).properties(height=100)

right_hist = base.mark_area(**area_args).encode(
    alt.Y('hours(CRASH DATE):T',
          stack=None,
          title='',
         ),
    alt.X('count()', stack=None, title=''),
    alt.Color('location:N'),
).properties(width=100)

plot7 = (top_hist & (points | right_hist))

dp.Report(
    dp.Plot(plot7),
).upload(name="Plot7")

HTML('<iframe src="https://datapane.com/u/jason0/reports/8AVqaEk/plot7/embed/" width="50%" height="540px" style="border: none;">IFrame not supported</iframe>')

base = alt.Chart(df_casualties_with_time_location)

# xscale = alt.Scale(domain=(4.0, 8.0))
# yscale = alt.Scale(domain=(1.9, 4.55))

area_args = {'opacity': .3, 'interpolate': 'step'}

points = base.mark_circle().encode(
    alt.X('month(CRASH DATE):O'),
    alt.Y('day(CRASH DATE):O'),
    color='location'
)

top_hist = base.mark_area(**area_args).encode(
    alt.X('month(CRASH DATE):O',
          # when using bins, the axis scale is set through
          # the bin extent, so we do not specify the scale here
          # (which would be ignored anyway)
          stack=None,
          title=''
         ),
    alt.Y('count()', stack=None, title=''),
    alt.Color('location:N'),
).properties(height=100)

right_hist = base.mark_area(**area_args).encode(
    alt.Y('day(CRASH DATE):O',
          stack=None,
          title='',
         ),
    alt.X('count()', stack=None, title=''),
    alt.Color('location:N'),
).properties(width=100)

plot8 = (top_hist & (points | right_hist))
dp.Report(
    dp.Plot(plot8),
).upload(name="Plot8")

HTML('<iframe src="https://datapane.com/u/jason0/reports/dA9b0O3/plot8/embed/" width="50%" height="540px" style="border: none;">IFrame not supported</iframe>')

base = alt.Chart(df_casualties_with_time_location)

# xscale = alt.Scale(domain=(4.0, 8.0))
# yscale = alt.Scale(domain=(1.9, 4.55))

area_args = {'opacity': .3, 'interpolate': 'step'}

points = base.mark_circle().encode(
    alt.X('NUMBER OF CYCLIST KILLED:N'),
    alt.Y('BOROUGH:N'),
    color='location'
)

top_hist = base.mark_area(**area_args).encode(
    alt.X('BOROUGH:N',
          # when using bins, the axis scale is set through
          # the bin extent, so we do not specify the scale here
          # (which would be ignored anyway)
          stack=None,
          title=''
         ),
    alt.Y('count()', stack=None, title=''),
    alt.Color('location:N'),
).properties(height=100)

right_hist = base.mark_area(**area_args).encode(
    alt.Y('NUMBER OF CYCLIST KILLED:N',
          stack=None,
          title='',
         ),
    alt.X('count()', stack=None, title=''),
    alt.Color('location:N'),
).properties(width=100)

plot9 = (top_hist & (points | right_hist))
dp.Report(
    dp.Plot(plot9),
).upload(name="Plot9")

HTML('<iframe src="https://datapane.com/u/jason0/reports/nkKzMy7/plot9/embed/" width="50%" height="540px" style="border: none;">IFrame not supported</iframe>')

base = alt.Chart(df_casualties_with_time_location)

# xscale = alt.Scale(domain=(4.0, 8.0))
# yscale = alt.Scale(domain=(1.9, 4.55))

area_args = {'opacity': .3, 'interpolate': 'step'}

points = base.mark_circle().encode(
    alt.X('NUMBER OF CYCLIST INJURED:O'),
    alt.Y('BOROUGH:N'),
    color='location'
)

top_hist = base.mark_area(**area_args).encode(
    alt.X('NUMBER OF CYCLIST INJURED:O',
          # when using bins, the axis scale is set through
          # the bin extent, so we do not specify the scale here
          # (which would be ignored anyway)
          stack=None,
          title=''
         ),
    alt.Y('count()', stack=None, title=''),
    alt.Color('location:N'),
).properties(height=100)

right_hist = base.mark_area(**area_args).encode(
    alt.Y('BOROUGH:N',
          stack=None,
          title='',
         ),
    alt.X('count()', stack=None, title=''),
    alt.Color('location:N'),
).properties(width=100)

plot10 = (top_hist & (points | right_hist))
dp.Report(
    dp.Plot(plot10),
).upload(name="Plot10")

HTML('<iframe src="https://datapane.com/u/jason0/reports/P3XKWMA/plot10/embed/" width="50%" height="540px" style="border: none;">IFrame not supported</iframe>')

df_casualties_with_time_location = df_casualties_with_time_location[(df_casualties_with_time_location.LONGITUDE.between(-74.259, -73.59) & (df_casualties_with_time_location.LATITUDE.between(40.49, 40.93)))]

Here, we ignore 7.51 % of datapoints from those unspecified location datapoints in order to visualize spacial distribution of cyclists crashes in new york city below.

The plot below enable the adjustable visualzation of yearly crashes in five boroughs of NYC. Note: It is recommended to interact with plots below online by clicking buttom left button due to performace reason

boroughs = alt.topo_feature('https://data.cityofnewyork.us/api/geospatial/tqmj-j8zm?method=export&format=GeoJSON', 'boroughs')
background = alt.Chart(boroughs).mark_geoshape(
    fill='lightgray',
    stroke='white',
    strokeWidth=2
).encode(
    color=alt.value('#eee'),
).properties(
    width=1000,
    height=800
)


points = alt.Chart(df_casualties_with_time_location[df_casualties_with_time_location.location == True]).mark_circle(
    size=10,
    color='steelblue'
).encode(
    longitude='LONGITUDE:Q',
    latitude='LATITUDE:Q',
    color='BOROUGH:N',
    tooltip=['CRASH TIME', 'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED', 'VEHICLE TYPE CODE 1',
       'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4',
       'VEHICLE TYPE CODE 5', 'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2',
       'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4',
       'CONTRIBUTING FACTOR VEHICLE 5']


# A Slider filter
year_slider = alt.binding_range(min=2012, max=2021, step=1)
slider_selection = alt.selection_single(bind=year_slider, fields=['year'], name="Yearly Crashes")

filter_year = points.add_selection(
    slider_selection
).transform_filter(
    slider_selection
).properties(title="Slider Filtering")

(background + filter_year)# & bars

HTML('<iframe src="https://datapane.com/u/jason0/reports/d7dXOZ7/plot11/embed/" width="100%" height="540px" style="border: none;">IFrame not supported</iframe>')

boroughs = alt.topo_feature('https://data.cityofnewyork.us/api/geospatial/tqmj-j8zm?method=export&format=GeoJSON', 'boroughs')
background = alt.Chart(boroughs).mark_geoshape(
    fill='lightgray',
    stroke='white',
    strokeWidth=2
).encode(
    color=alt.value('#eee'),
).properties(
    width=1000,
    height=800
)

points = alt.Chart(df_casualties_with_time_location[df_casualties_with_time_location.location == True]).mark_circle(
    size=20,
    color='steelblue'
).encode(
    longitude='LONGITUDE:Q',
    latitude='LATITUDE:Q',
    color='BOROUGH:N',
    tooltip=['CRASH TIME', 'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED', 'VEHICLE TYPE CODE 1',
       'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4',
       'VEHICLE TYPE CODE 5', 'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2',
       'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4',
       'CONTRIBUTING FACTOR VEHICLE 5']
).transform_filter(
    (alt.datum['NUMBER OF CYCLIST KILLED'] != 0)
)
    
# A Slider filter
year_slider = alt.binding_range(min=2012, max=2021, step=1)
slider_selection = alt.selection_single(bind=year_slider, fields=['year'], name="Yearly Crashes")


filter_year = points.add_selection(
    slider_selection
).transform_filter(
    slider_selection
).properties(title="Slider Filtering")
plot12 = background + filter_year
dp.Report(
    dp.Plot(plot12),
).upload(name="Plot12")

HTML('<iframe src="https://datapane.com/u/jason0/reports/MA1pPYk/plot12/embed/" width="100%" height="540px" style="border: none;">IFrame not supported</iframe>')

Vehicles that normally collide with Bicycles

df_casualties_with_time.groupby(by=['BOROUGH', 'CONTRIBUTING FACTOR VEHICLE 1', 'year']).sum().sort_values(by=['NUMBER OF CYCLIST INJURED'], ascending=False).loc['BROOKLYN']

In these charts below, we can see what kinds of vehicles are responsible for injuries or deaths in most crash events. Noticeably, it also have shown that vehicles casuing more lethal occurences are not necessarily the same as dangerous(cause more injuries than death) ones. Based on different borough, more bulky or speedy vehicles are actually triggering more deaths.

Specifically, among these vehicles, Passenger Vehicle, Sedan, Station Wagon/SUV are main culprits causing lots of casualties. Additionally, crashes with bus and box truck give rise to higher death deaths.

df_casualties_with_time.head()

brooklyn = alt.Chart(
    df_casualties_with_time,
).mark_bar().encode(
    x=alt.X('VEHICLE TYPE CODE 1:N', sort='-y'),
    y=alt.Y('sum(NUMBER OF CYCLIST INJURED):Q'),
    color=alt.Color('sum(NUMBER OF CYCLIST KILLED):Q')
).transform_filter(
    (alt.datum.BOROUGH == 'BROOKLYN')# & (alt.datum.year < 2019)
).properties(
    title='Brookyn')

bronx = alt.Chart(
    df_casualties_with_time,
).mark_bar().encode(
    x=alt.X('VEHICLE TYPE CODE 1:N', sort='-y'),
    y=alt.Y('sum(NUMBER OF CYCLIST INJURED):Q'),
    color=alt.Color('sum(NUMBER OF CYCLIST KILLED):Q')
).transform_filter(
    (alt.datum.BOROUGH == 'BRONX')# & (alt.datum.year < 2019)
).properties(
    title='Bronx')

manhattan = alt.Chart(
    df_casualties_with_time,
).mark_bar().encode(
    x=alt.X('VEHICLE TYPE CODE 1:N', sort='-y'),
    y=alt.Y('sum(NUMBER OF CYCLIST INJURED):Q'),
    color=alt.Color('sum(NUMBER OF CYCLIST KILLED):Q')
).transform_filter(
    (alt.datum.BOROUGH == 'MANHATTAN')# & (alt.datum.year < 2019)
).properties(
    title='Manhattan')

queens = alt.Chart(
    df_casualties_with_time,
).mark_bar().encode(
    x=alt.X('VEHICLE TYPE CODE 1:N', sort='-y'),
    y=alt.Y('sum(NUMBER OF CYCLIST INJURED):Q'),
    color=alt.Color('sum(NUMBER OF CYCLIST KILLED):Q')
).transform_filter(
    (alt.datum.BOROUGH == 'QUEENS')# & (alt.datum.year < 2019)
).properties(
    title='Queens')

staten = alt.Chart(
    df_casualties_with_time,
).mark_bar().encode(
    x=alt.X('VEHICLE TYPE CODE 1:N', sort='-y'),
    y=alt.Y('sum(NUMBER OF CYCLIST INJURED):Q'),
    color=alt.Color('sum(NUMBER OF CYCLIST KILLED):Q', legend=alt.Legend(title='NUMBER OF CYCLIST KILLED'))
).transform_filter(
    (alt.datum.BOROUGH == 'STATEN ISLAND')# & (alt.datum.year < 2019)
).properties(
    title='Statne Island')


dp.Report(
    dp.Page(
        title='Brooklyn',
        blocks=["### Plot", brooklyn]
    ),
    dp.Page(
        title='Bronx',
        blocks=["### Plot", bronx]
    ),
    dp.Page(
        title='Manhattan',
        blocks=["### Plot", manhattan]
    ),
    dp.Page(
        title='Queens',
        blocks=["### Plot", queens]
    ),
    dp.Page(
        title='Staten',
        blocks=["### Plot", staten]
    ),
).upload(name="Plot13")

HTML('<iframe src="https://datapane.com/u/jason0/reports/Y3YVnM7/plot13/embed/" width="100%" height="540px" style="border: none;">IFrame not supported</iframe>')

We also investigate contributing factors in cylists involved collisions. Common factors include:

Driver Inattention/Distraction
Failure to Yield Right-of-Way
Pedestrian/Bicyclist/Other Pedestrian Error/Confusion
Unfortunately, the majority of contributing factors from collisions data are Unspecified.

brooklyn = alt.Chart(
    df_casualties_with_time,
).mark_bar().encode(
    x=alt.X('CONTRIBUTING FACTOR VEHICLE 1:N', sort='-y'),
    y=alt.Y('sum(NUMBER OF CYCLIST INJURED):Q'),
    color=alt.Color('sum(NUMBER OF CYCLIST KILLED):Q')
).transform_filter(
    (alt.datum.BOROUGH == 'BROOKLYN')# & (alt.datum.year < 2019)
).properties(
    title='Brookyn')

bronx = alt.Chart(
    df_casualties_with_time,
).mark_bar().encode(
    x=alt.X('CONTRIBUTING FACTOR VEHICLE 1:N', sort='-y'),
    y=alt.Y('sum(NUMBER OF CYCLIST INJURED):Q'),
    color=alt.Color('sum(NUMBER OF CYCLIST KILLED):Q')
).transform_filter(
    (alt.datum.BOROUGH == 'BRONX')# & (alt.datum.year < 2019)
).properties(
    title='Bronx')

manhattan = alt.Chart(
    df_casualties_with_time,
).mark_bar().encode(
    x=alt.X('CONTRIBUTING FACTOR VEHICLE 1:N', sort='-y'),
    y=alt.Y('sum(NUMBER OF CYCLIST INJURED):Q'),
    color=alt.Color('sum(NUMBER OF CYCLIST KILLED):Q')
).transform_filter(
    (alt.datum.BOROUGH == 'MANHATTAN')# & (alt.datum.year < 2019)
).properties(
    title='Manhattan')

queens = alt.Chart(
    df_casualties_with_time,
).mark_bar().encode(
    x=alt.X('CONTRIBUTING FACTOR VEHICLE 1:N', sort='-y'),
    y=alt.Y('sum(NUMBER OF CYCLIST INJURED):Q'),
    color=alt.Color('sum(NUMBER OF CYCLIST KILLED):Q')
).transform_filter(
    (alt.datum.BOROUGH == 'QUEENS')# & (alt.datum.year < 2019)
).properties(
    title='Queens')

staten = alt.Chart(
    df_casualties_with_time,
).mark_bar().encode(
    x=alt.X('CONTRIBUTING FACTOR VEHICLE 1:N', sort='-y'),
    y=alt.Y('sum(NUMBER OF CYCLIST INJURED):Q'),
    color=alt.Color('sum(NUMBER OF CYCLIST KILLED):Q', legend=alt.Legend(title='NUMBER OF CYCLIST KILLED'))
).transform_filter(
    (alt.datum.BOROUGH == 'STATEN ISLAND')# & (alt.datum.year < 2019)
).properties(
    title='Statne Island')


dp.Report(
    dp.Page(
        title='Brooklyn',
        blocks=["### Plot", brooklyn]
    ),
    dp.Page(
        title='Bronx',
        blocks=["### Plot", bronx]
    ),
    dp.Page(
        title='Manhattan',
        blocks=["### Plot", manhattan]
    ),
    dp.Page(
        title='Queens',
        blocks=["### Plot", queens]
    ),
    dp.Page(
        title='Staten',
        blocks=["### Plot", staten]
    ),
).upload(name="Plot14")

HTML('<iframe src="https://datapane.com/u/jason0/reports/E7ylGg3/plot14/embed/" width="100%" height="540px" style="border: none;">IFrame not supported</iframe>')

What factors contribute to Brooklyn's uncontested role as the most lethal area to bike.

We currently have already known what sorts of vehicles and main contributing factors were held accountable for vehemently high injuries/fatalities.

There are many aspects to consider reasons why brooklyn has much higher injuries/fatalities.

street traffic violation record
street design(road type)
traffic sign design
etc..

In this branch, we focus on traffic violation record.

How traffic violation records differs from each boroughs

df_summons = pd.read_csv('NYPD_B_Summons__Historic_.csv')
df_summons.head()

In this dataset, we have only violation records in 2020, so we only explore this dataset in 2020 from the perspective of moving violation summons.

df_violatecode = pd.read_excel('violationcodes.xlsx')
df_violatecode.head()

df_violatecode['ADJ Code'] = df_violatecode['ADJ Code'].astype('str')
# Check if ADJ Code column has any int type value left
print(len([c for c in df_violatecode['ADJ Code'] if type(c) == int]))
# Check if VIOLATION_CODE column in df_summons still are str dtypes
print(len([c for c in df_summons if type(c) == int]))

0
0

viocode_dict = df_violatecode.set_index('ADJ Code').DESCRIPTION.to_dict()
# Fix data value error
viocode_dict['37524A'] = viocode_dict['37524AB']
viocode_dict['408E5'] = viocode_dict['408E5  ']
del viocode_dict['408E5  ']
# Create a new column in df_summons with created dictionary.
df_summons['DESCRIPTION'] = df_summons.VIOLATION_CODE.map(viocode_dict)
df_summons['can be issued to a bicyclist?'] = df_summons.VIOLATION_CODE.map(df_violatecode.set_index('ADJ Code')['can be issued to a bicyclist?'].to_dict())
display(df_summons.shape)

(510293, 15)

By and large, Brooklyn has highest violation record from both CAR/SUV and Truck/BUS which also account for annual death of cyclists in other four boroughs. PS: There is no such borough called New York, it is a problem from datasets. And those locations actually could came from anywhere in New York city.

plt.figure(figsize=(20, 10))
sns.set_theme(style="whitegrid")
g = sns.countplot(
    data=df_summons,
    y='CITY_NM', hue='VEH_CATEGORY',
)

Interactive bar plots below lists top 20 traffic violation conducts from highest to lowest, viewer can also play around radio buttons to see how changes vary depends on your chosen conditions. In conlusion, although Queens had highest over speeding viloations, brooklyn on the other hand had higher number of records from a variety of violation conducts such as speeding/disobeyed traffic device/operate vehicle while using electronic device/failt to yield cyclists and pedastrian that can explain why the contributing factor Driver Inattention/Distraction of crashes in brooklyn dominate other factors and is significant all over other boroughs.

df_summons_top20_tickets = df_summons.loc[df_summons.VIOLATION_CODE.isin(df_summons.VIOLATION_CODE.value_counts()[:20].index)][['CITY_NM', 'DESCRIPTION']]
df_summons_top20_tickets.head()

alt.data_transformers.enable('default')
alt.data_transformers.disable_max_rows()

bars = alt.Chart(
    df_summons_top20_tickets
).mark_bar().encode(
    x=alt.X('DESCRIPTION:N', sort='-y'),
    y=alt.Y('count():Q'),
    color=alt.Color('count():Q'),
    tooltip=['DESCRIPTION']
)

boroughs = ['BRONX', 'MANHATTAN', 'BROOKLYN', 'QUEENS']#, 'STATEN ISLAND', 'NEW YORK']

borough_radio = alt.binding_radio(options=boroughs)

borough_select = alt.selection_single(fields=['CITY_NM'], bind=borough_radio, name="Boroughs", init={'CITY_NM': 'BRONX'})

filter_borough = bars.add_selection(
    borough_select
).transform_filter(
    borough_select
).properties(title='Violation Code Bar change by Boroughs')

dp.Report(
    dp.Plot(filter_borough),
).upload(name="Plot17")

HTML('<iframe src="https://datapane.com/u/jason0/reports/87NGoVk/plot17/embed/" width="100%" height="540px" style="border: none;">IFrame not supported</iframe>')

From the perspective of moving violation from streets in NYC, it is not so hard to imagine that Brooklyn with such high numbers of violations consistently reflect its hostile environment for cyclists and pedastrians every year. Particularly, moving violations of bicycles was not higher than Manhattan and Queens, however, moving violations of CAR/SUVs in Brooklyn is the highest among boroughs.

Altogether we can safely say Brooklyn is a relatively dangerous place to ride a bike, and we also get a slight idea why brookln always pose potential threats to cyclists. With all that being said, in this analysis we still have not found other factors contribute to Brooklyn's high crashes rate such as street design, traffic sign maintaince, etc..

It wil be intersting to continue dig into this topics, all the insights gotten from analysis would have a potential application for effective city planing. Please wait for future analysis.

	CRASH DATE	CRASH TIME	BOROUGH	ZIP CODE	LATITUDE	LONGITUDE	LOCATION	ON STREET NAME	CROSS STREET NAME	OFF STREET NAME	...	CONTRIBUTING FACTOR VEHICLE 2	CONTRIBUTING FACTOR VEHICLE 3	CONTRIBUTING FACTOR VEHICLE 4	CONTRIBUTING FACTOR VEHICLE 5	COLLISION_ID	VEHICLE TYPE CODE 1	VEHICLE TYPE CODE 2	VEHICLE TYPE CODE 3	VEHICLE TYPE CODE 4	VEHICLE TYPE CODE 5
0	04/14/2021	5:32	NaN	NaN	NaN	NaN	NaN	BRONX WHITESTONE BRIDGE	NaN	NaN	...	Unspecified	NaN	NaN	NaN	4407480	Sedan	Sedan	NaN	NaN	NaN
1	04/13/2021	21:35	BROOKLYN	11217	40.68358	-73.97617	(40.68358, -73.97617)	NaN	NaN	620 ATLANTIC AVENUE	...	NaN	NaN	NaN	NaN	4407147	Sedan	NaN	NaN	NaN	NaN
2	04/15/2021	16:15	NaN	NaN	NaN	NaN	NaN	HUTCHINSON RIVER PARKWAY	NaN	NaN	...	NaN	NaN	NaN	NaN	4407665	Station Wagon/Sport Utility Vehicle	NaN	NaN	NaN	NaN
3	04/13/2021	16:00	BROOKLYN	11222	NaN	NaN	NaN	VANDERVORT AVENUE	ANTHONY STREET	NaN	...	Unspecified	NaN	NaN	NaN	4407811	Sedan	NaN	NaN	NaN	NaN
4	04/12/2021	8:25	NaN	NaN	0.00000	0.00000	(0.0, 0.0)	EDSON AVENUE	NaN	NaN	...	Unspecified	NaN	NaN	NaN	4406885	Station Wagon/Sport Utility Vehicle	Sedan	NaN	NaN	NaN

	CRASH DATE	CRASH TIME	BOROUGH	ZIP CODE	LATITUDE	LONGITUDE	LOCATION	ON STREET NAME	CROSS STREET NAME	OFF STREET NAME	...	CONTRIBUTING FACTOR VEHICLE 2	CONTRIBUTING FACTOR VEHICLE 3	CONTRIBUTING FACTOR VEHICLE 4	CONTRIBUTING FACTOR VEHICLE 5	COLLISION_ID	VEHICLE TYPE CODE 1	VEHICLE TYPE CODE 2	VEHICLE TYPE CODE 3	VEHICLE TYPE CODE 4	VEHICLE TYPE CODE 5
0	2021-04-14	5:32	NaN	NaN	NaN	NaN	NaN	BRONX WHITESTONE BRIDGE	NaN	NaN	...	Unspecified	NaN	NaN	NaN	4407480	Sedan	Sedan	NaN	NaN	NaN
1	2021-04-13	21:35	BROOKLYN	11217	40.68358	-73.97617	(40.68358, -73.97617)	NaN	NaN	620 ATLANTIC AVENUE	...	NaN	NaN	NaN	NaN	4407147	Sedan	NaN	NaN	NaN	NaN
2	2021-04-15	16:15	NaN	NaN	NaN	NaN	NaN	HUTCHINSON RIVER PARKWAY	NaN	NaN	...	NaN	NaN	NaN	NaN	4407665	Station Wagon/Sport Utility Vehicle	NaN	NaN	NaN	NaN
3	2021-04-13	16:00	BROOKLYN	11222	NaN	NaN	NaN	VANDERVORT AVENUE	ANTHONY STREET	NaN	...	Unspecified	NaN	NaN	NaN	4407811	Sedan	NaN	NaN	NaN	NaN
4	2021-04-12	8:25	NaN	NaN	0.00000	0.00000	(0.0, 0.0)	EDSON AVENUE	NaN	NaN	...	Unspecified	NaN	NaN	NaN	4406885	Station Wagon/Sport Utility Vehicle	Sedan	NaN	NaN	NaN

	CRASH DATE	CRASH TIME	BOROUGH	ZIP CODE	LATITUDE	LONGITUDE	LOCATION	ON STREET NAME	CROSS STREET NAME	OFF STREET NAME	...	CONTRIBUTING FACTOR VEHICLE 3	CONTRIBUTING FACTOR VEHICLE 4	CONTRIBUTING FACTOR VEHICLE 5	COLLISION_ID	VEHICLE TYPE CODE 1	VEHICLE TYPE CODE 2	VEHICLE TYPE CODE 3	VEHICLE TYPE CODE 4	VEHICLE TYPE CODE 5	year
0	2021-04-14	5:32	NaN	NaN	NaN	NaN	NaN	BRONX WHITESTONE BRIDGE	NaN	NaN	...	NaN	NaN	NaN	4407480	Sedan	Sedan	NaN	NaN	NaN	2021
1	2021-04-13	21:35	BROOKLYN	11217	40.68358	-73.97617	(40.68358, -73.97617)	NaN	NaN	620 ATLANTIC AVENUE	...	NaN	NaN	NaN	4407147	Sedan	NaN	NaN	NaN	NaN	2021
2	2021-04-15	16:15	NaN	NaN	NaN	NaN	NaN	HUTCHINSON RIVER PARKWAY	NaN	NaN	...	NaN	NaN	NaN	4407665	Station Wagon/Sport Utility Vehicle	NaN	NaN	NaN	NaN	2021
3	2021-04-13	16:00	BROOKLYN	11222	NaN	NaN	NaN	VANDERVORT AVENUE	ANTHONY STREET	NaN	...	NaN	NaN	NaN	4407811	Sedan	NaN	NaN	NaN	NaN	2021
4	2021-04-12	8:25	NaN	NaN	0.00000	0.00000	(0.0, 0.0)	EDSON AVENUE	NaN	NaN	...	NaN	NaN	NaN	4406885	Station Wagon/Sport Utility Vehicle	Sedan	NaN	NaN	NaN	2021

	id	counts	date	status	site
0	0	41.0	08/31/2012 12:00:00 AM	4.0	100005020
1	1	52.0	08/31/2012 12:15:00 AM	4.0	100005020
2	2	38.0	08/31/2012 12:30:00 AM	4.0	100005020
3	3	36.0	08/31/2012 12:45:00 AM	4.0	100005020
4	4	40.0	08/31/2012 01:00:00 AM	4.0	100005020

	id	name	latitude	longitude	domain	site	timezone	interval	counter
0	0	Manhattan Bridge 2012 Test Bike Counter	40.699810	-73.985890	New York City DOT	100005020	(UTC-05:00) US/Eastern;DST	15	NaN
1	5	Ed Koch Queensboro Bridge Shared Path	40.751038	-73.940820	New York City DOT	100009428	(UTC-05:00) US/Eastern;DST	15	Y2H19111445
2	10	1st Avenue - 26th St N - Interference testing	40.738830	-73.977165	New York City DOT	100010020	(UTC-05:00) US/Eastern;DST	15	Y2H18044984
3	14	Manhattan Bridge Interference Calibration 2018...	0.000000	0.000000	New York City DOT	100048744	(UTC-05:00) US/Eastern;DST	15	Y2H13074109
4	20	Columbus Ave at 86th St.	40.787745	-73.975021	New York City DOT	100057320	(UTC-05:00) US/Eastern;DST	15	Y2H18055356

	id	counts	date	status	site	year
0	0	41.0	2012-08-31 00:00:00	4.0	100005020	2012
1	1	52.0	2012-08-31 00:15:00	4.0	100005020	2012
2	2	38.0	2012-08-31 00:30:00	4.0	100005020	2012
3	3	36.0	2012-08-31 00:45:00	4.0	100005020	2012
4	4	40.0	2012-08-31 01:00:00	4.0	100005020	2012

	NUMBER OF CYCLIST INJURED	NUMBER OF CYCLIST KILLED	NUMBER OF PERSONS INJURED	NUMBER OF PERSONS KILLED	NUMBER OF PEDESTRIANS INJURED	NUMBER OF PEDESTRIANS KILLED	NUMBER OF MOTORIST INJURED	NUMBER OF MOTORIST KILLED	year	NUMBER of TOTAL CASUALTIES	NUMBER of TOTAL CYCLISTS CASUALTIES	NUMBER of TOTAL PEDESTRIANS CASUALTIES	NUMBER of TOTAL MOTORIST CASUALTIES
0	2210	6	27453.0	137.0	5906	72	19331	59	2012	27590.0	2216	5978	19390
1	4075	11	55124.0	297.0	11988	176	39060	110	2013	55421.0	4086	12164	39170
2	4000	20	51223.0	262.0	11036	133	36176	109	2014	51485.0	4020	11169	36285
3	4281	15	51358.0	243.0	10084	133	36992	95	2015	51601.0	4296	10217	37087
4	4975	18	60317.0	246.0	11090	149	44010	72	2016	60563.0	4993	11239	44082

	counts	year
0	1611856.0	2012
1	315312.0	2013
2	8712098.0	2014
3	13006008.0	2015
4	15169726.0	2016

	NUMBER OF CYCLIST INJURED	NUMBER OF CYCLIST KILLED	NUMBER OF PERSONS INJURED	NUMBER OF PERSONS KILLED	NUMBER OF PEDESTRIANS INJURED	NUMBER OF PEDESTRIANS KILLED	NUMBER OF MOTORIST INJURED	NUMBER OF MOTORIST KILLED
BOROUGH
BRONX	3497	17	54317.000000	199.000000	12532	110	38054	71
BROOKLYN	13306	52	119906.000000	496.000000	25816	271	80371	167
MANHATTAN	9751	30	53915.000000	264.000000	17823	194	26218	39
QUEENS	6657	28	92284.000000	440.000000	17810	242	67495	169
STATEN ISLAND	393	3	13877.000000	78.000000	2030	33	11435	41

	NUMBER OF CYCLIST INJURED	CRASH DATE	BOROUGH	Total Casualties of Cyclists
0	2	2012-07-01	BRONX	2
1	1	2012-07-02	BRONX	1
2	2	2012-07-03	BRONX	2
3	1	2012-07-04	BRONX	1
4	0	2012-07-05	BRONX	0

	CRASH DATE	BOROUGH	NUMBER OF CYCLIST INJURED	CONTRIBUTING FACTOR VEHICLE 1	CONTRIBUTING FACTOR VEHICLE 2	CONTRIBUTING FACTOR VEHICLE 3	CONTRIBUTING FACTOR VEHICLE 4	CONTRIBUTING FACTOR VEHICLE 5	COLLISION_ID	VEHICLE TYPE CODE 1	VEHICLE TYPE CODE 2	VEHICLE TYPE CODE 3	VEHICLE TYPE CODE 4	VEHICLE TYPE CODE 5	year	ZIP CODE
52	2021-04-16 11:00:00	QUEENS	1	Turning Improperly	Unspecified	NaN	NaN	NaN	4407792	Station Wagon/Sport Utility Vehicle	Bike	NaN	NaN	NaN	2021	11368
90	2021-04-14 00:00:00	NaN	1	Failure to Yield Right-of-Way	Unspecified	NaN	NaN	NaN	4407649	Station Wagon/Sport Utility Vehicle	Bike	NaN	NaN	NaN	2021	NaN
139	2021-04-13 17:55:00	BRONX	1	Pedestrian/Bicyclist/Other Pedestrian Error/Co...	Unspecified	NaN	NaN	NaN	4407789	Station Wagon/Sport Utility Vehicle	Bike	NaN	NaN	NaN	2021	10452
145	2021-04-14 19:45:00	BROOKLYN	1	Driver Inattention/Distraction	Driver Inattention/Distraction	NaN	NaN	NaN	4407414	Sedan	Bike	NaN	NaN	NaN	2021	11201
178	2021-04-16 00:30:00	QUEENS	1	Unsafe Speed	Unspecified	Unspecified	NaN	NaN	4407714	Sedan	Bike	NaN	NaN	NaN	2021	11369

		NUMBER OF CYCLIST INJURED	NUMBER OF CYCLIST KILLED	COLLISION_ID
CONTRIBUTING FACTOR VEHICLE 1	year
Unspecified	2013	986	3	160370315
	2014	755	2	795123211
	2015	686	1	2224404703
	2012	558	1	95388300
Driver Inattention/Distraction	2020	462	2	1981703348
...	...	...	...	...
Traffic Control Device Improper/Non-Working	2019	1	0	4174052
Traffic Control Device Improper/Non-Working	2018	1	0	3933370
Tire Failure/Inadequate	2021	1	0	4446831
Tire Failure/Inadequate	2019	1	0	4162201
Following Too Closely	2015	0	1	3257329

	EVNT_KEY	VIOLATION_DATE	VIOLATION_TIME	CHG_LAW_CD	VIOLATION_CODE	VEH_CATEGORY	CITY_NM	RPT_OWNING_CMD	X_COORD_CD	Y_COORD_CD	Latitude	Longitude	Location Point
0	212468848	04/29/2020	02:54:13	VTL	4021	CAR/SUV	BRONX	41	1011681	234918	40.811437	-73.900907	POINT (-73.90090718699997 40.81143709100007)
1	218070566	09/19/2020	13:45:12	VTL	1110AB	BIKE	MANHATTAN	28	997130	231440	40.801924	-73.953479	POINT (-73.95347921299998 40.801924136000025)
2	212905075	05/11/2020	07:11:59	VTL	1225D	CAR/SUV	MANHATTAN	24	993243	228137	40.792863	-73.967523	POINT (-73.96752338099998 40.79286310800006)
3	217338795	09/02/2020	07:53:00	VTL	1229C3A	TRUCK/BUS	BROOKLYN	69	1012161	178176	40.655693	-73.899409	POINT (-73.89940857299996 40.65569287400007)
4	218463147	09/29/2020	08:10:22	VTL	1225C2A	CAR/SUV	QUEENS	103	1041717	197008	40.707240	-73.792727	POINT (-73.79272672599996 40.707239816000026)

	LAW_TITLE	ADJ Code	Law Code printed on ticket	DESCRIPTION	can be issued to a bicyclist?
0	NYCTRR	413	4-13	OPERATE TRUCK OFF AUTHORIZED RTE - NYC	N
1	VTL	416	416	OPERATE VEHICLE WITH DEALER PLATES MORE THAN 5...	N
2	VTL	601	601	LEAVE SCENE OF INCIDENT - DOMESTIC ANIMAL STRU...	N
3	VTL	1102	1102	FAILED TO COMPLY WITH LAWFUL ORDER	N
4	VTL	1117	1117	FAILED TO STOP AT MALFUNCTIONING TRAFFIC LIGHT	N

Sum of Casualties by Bourough and Years(2012 to 2021)
		NUMBER OF CYCLIST INJURED	NUMBER OF CYCLIST KILLED	NUMBER OF PERSONS INJURED	NUMBER OF PERSONS KILLED	NUMBER OF PEDESTRIANS INJURED	NUMBER OF PEDESTRIANS KILLED	NUMBER OF MOTORIST INJURED	NUMBER OF MOTORIST KILLED
BOROUGH	year
BRONX	2012	155	1	2878.000000	17.000000	753	8	1970	8
	2013	295	1	6259.000000	32.000000	1672	25	4292	6
	2014	356	1	5886.000000	19.000000	1505	10	4025	8
	2015	363	3	5626.000000	26.000000	1372	11	3891	12
	2016	364	1	6050.000000	19.000000	1345	14	4298	5
	2017	325	1	5845.000000	17.000000	1354	10	4167	6
	2018	356	1	6288.000000	21.000000	1444	10	4482	9
	2019	392	0	6528.000000	11.000000	1410	9	4726	2
	2020	615	5	5214.000000	23.000000	987	10	3612	8
	2021	276	3	3743.000000	14.000000	690	3	2591	7
BROOKLYN	2012	879	2	7526.000000	33.000000	1836	19	4805	12
	2013	1539	4	15411.000000	67.000000	3641	30	10231	33
	2014	1409	4	14245.000000	67.000000	3348	42	9487	21
	2015	1513	4	13706.000000	67.000000	3028	46	9164	17
	2016	1452	6	13452.000000	37.000000	2917	20	9035	7
	2017	1300	7	12047.000000	40.000000	2663	20	8089	14
	2018	1368	2	13197.000000	39.000000	2785	23	9034	14
	2019	1397	13	13008.000000	59.000000	2700	33	8911	13
	2020	1553	10	9696.000000	50.000000	1780	17	6363	23
	2021	896	0	7618.000000	37.000000	1118	21	5252	13
MANHATTAN	2012	554	1	3868.000000	21.000000	1390	17	1924	3
	2013	1131	2	7585.000000	39.000000	2811	29	3643	8
	2014	1156	5	6909.000000	35.000000	2586	25	3167	5
	2015	1192	1	6531.000000	22.000000	2288	18	3051	3
	2016	1109	3	6236.000000	34.000000	2076	30	3083	0
	2017	974	8	5648.000000	33.000000	1770	21	2908	4
	2018	1005	2	5437.000000	19.000000	1784	12	2650	5
	2019	1032	2	5242.000000	24.000000	1565	20	2644	2
	2020	958	5	3574.000000	22.000000	880	10	1736	7
	2021	640	1	2885.000000	15.000000	673	12	1412	2
QUEENS	2012	402	2	5506.000000	28.000000	1172	15	3932	11
	2013	721	1	10686.000000	71.000000	2327	45	7638	25
	2014	714	6	10031.000000	52.000000	2192	26	7122	20
	2015	764	6	10423.000000	51.000000	2004	25	7655	20
	2016	691	1	10928.000000	31.000000	2080	19	8081	10
	2017	703	4	10103.000000	40.000000	1923	23	7471	13
	2018	625	1	10818.000000	53.000000	2016	31	8183	21
	2019	721	5	11095.000000	41.000000	2034	26	8340	10
	2020	858	1	7270.000000	46.000000	1288	20	5124	25