import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 12)

# Load a csv directly into a dataframe
# parse_dates parameter converts the DATE and TIME columns into DATETIME format, resulting in a combined column called DATE_TIME
df = pd.read_csv("c:\NYPD_Motor_Vehicle_Collisions.csv",parse_dates=[['DATE', 'TIME']])

# Remove spaces from column names
cols = df.columns
cols = cols.map(lambda x: x.replace(' ', '_') if isinstance(x, (str, unicode)) else x)
df.columns = cols

# Delete the columns we don't care about
df = df.drop(['LATITUDE', 'LONGITUDE', 'LOCATION', 'UNIQUE_KEY','ON_STREET_NAME','CROSS_STREET_NAME','OFF_STREET_NAME'], axis=1)

# Lets do some aggregation
# Find total by year
df.groupby(df.DATE_TIME.dt.year).sum()

# Now we will look at data for a specific year
# Find total number of persons injured and killed by month in 2014
df[df.DATE_TIME.dt.year == 2014].groupby(df.DATE_TIME.dt.month).sum()[['NUMBER_OF_PERSONS_INJURED','NUMBER_OF_PERSONS_KILLED']]

# Sort the data in descending order on NUMBER_OF_PERSONS_KILLED by zip code
df.groupby(df.ZIP_CODE).sum().sort(['NUMBER_OF_PERSONS_KILLED'],ascending=0)

# Looks like Zip Code 11236 is the most dangerous

# Graph number of pedestrians/motorist killed by year
df.groupby(df.DATE_TIME.dt.year).sum()[['NUMBER_OF_PEDESTRIANS_KILLED','NUMBER_OF_MOTORIST_KILLED']].plot(kind='bar')

# 2013 was the worst year for pedestrian deaths.

<matplotlib.axes._subplots.AxesSubplot at 0x9fa47160>

# Lets look at year 2014 and see how the deaths varied by Borough
df.groupby(df.BOROUGH).sum()[['NUMBER_OF_PEDESTRIANS_KILLED','NUMBER_OF_MOTORIST_KILLED']].plot(kind='bar')

# As we can see, most number of pedestrians died in Brooklyn than any other. 
# Staten Island had the least number of deaths for both pedestrians and motorists. Surprised?

<matplotlib.axes._subplots.AxesSubplot at 0x94532940>

# How did the deaths of pedestrians vary by year in Manhattan?
df[df.BOROUGH == 'MANHATTAN'].groupby(df.DATE_TIME.dt.year).sum()[['NUMBER_OF_PEDESTRIANS_KILLED']].plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x941e0668>

# Distribution of deaths by borough which have VEHICLE_TYPE_CODE_1 as passenger vehicle
df[df.VEHICLE_TYPE_CODE_1 == 'PASSENGER VEHICLE'].groupby(df.BOROUGH).sum()[['NUMBER_OF_PERSONS_KILLED']].plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x86b6f2b0>

	ZIP_CODE	NUMBER_OF_PERSONS_INJURED	NUMBER_OF_PERSONS_KILLED	NUMBER_OF_PEDESTRIANS_INJURED	NUMBER_OF_PEDESTRIANS_KILLED	NUMBER_OF_CYCLIST_INJURED	NUMBER_OF_CYCLIST_KILLED	NUMBER_OF_MOTORIST_INJURED	NUMBER_OF_MOTORIST_KILLED
2012	836355612	27445	137	5907	72	2210	6	19328	59
2013	1682824973	55112	297	11984	176	4074	11	39054	110
2014	1688700297	51198	262	11027	133	3997	20	36171	109
2015	759746044	20931	103	4284	53	1319	4	15328	46

	NUMBER_OF_PERSONS_INJURED	NUMBER_OF_PERSONS_KILLED
1	3920	23
2	3106	13
3	3853	18
4	4048	22
5	4658	26
6	4985	22
7	4603	29
8	4476	22
9	4464	28
10	4718	20
11	4242	26
12	4125	13

	ZIP_CODE	NUMBER_OF_PERSONS_INJURED	NUMBER_OF_PERSONS_KILLED	NUMBER_OF_PEDESTRIANS_INJURED	NUMBER_OF_PEDESTRIANS_KILLED	NUMBER_OF_CYCLIST_INJURED	NUMBER_OF_CYCLIST_KILLED	NUMBER_OF_MOTORIST_INJURED	NUMBER_OF_MOTORIST_KILLED
ZIP_CODE
11236	56123820	1945	18	304	7	67	0	1574	11
11434	52722174	1448	12	180	7	43	0	1225	5
11203	61818154	2311	11	404	4	123	0	1784	7
10025	30445925	602	10	227	9	91	1	284	0
11206	49127104	1432	10	363	6	255	1	814	3
10022	66987048	834	9	351	9	125	0	358	0
...	...	...	...	...	...	...	...	...	...
10307	4225870	90	0	11	0	0	0	79	0
10282	400998	6	0	6	0	0	0	0	0
10281	236463	3	0	1	0	2	0	0	0
10280	894360	14	0	3	0	6	0	5	0
10111	111221	0	0	0	0	0	0	0	0
11697	713517	19	0	1	0	6	0	12	0

Analyzing NYC traffic data using Pandas

Join the Conversation

Leave a comment

Cancel reply