Data set transformation of FDNY’s Fire Incident Dispatch data using Pandas (Python)

This quick data set transformation was done for one of my current analytics projects. We decided to study and run models on FDNY’s public fire incident dispatch data from NYC Open Data. It’s an on-going project and the results will be posted at the end of the semester.

The goal of this transformation is to produce a data set that has the following filters and conditions:

  • Remove all row with empty values
  • Filter out the year 2017
  • Split out date strings to convert to Python readable date formats
  • Filter out “Medical Emergencies” and “Medical MFAs” from Incident_Classification_Group
  • Remove insignificant columns
  • Create drill down variables for some of the columns (Incident_Datetime, Incident_Borough, etc.)
  • Replace Police Precinct category with a count of the number of police precincts in each zip code

The end result decreased rows from 1.8M to 800K+.

The transformation was done on Python. The code can be seen below.

Original data set: https://data.cityofnewyork.us/Public-Safety/Fire-Incident-Dispatch-Data/8m42-w767/data

Initial Transformation:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import csv
import pandas as pd
import numpy as np
from datetime import date, time, datetime

rowlist = []
totallist = []
countrows = 0
#PART 1: remove all rows with empty values
# data = pd.read_csv('Fire_Incident_Dispatch_Data.csv', sep = ',', dtype = {'INCIDENT_RESPONSE_SECONDS_QY':np.int64, 'INCIDENT_TRAVEL_TM_SECONDS_QY': np.int64, 'ENGINES_ASSIGNED_QUANTITY': np.int64, 'LADDERS_ASSIGNED_QUANTITY':np.int64,'OTHER_UNITS_ASSIGNED_QUANTITY': np.int64})
# # data.dropna().to_csv('fireincidentdatasetnoblank.csv')
# data = data.dropna()


#PART 2: data set transformation
#get month, year, day, daydate, and time
		#1/01/2013 12:00:37 AM - format
def convert_datetime(csv_datetime):

	DATETIME_SPLIT = csv_datetime.split(' ')
	DATE_SPLIT = DATETIME_SPLIT[0].split('/')
	MONTH = int(DATE_SPLIT[0])
	DAYDATE = int(DATE_SPLIT[1])
	YEAR = int(DATE_SPLIT[2])
	dateformat = date(YEAR,MONTH,DAYDATE)
	WEEKDAY_INDEX = dateformat.weekday()

	#format INCIDENT date
	if WEEKDAY_INDEX == 0:
		WEEKDAY = 'MONDAY'
	elif WEEKDAY_INDEX == 1:
		WEEKDAY = 'TUESDAY'
	elif WEEKDAY_INDEX == 2:
		WEEKDAY = 'WEDNESDAY'
	elif WEEKDAY_INDEX == 3:
		WEEKDAY = 'THURSDAY'
	elif WEEKDAY_INDEX == 4:
		WEEKDAY = 'FRIDAY'
	elif WEEKDAY_INDEX == 5:
		WEEKDAY = 'SATURDAY'
	else:
		WEEKDAY = 'SUNDAY'

	#format time
	TIME_SPLIT = DATETIME_SPLIT[1].split(':')
	HOUR = int(TIME_SPLIT[0])
	MINUTE = int(TIME_SPLIT[1])
	SECOND = int(TIME_SPLIT[2])
	

	if DATETIME_SPLIT[2] == 'AM':
		if HOUR in [12,1,2,3,4,5]:
			QTR_DAY = 'Early Morning'
			if HOUR == 12:
				HOUR_PYTHON = 0
			else:
				HOUR_PYTHON = HOUR
		else:
			QTR_DAY = 'Morning'
			HOUR_PYTHON = HOUR
	else:
		if HOUR in [12,1,2,3,4,5]:
			QTR_DAY = 'Afternoon'
			if HOUR == 12:
				HOUR_PYTHON = HOUR
			else:
				HOUR_PYTHON = HOUR + 12
		else:
			QTR_DAY = 'Evening'
			if HOUR == 12:
				HOUR_PYTHON = HOUR
			else:
				HOUR_PYTHON = HOUR + 12
	timeformat = time(HOUR_PYTHON, MINUTE, SECOND)
	datetimeformat = datetime(YEAR,MONTH,DAYDATE,HOUR_PYTHON,MINUTE,SECOND)
	datetimelist = [YEAR,MONTH,DAYDATE,WEEKDAY,QTR_DAY,str(timeformat),str(datetimeformat)]
	return datetimelist

#process and transform data
with open('fireincidentdatasetnoblank.csv', 'rb') as csv_in:
	myreader = csv.reader(csv_in, delimiter = ',')
	next(myreader) #skips column headers

	for row in myreader:

		INCIDENT_DATETIME = row[2]
		INCIDENT_BOROUGH = row[6]
		ZIPCODE = int(float(row[7]))
		POLICEPRECINCT = int(float(row[8]))
		ALARM_SOURCE_DESCRIPTION_TX = row[13]
		HIGHEST_ALARM_LEVEL = row[15]
		INCIDENT_CLASSIFICATION = row[16]
		INCIDENT_CLASSIFICATION_GROUP = row[17]
		DISPATCH_RESPONSE_SECONDS_QY = row[18]
		FIRST_ASSIGNMENT_DATETIME = row[19]
		FIRST_ACTIVATION_DATETIME = row[20]
		FIRST_ON_SCENE_DATETIME = row[21]
		INCIDENT_CLOSE_DATETIME = row[22]
		VALID_INCIDENT_RSPNS_TIME_INDC = row[24]
		INCIDENT_RESPONSE_SECONDS_QY = row[25]
		INCIDENT_TRAVEL_TM_SECONDS_QY = row[26]
		ENGINES_ASSIGNED_QUANTITY = row[27]
		LADDERS_ASSIGNED_QUANTITY = row[28]
		OTHER_UNITS_ASSIGNED_QUANTITY = row[29]

		#compute for total_resource_qty
		TOTAL_RESOURCE_QTY = int(str(ENGINES_ASSIGNED_QUANTITY)) + int(str(LADDERS_ASSIGNED_QUANTITY)) + int(str(OTHER_UNITS_ASSIGNED_QUANTITY))

		INCIDENT_DATETIME_LIST = convert_datetime(INCIDENT_DATETIME)
		FIRST_ASSIGNMENT_DATETIME_LIST = convert_datetime(FIRST_ASSIGNMENT_DATETIME)
		FIRST_ACTIVATION_DATETIME_LIST = convert_datetime(FIRST_ACTIVATION_DATETIME)
		FIRST_ON_SCENE_DATETIME_LIST = convert_datetime(FIRST_ON_SCENE_DATETIME)
		INCIDENT_CLOSE_DATETIME_LIST = convert_datetime(INCIDENT_CLOSE_DATETIME)
		#datetimelist = [YEAR,MONTH,DAYDATE,WEEKDAY,QTR_DAY,timeformat,datetimeformat]
		#Compute for Incident_Resolution_Sec
		INCIDENT_RESOLUTION_SEC = datetime.strptime(INCIDENT_CLOSE_DATETIME_LIST[6],'%Y-%m-%d %H:%M:%S') - datetime.strptime(INCIDENT_DATETIME_LIST[6],'%Y-%m-%d %H:%M:%S')

		if VALID_INCIDENT_RSPNS_TIME_INDC == "Y":
			if INCIDENT_DATETIME_LIST[0] == "2017":
				pass
			else:
				#INDEPENDENT VARIABLES
				#datetime drilldown
				rowlist.append(INCIDENT_DATETIME_LIST[6])
				# rowlist.append(INCIDENT_DATETIME_LIST[0])
				# rowlist.append(INCIDENT_DATETIME_LIST[1])
				# rowlist.append(INCIDENT_DATETIME_LIST[2])
				# rowlist.append(INCIDENT_DATETIME_LIST[3])
				# rowlist.append(INCIDENT_DATETIME_LIST[4])
				# rowlist.append(INCIDENT_DATETIME_LIST[5])
				# #location drilldown
				# rowlist.append(INCIDENT_BOROUGH)
				rowlist.append(ZIPCODE)
				#policeprecinct
				rowlist.append(POLICEPRECINCT)
				#alarmsource
				rowlist.append(ALARM_SOURCE_DESCRIPTION_TX)
				#alarmlevel
				rowlist.append(HIGHEST_ALARM_LEVEL)
				#incidentclassification drilldown
				# rowlist.append(INCIDENT_CLASSIFICATION_GROUP)
				rowlist.append(INCIDENT_CLASSIFICATION)
				#rowlist.append(FIRST_ASSIGNMENT_DATETIME)
				# rowlist.append(FIRST_ASSIGNMENT_DATETIME_LIST[0])
				# rowlist.append(FIRST_ASSIGNMENT_DATETIME_LIST[1])
				# rowlist.append(FIRST_ASSIGNMENT_DATETIME_LIST[2])
				# rowlist.append(FIRST_ASSIGNMENT_DATETIME_LIST[3])
				# rowlist.append(FIRST_ASSIGNMENT_DATETIME_LIST[4])
				# rowlist.append(FIRST_ASSIGNMENT_DATETIME_LIST[5])
				#rowlist.append(FIRST_ACTIVATION_DATETIME)
				# rowlist.append(FIRST_ACTIVATION_DATETIME_LIST[0])
				# rowlist.append(FIRST_ACTIVATION_DATETIME_LIST[1])
				# rowlist.append(FIRST_ACTIVATION_DATETIME_LIST[2])
				# rowlist.append(FIRST_ACTIVATION_DATETIME_LIST[3])
				# rowlist.append(FIRST_ACTIVATION_DATETIME_LIST[4])
				# rowlist.append(FIRST_ACTIVATION_DATETIME_LIST[5])
				# #rowlist.append(FIRST_ON_SCENE_DATETIME)
				# rowlist.append(FIRST_ON_SCENE_DATETIME_LIST[0])
				# rowlist.append(FIRST_ON_SCENE_DATETIME_LIST[1])
				# rowlist.append(FIRST_ON_SCENE_DATETIME_LIST[2])
				# rowlist.append(FIRST_ON_SCENE_DATETIME_LIST[3])
				# rowlist.append(FIRST_ON_SCENE_DATETIME_LIST[4])
				# rowlist.append(FIRST_ON_SCENE_DATETIME_LIST[5])
				#rowlist.append(INCIDENT_CLOSE_DATETIME)
				#rowlist.append(VALID_INCIDENT_RSPNS_TIME_INDC) #qualifier (should be 'Y')
				#totalresource drilldown
				rowlist.append(TOTAL_RESOURCE_QTY)
				# rowlist.append(ENGINES_ASSIGNED_QUANTITY)
				# rowlist.append(LADDERS_ASSIGNED_QUANTITY)
				# rowlist.append(OTHER_UNITS_ASSIGNED_QUANTITY)

				#DEPENDENT VARIABLES
				rowlist.append(DISPATCH_RESPONSE_SECONDS_QY)		
				# rowlist.append(INCIDENT_RESPONSE_SECONDS_QY)
				rowlist.append(INCIDENT_TRAVEL_TM_SECONDS_QY)
				rowlist.append(int(INCIDENT_RESOLUTION_SEC.total_seconds()))	
				countrows += 1
				print countrows
				# if countrows == 500000:
				# 	print countrows
				# elif countrows == 1000000:
				# 	print countrows
				# elif countrows == 1400000:
				# 	print countrows
				# else:
				# 	pass

				# with open('cleanfireincidentdataset.csv','a') as csvfile:
				# 	for i in range(0,38):
				# 		csvfile.write(str(rowlist[i]))
				# 		if i == 38:
				# 			csvfile.write('\n')
				# 		else:
				# 			csvfile.write(',')
				totallist.append(rowlist)
				rowlist = []
	# print totallist
#labels = ['Incident Date and Time','INCIDENT_YEAR','INCIDENT_MONTH','INCIDENT_DAY','INCIDENT_WEEKDAY','INCIDENT_QTR_OF_DAY','INCIDENT_TIME','INCIDENT_BOROUGH','Zip Code','Count of Police Precincts','Alarm Source Description','Highest Alarm Level','INCIDENT_CLASSIFICATION_GROUP','Incident Classification','FIRST_ASSIGNMENT_YEAR','FIRST_ASSIGNMENT_MONTH','FIRST_ASSIGNMENT_DAY','FIRST_ASSIGNMENT_WEEKDAY','FIRST_ASSIGNMENT_QTR_OF_DAY','FIRST_ASSIGNMENT_TIME','FIRST_ACTIVATION_YEAR','FIRST_ACTIVATION_MONTH','FIRST_ACTIVATION_DAY','FIRST_ACTIVATION_WEEKDAY','FIRST_ACTIVATION_QTR_OF_DAY','FIRST_ACTIVATION_TIME','FIRST_ON_SCENE_YEAR','FIRST_ON_SCENE_MONTH','FIRST_ON_SCENE_DAY','FIRST_ON_SCENE_WEEKDAY','FIRST_ON_SCENE_QTR_OF_DAY','FIRST_ON_SCENE_TIME','Total Quantity of Resources Dispatched', 'ENGINES_ASSIGNED_QUANTITY','LADDERS_ASSIGNED_QUANTITY', 'OTHER_UNITS_ASSIGNED_QUANTITY','Dispatch Response Time (in seconds)','INCIDENT_RESPONSE_SECONDS_QY','Incident Travel Time (in seconds)','Total Resolution Time (in seconds)']
labels = ['Incident Date and Time','Zip Code','Count of Police Precincts','Alarm Source Description','Highest Alarm Level','Incident Classification','Total Quantity of Resources Dispatched', 'Dispatch Response Time (in seconds)','Incident Travel Time (in seconds)','Total Resolution Time (in seconds)']
df = pd.DataFrame.from_records(totallist,columns = labels)	
df.to_csv('cleanfireincident_final.csv')

*Notes: It’s definitely easier and more elegant to create a working CSV file using Pandas’ DataFrame and .to_csv() functions. The structure is automatically created as a whole.

Creation of unique zipcode-police precinct and count list:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
import pandas as pd
import numpy as np

#create data frame from the clean data set and set integer variables
data = pd.read_csv('cleanfireincident_final.csv', sep = ',', dtype = {'Total Quantity of Resources Dispatched':np.int64, 'Dispatch Response Time (in seconds)': np.int64, 'Incident Travel Time (in seconds)': np.int64, 'Total Resolution Time (in seconds)':np.int64})

#create new dataframe only consisting of zip code and police precinct categories (this is not yet the count)
zcu = data[['Zip Code','Count of Police Precincts']].copy()

#create a group key that will be counted later
countzip =  zcu.groupby(['Zip Code','Count of Police Precincts'])
#count unique instances
counting = countzip.aggregate(len)
#insert zcu and count into another dataframe and rename columns
abc = pd.DataFrame(counting.reset_index())
abc.columns = ['Zip Code','Count of Police Precincts','count']
#export to csv
abc.to_csv('uniquezipcodesandcountsofPP.csv')

 

Matching of Zip Code and insertion of real count of police precincts per zip code:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import csv
import pandas as pd
import numpy as np
from datetime import date, time, datetime

#initialize lists
listrow = []
totallist = []
rowlist = []
newcsvlist = []
#same process of opening and reading each column by row
with open('uniquezipcodesandcountsofPP.csv', 'rb') as csv_in:
    myreader2 = csv.reader(csv_in, delimiter = ',')
    next(myreader2) #skips column headers    

    for row in myreader2:
        zipcode2 = row[0]
        pp = row[1]
        count = row[2]
        listrow.append(zipcode2)
        listrow.append(pp)
        listrow.append(count)
       #create a list of lists of zip code-precinct-count
        totallist.append(listrow)
        listrow = []


#same process of opening and reading each column by row
with open('cleanfireincident_final.csv', 'rb') as csv_in:
    myreader = csv.reader(csv_in, delimiter = ',')
    next(myreader) #skips column headers

    for row in myreader:
        dateandtime = row[1]
        zipcode= row[2]
        pp = row[3]
        alarmsource = row[4]
        alarmlevel = row[5]
        classif = row[6]
        qty = row[7]
        dispatchtime = row[8]
        traveltime = row[9]
        resoltime = row[10]
        #print classif

        #matching of zipcode and police precincts
        for ziprow in totallist:
            if zipcode == ziprow[0] and pp == ziprow[1]:
                rowlist.append(dateandtime)
                rowlist.append(zipcode)
               #insert count if match
                rowlist.append(ziprow[2])
                rowlist.append(alarmsource)
                rowlist.append(alarmlevel)
                rowlist.append(classif)
                rowlist.append(qty)
                rowlist.append(dispatchtime)
                rowlist.append(traveltime)
                rowlist.append(resoltime)
                newcsvlist.append(rowlist)
                rowlist = []
            else:
                pass
            
labels = ['Incident Date and Time','Zip Code','Count of Police Precincts','Alarm Source Description','Highest Alarm Level','Incident Classification','Total Quantity of Resources Dispatched', 'Dispatch Response Time (in seconds)','Incident Travel Time (in seconds)','Total Resolution Time (in seconds)']
df = pd.DataFrame.from_records(newcsvlist,columns = labels)	
df.to_csv('new.csv')