# JUNTO Practice: Data Analysis, Chicago Buildings

Discussed on September 08, 2020.

Datasets:

Analyze these datasets. Create a short report on a pattern that you find.

Click to see:

# Chicago Permit and Violation Data

``````import pandas as pd
import os
import numpy as np
import math
``````
``````os.listdir("chicago_data")
``````
``````['chicago_FOIA_Request_Log_-_Buildings.csv',
'chicago_House_Share_Prohibited_Buildings_List.csv',
'chicago_Vacant_and_Abandoned_Buildings_-_Violations.csv',
'chicago_buildings_footprints.csv',
'chicago_Building_Permits.csv',
'chicago_construction_contracts.csv',
'chicago_Payments.csv',
'chicago_Building_Violations.csv',
'chicago_Ordinance_Violations__Buildings_.csv',
``````
``````permits = pd.read_csv(
"chicago_data/chicago_Building_Permits.csv"
)
"chicago_data/chicago_Building_Violations.csv"
)
``````
``````/opt/miniconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3072: DtypeWarning: Columns (1,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
``````

## Hypothesis 1: There is a relationship between number of permits and number of violations

``````print(permits.columns.values)
print(violations.columns.values)
``````
``````['ID' 'PERMIT#' 'PERMIT_TYPE' 'REVIEW_TYPE' 'APPLICATION_START_DATE'
'ISSUE_DATE' 'PROCESSING_TIME' 'STREET_NUMBER' 'STREET DIRECTION'
'STREET_NAME' 'SUFFIX' 'WORK_DESCRIPTION' 'BUILDING_FEE_PAID'
'ZONING_FEE_PAID' 'OTHER_FEE_PAID' 'SUBTOTAL_PAID' 'BUILDING_FEE_UNPAID'
'ZONING_FEE_UNPAID' 'OTHER_FEE_UNPAID' 'SUBTOTAL_UNPAID'
'BUILDING_FEE_WAIVED' 'ZONING_FEE_WAIVED' 'OTHER_FEE_WAIVED'
'SUBTOTAL_WAIVED' 'TOTAL_FEE' 'CONTACT_1_TYPE' 'CONTACT_1_NAME'
'CONTACT_1_CITY' 'CONTACT_1_STATE' 'CONTACT_1_ZIPCODE' 'CONTACT_2_TYPE'
'CONTACT_2_NAME' 'CONTACT_2_CITY' 'CONTACT_2_STATE' 'CONTACT_2_ZIPCODE'
'CONTACT_3_TYPE' 'CONTACT_3_NAME' 'CONTACT_3_CITY' 'CONTACT_3_STATE'
'CONTACT_3_ZIPCODE' 'CONTACT_4_TYPE' 'CONTACT_4_NAME' 'CONTACT_4_CITY'
'CONTACT_4_STATE' 'CONTACT_4_ZIPCODE' 'CONTACT_5_TYPE' 'CONTACT_5_NAME'
'CONTACT_5_CITY' 'CONTACT_5_STATE' 'CONTACT_5_ZIPCODE' 'CONTACT_6_TYPE'
'CONTACT_6_NAME' 'CONTACT_6_CITY' 'CONTACT_6_STATE' 'CONTACT_6_ZIPCODE'
'CONTACT_7_TYPE' 'CONTACT_7_NAME' 'CONTACT_7_CITY' 'CONTACT_7_STATE'
'CONTACT_7_ZIPCODE' 'CONTACT_8_TYPE' 'CONTACT_8_NAME' 'CONTACT_8_CITY'
'CONTACT_8_STATE' 'CONTACT_8_ZIPCODE' 'CONTACT_9_TYPE' 'CONTACT_9_NAME'
'CONTACT_9_CITY' 'CONTACT_9_STATE' 'CONTACT_9_ZIPCODE' 'CONTACT_10_TYPE'
'CONTACT_10_NAME' 'CONTACT_10_CITY' 'CONTACT_10_STATE'
'CONTACT_10_ZIPCODE' 'CONTACT_11_TYPE' 'CONTACT_11_NAME'
'CONTACT_11_CITY' 'CONTACT_11_STATE' 'CONTACT_11_ZIPCODE'
'CONTACT_12_TYPE' 'CONTACT_12_NAME' 'CONTACT_12_CITY' 'CONTACT_12_STATE'
'CONTACT_12_ZIPCODE' 'CONTACT_13_TYPE' 'CONTACT_13_NAME'
'CONTACT_13_CITY' 'CONTACT_13_STATE' 'CONTACT_13_ZIPCODE'
'CONTACT_14_TYPE' 'CONTACT_14_NAME' 'CONTACT_14_CITY' 'CONTACT_14_STATE'
'CONTACT_14_ZIPCODE' 'CONTACT_15_TYPE' 'CONTACT_15_NAME'
'CONTACT_15_CITY' 'CONTACT_15_STATE' 'CONTACT_15_ZIPCODE' 'REPORTED_COST'
'PIN1' 'PIN2' 'PIN3' 'PIN4' 'PIN5' 'PIN6' 'PIN7' 'PIN8' 'PIN9' 'PIN10'
'COMMUNITY_AREA' 'CENSUS_TRACT' 'WARD' 'XCOORDINATE' 'YCOORDINATE'
'LATITUDE' 'LONGITUDE' 'LOCATION']
'VIOLATION STATUS' 'VIOLATION STATUS DATE' 'VIOLATION DESCRIPTION'
'VIOLATION LOCATION' 'VIOLATION INSPECTOR COMMENTS' 'VIOLATION ORDINANCE'
'INSPECTOR ID' 'INSPECTION NUMBER' 'INSPECTION STATUS'
'INSPECTION WAIVED' 'INSPECTION CATEGORY' 'DEPARTMENT BUREAU' 'ADDRESS'
'STREET NUMBER' 'STREET DIRECTION' 'STREET NAME' 'STREET TYPE'
'PROPERTY GROUP' 'SSA' 'LATITUDE' 'LONGITUDE' 'LOCATION']
``````
``````def create_address(row):
street_number = row["STREET_NUMBER"]
street_number = (
str(street_number)
if not math.isnan(street_number)
else ""
)

street_direction = row[
"STREET DIRECTION"
]  # if not math.isnan(row["STREET DIRECTION"]) else ""

street_name = row[
"STREET_NAME"
]  # if not math.isnan(row["STREET_NAME"]) else ""

street_suffix = (
row["SUFFIX"]
if isinstance(row["SUFFIX"], str)
else ""
)

street_number,
street_direction,
street_name,
street_suffix,
]

)
``````
``````permit_counts = (
)
permit_counts.rename(
columns={"ID": "permits"}, inplace=True
)

violation_counts = (
.count()
)
violation_counts.rename(
columns={"ID": "violations"}, inplace=True
)
``````
``````by_permit_stats = pd.merge(
permit_counts,
violation_counts,
how="left",
)
``````
``````by_violation_stats = pd.merge(
permit_counts,
violation_counts,
how="right",
)
``````
``````address_stats = pd.merge(
permit_counts,
violation_counts,
how="outer",
)
``````
``````# Finding One
'''
probability of having a violation was .43

'''
print(
]
.isna()
.value_counts(normalize=True)
)

print(
]
.isna()
.value_counts(normalize=True)
)
``````
``````True     0.560393
False    0.439607
Name: violations, dtype: float64
False    0.633594
True     0.366406
Name: permits, dtype: float64
``````
``````address_stats.plot.scatter(
"permits", "violations", figsize=(20, 10)
)
``````
``````<matplotlib.axes._subplots.AxesSubplot at 0x18569fa90>
``````
``````bins = [i * 10 for i in range(10)]
# bins = [0,10,1000]
)
groups.plot.scatter("permits", "violations")
groups.mean()
``````
``````            permits  violations
permits
(0, 10]    1.965883   12.086148
(10, 20]  14.206584   20.265266
(20, 30]  24.747899   25.132353
(30, 40]  34.858586   23.684932
(40, 50]  45.335260   28.324841
(50, 60]  55.491228   30.333333
(60, 70]  64.370370   26.743243
(70, 80]  75.000000   40.677966
(80, 90]  85.571429   33.978723
``````
``````keys = [key for key, _ in groups]
zero_to_fifty = groups.get_group(keys[0]).index.values
``````
``````print(address_stats.loc[address_stats["violations"] > 300])
].index.values
print(outliers)
``````
``````                   permits  violations
11601 W TOUHY AVE   3966.0       606.0
1900 N AUSTIN AVE    115.0       642.0
``````

I did some research on these addresses TOUHY is an airport, AUSTIN is commercial warehouse offices.

However, based on the binned plots, it doesn't look like there's any meaningful relationship between permits and violations.

## Hypothesis 2: We can Predict Reported Cost With Permit Fees

``````# Interesting new avenuex
permits.plot.scatter("ID", "REPORTED_COST")
``````
``````<matplotlib.axes._subplots.AxesSubplot at 0x13d265810>
``````
``````import matplotlib.pyplot as plt
``````
``````permits.loc[
permits["REPORTED_COST"] < 100.0, "REPORTED_COST"
].value_counts()
print(permits["REPORTED_COST"].mean())
log_cost = np.log10(permits["REPORTED_COST"])

fig = plt.figure(figsize=(20, 20))

ax = plt.subplot()
ax.boxplot(log_cost.values[np.where(log_cost > 0)])

plt.show()

fig = plt.figure(figsize=(20, 20))

ax = plt.subplot()
ax.hist(log_cost.values[np.where(log_cost > 0)])

plt.show()
``````
``````214862.34834776595
``````
``````/opt/miniconda3/lib/python3.7/site-packages/pandas/core/series.py:856: RuntimeWarning: divide by zero encountered in log10
result = getattr(ufunc, method)(*inputs, **kwargs)
/opt/miniconda3/lib/python3.7/site-packages/pandas/core/series.py:856: RuntimeWarning: invalid value encountered in log10
result = getattr(ufunc, method)(*inputs, **kwargs)
``````
``````permits["log10_cost"] = log_cost
permits["log10_fees"] = np.log10(permits["TOTAL_FEE"])
``````
``````/opt/miniconda3/lib/python3.7/site-packages/pandas/core/series.py:856: RuntimeWarning: divide by zero encountered in log10
result = getattr(ufunc, method)(*inputs, **kwargs)
/opt/miniconda3/lib/python3.7/site-packages/pandas/core/series.py:856: RuntimeWarning: invalid value encountered in log10
result = getattr(ufunc, method)(*inputs, **kwargs)
``````
``````mask = (permits["log10_cost"] > 0) & (
permits["log10_fees"] > 0
)
"log10_cost", "log10_fees", figsize=(20, 10)
)
``````
``````<matplotlib.axes._subplots.AxesSubplot at 0x13e630410>
``````
``````import statsmodels.formula.api as sm
import math
``````
``````modeling_permits = permits.loc[mask, :]
``````
``````modeling_permits.rename(
columns={"STREET DIRECTION": "STREET_DIRECTION"},
inplace=True,
)
``````
``````/opt/miniconda3/lib/python3.7/site-packages/pandas/core/frame.py:4238: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
return super().rename(**kwargs)
``````
``````formula = f"log10_cost ~ log10_fees + C(STREET_DIRECTION)"
lm = sm.ols(
formula=formula, data=modeling_permits, missing="drop"
).fit()
lm.summary()
``````
``````<class 'statsmodels.iolib.summary.Summary'>
"""
OLS Regression Results
==============================================================================
Dep. Variable:             log10_cost   R-squared:                       0.551
Method:                 Least Squares   F-statistic:                 1.702e+05
Date:                Tue, 08 Sep 2020   Prob (F-statistic):               0.00
Time:                        16:32:55   Log-Likelihood:            -5.8860e+05
No. Observations:              554677   AIC:                         1.177e+06
Df Residuals:                  554672   BIC:                         1.177e+06
Df Model:                           4
Covariance Type:            nonrobust
============================================================================================
coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                    0.6173      0.005    116.970      0.000       0.607       0.628
C(STREET_DIRECTION)[T.N]    -0.0163      0.004     -4.095      0.000      -0.024      -0.009
C(STREET_DIRECTION)[T.S]    -0.0837      0.004    -20.991      0.000      -0.092      -0.076
C(STREET_DIRECTION)[T.W]    -0.0444      0.004    -11.185      0.000      -0.052      -0.037
log10_fees                   1.3485      0.002    821.572      0.000       1.345       1.352
==============================================================================
Omnibus:                    24718.225   Durbin-Watson:                   1.742
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            86342.735
Skew:                          -0.029   Prob(JB):                         0.00
Kurtosis:                       4.932   Cond. No.                         21.9
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
``````

I tried using just log10 fees, and then slowly increased the number of variables, but we get marginal increases in R-squared up to 0.601. Not great. It should be noted, however, that at this point we haven't tried using the text at all.

## Let's see if Machine Learning can do better by integrating text

``````from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import (
CountVectorizer,
TfidfTransformer,
)
from sklearn import svm
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
``````
``````from sklearn.model_selection import train_test_split
``````
``````modeling_permits["WORK_DESCRIPTION"] = modeling_permits[
"WORK_DESCRIPTION"
].fillna("")
``````
``````/opt/miniconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
``````
``````X = modeling_permits[["log10_fees", "WORK_DESCRIPTION"]]
# X = X.reset_index().iloc[:1:]
y = modeling_permits["log10_cost"]
# y = y.reset_index().iloc[:,1:]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.25
)
``````
``````text_clf = Pipeline(
[
(
"text",
ColumnTransformer(
transformers=[
(
"WORK_DESCRIPTION",
Pipeline(
[
("vect", CountVectorizer()),
(
"tfidf",
TfidfTransformer(),
),
]
),
"WORK_DESCRIPTION",
)
],
remainder="passthrough",
),
),
("clf", svm.LinearSVR()),
]
)

text_clf.fit(X_train, y_train)
``````
``````/opt/miniconda3/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
"the number of iterations.", ConvergenceWarning)
``````
``````Pipeline(
steps=[
(
"text",
ColumnTransformer(
remainder="passthrough",
transformers=[
(
"WORK_DESCRIPTION",
Pipeline(
steps=[
("vect", CountVectorizer()),
(
"tfidf",
TfidfTransformer(),
),
]
),
"WORK_DESCRIPTION",
)
],
),
),
("clf", LinearSVR()),
]
)
``````
``````print(f"R-squared:{text_clf.score(X_test,y_test)}")
``````
``````R-squared:0.7275009762794954
``````

This R-Squared is substantially better than our OLS one. This is with very simple one-hot-encoding to integrate text. If we used a more complex embedding of the text, we may be able to get even better results.

# How long does the City of Chicago take to process building permits?

In the City of Chicago, most construction work requires a building permit. Two questions you may ask about a building permit are "how much will it cost?" and "how long will it take to be approved?" The City provides a tool that calculates the cost of a permit, but does not provide information on how long it will take. The goal of this research is to create a set of "rules" that can be used by people to develop a sense of how long their permit application will take. Using publicly available data, I am able to construct this set of rules. However, for some permit types, there is substantial unexplained variance, which limits the usefulness of these rules.

## Introduction

In Chicago, most construction work requires a building permit.

If I need a permit, I want to know:

1. How much will it cost? (Cost question.)
2. How long will it take to be approved? (Time question.)

The City of Chicago answers the cost question by providing a calculator on their website ("Calculate the Cost of a Building Permit", n.d.).

But it's difficult for me to find an answer to the time question. I was unable to find anything on the City of Chicago's website. Diechmann (2017) claims that the average permit is approved within 70 days. However,

• I want to know the approval time based on permit type. (I have a hunch that certain permit types take longer to be approved than other types.)
• I don't want to know the average approval time. Instead, I want to know the approval time for a quantile. (E.g. "80% of permits are approved within Y days.")

The goal of this research is to create a list of "rules" that say something like:

You can expect a permit of type X to be approved within Y days.

## Methods

The City of Chicago provides a dataset of building permits ("Building Permits", n.d.).

I downloaded a comma-separated value (CSV) file of the dataset and imported it into a SQLite (Hipp et al. "SQLite") database, `permit.db`:

``````\$ sqlite3 permit.db
sqlite> .mode csv
sqlite> .import Building_Permits.csv Building_Permit
``````

I examined the table `Building_Permit` and identified 4 columns of interest:

ColumnNote*
`permit#`"Tracking number assigned at beginning of permit application process."
`permit_type`"Type of permit."
`processing_time`"Number of days between `application_start_date` and `issue_date`."**
`issue_date`"Date when City determined permit ready to issue, subject to payment of permit fees."

* Column definitions are from "Building Permits" (n.d.).

** `aplication_start_date` is when "[the] City began reviewing permit application".

Then I created a table of cleaned data, `Building_Permit_Clean`, that:

• Skips rows with a negative processing time.
• Changes the date format in `issue_date` from `MM/DD/YYYY` to `YYYY-MM-DD`.

Here's the table schema:

``````CREATE TABLE Building_Permit_Clean (
permit_no     TEXT     NOT NULL,
permit_type   TEXT     NOT NULL,
process_days  INTEGER  NOT NULL,
issue_date    TEXT     NOT NULL,
--
PRIMARY KEY (permit_no),
CHECK (permit_no <> ''),
CHECK (permit_type <> ''),
CHECK (process_days >= 0)
);
``````

And here's the SQL code that inserts data into `Building_Permit_Clean` from the data in `Building_Permit`:

``````INSERT INTO Building_Permit_Clean
(permit_no, permit_type, process_days, issue_date)
SELECT "permit#"
, permit_type
, CAST(processing_time AS INTEGER)
, mdy_to_ymd(issue_date)
FROM Building_Permit
WHERE CAST(processing_time AS INTEGER) >= 0
``````

NOTE: `mdy_to_ymd` is an application-defined function (Hipp et al. "Application-Defined SQL Functions") that I created that converts dates from the `MM/DD/YYYY` format (e.g. `09/02/2020`) to the `YYYY-MM-DD` format (e.g. `2020-09-02`).

I decided to measure 90% and 99% quantiles of `process_days` for each distinct value of `permit_type`.

I only used data for permits issued in 2019. Here's a SQL query that generates this data:

``````SELECT permit_type, process_days
FROM Building_Permit_Clean
WHERE strftime('%Y', issue_date) = '2019'
``````

To aggregate the data and measure the quantiles, I used the programming language R (R Core Team, 2018) with these software packages:

• DBI (R-SIG-DB, 2019).
• RSQLite (Müller, 2020).

## Results

48,676 permits were issued in 2019.

There are 9 distinct permit types. Here are their counts:

Permit TypeCount
Electric Wiring15,804
Easy Permit Process14,190
Renovation/Alteration9,481
Signs4,035
Elevator Equipment1,684
New Construction1,633
Wrecking/Demolition1,084
Reinstate Revoked Permit324
Scaffolding441
TOTAL48,676

Here are the 90% and 99% quantiles for processing time based on permit type:

Permit TypeProcessing Time (days)
Q=90%Q=99%
Scaffolding04
Reinstate Revoked Permit021
Easy Permit Process127
Elevator Equipment214
Electric Wiring644
Renovation/Alteration108353
New Construction111469
Wrecking/Demolition120449
Signs2541,141

Here are the derived rules:

Based on 2019 data:

• For a Scaffolding permit, 90% are processed same-day. 99% are processed within 4 days.
• For a Reinstate Revoked Permit permit, 90% are processed same-day. 99% are processed within 21 days.
• For an Easy Permit Process permit, 90% of applications are processed within 1 day. 99% are processed within 27 days.
• For an Elevator Equipment permit, 90% are processed within 2 days. 99% are processed within 14 days.
• For an Electric Wiring permit, 90% are processed within 6 days. 99% are processed within 44 days.
• For a Renovation/Alteration permit, 90% are processed within 108 days. 99% are processed within 353 days.
• For a New Construction permit, 90% are processed within 111 days. 99% are processed within 469 days.
• For a Wrecking/Demolition permit, 90% are processed within 120 days. 99% are processed within 449 days.
• For a Signs permit, 90% are processed within 254 days. 99% are processed within 1,141 days.

## Discussion

I was able to create the list of rules that I set out to create.

However, it is clear that there is unexplained variance for several permits. E.g.

• For a Signs permit, 90% are processed within 254 days. 99% are processed within 1,141 days.

What explains this 887 day difference between the 90% and 99% quantiles?

The building permit dataset contains another column, `work_description`, that is a "[d]escription of work authorized by the permit." ("Building Permits", n.d.) I hypothesize that the work descriptions contains information that can explain this variance in the Signs permit.

## References

### Daniel Bassett

This is not a finished solution. I'm building a neural net with Pytorch to approximate a function that answers the question:

Given certain building permit variables (fees, paid fees, cost of project, processing time, etc.), what is the probability that the applicant will violate their permit?

The building of this net is still in progress. First, I will finish a net that can approximate processing time given parameters. Then, I will finish the net by optimizing and adding more variables in order to approximate violation probability. I expect to have this completed by the next meeting.

Below is the code for what I have started thus far.

``````import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
``````
``````# Create a neural net with Pytorch in order to calculate
# process time function given set of variables from building
# permit

# This is unfinished and will not run properly.

class Chicago_Permits(Dataset):
def __init__(self, file_name):

x = file.iloc[0:10000, 0:3].values
y = file.iloc[0:10000, 1].values

ss = StandardScaler()
x_train = ss.fit_transform(x)
y_train = y

self.x_tensor = torch.tensor(
x_train, dtype=torch.float32
)
self.y_tensor = torch.tensor(y_train)
``````
``````def __len__(self):
return len(self.y_tensor)

def __getitem__(self, idx):
return self.x_tensor[idx], self.y_tensor[idx]
``````

``````import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_chicago import Chicago_Permits
``````
``````ds = Chicago_Permits(
"BuildingPermits.csv",
usecols=[
"TOTAL_FEE",
"SUBTOTAL_PAID",
"REPORTED_COST",
"PROCESSING_TIME",
],
)

feature_set, batch_size=10, shuffle=True
)

class Net(nn.Module):
def __init__(self, fee, paid, cost):
super(Net, self).__init__()
self.L1 = Linear(fee, 1)
self.L2 = Linear(paid, 1)
self.L3 = Linear(cost, 1)
self.activation = Sigmoid()

def forward(self, X):
x = self.L1(x)
x = self.L2(x)
x = self.L3(x)
x = self.activation(x)
return x
``````
``````learning_rate = 0.01
for f in net.parameters():
``````optimizer = optim.SGD(net.parameters(), lr=0.01)