
JUNTO Practice: Data Analysis, Chicago Buildings (part 3)

Discussed on October 07, 2020.

Datasets:

Continue your analysis from the previous meeting. Expand on your original work or find new problems to work on.

Solutions


Oscar Martinez

Junto Chicago 3

Summary

Last week we tried moving from a one-hot-encoded support vector machine to a word2vec word-embedding representation of work descriptions. We saw little improvement on regression but strong performance on classification.

This week, after trying many different improvements and hypotheses, we ultimately decided to try a state-of-the-art natural language processing method by fine-tuning BERT (specifically, the lighter DistilBERT variant) on our task.

BERT does not substantially improve our classification of permit types, but the simplicity of the implementation, together with the fact that it matches our benchmark support vector machine, makes this a good first step toward using BERT for more complex tasks on the dataset.

!pip install transformers
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()
# On a machine with an attached GPU, this looks like "/device:GPU:0".
if device_name == "/device:GPU:0":
    print("Found GPU at: {}".format(device_name))
else:
    raise SystemError("GPU device not found")
Found GPU at: /device:GPU:0
import torch

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print(
        "There are %d GPU(s) available."
        % torch.cuda.device_count()
    )
    print(
        "We will use the GPU:",
        torch.cuda.get_device_name(0),
    )
# If not...
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")
There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB
import os

os.listdir()
import pandas as pd

permit_descriptions = pd.read_csv(
    "permits_text.csv",
    usecols=["WORK_DESCRIPTION", "PERMIT_TYPE"],
)
permit_descriptions.head(10)
                                    WORK_DESCRIPTION                     PERMIT_TYPE
0  INSTALL 1" OR 2" EMPTY CONDUIT FOR AT&T FIBER ...        PERMIT - ELECTRIC WIRING
1  NON-ILLUMINATED ALUMINUM BLADE SIGN WITH DOUBL...                  PERMIT - SIGNS
2  NON-ILLUMINATED ALUMINUM BLADE SIGN WITH DOUBL...                  PERMIT - SIGNS
3                                      PORCH REPAIRS  PERMIT - RENOVATION/ALTERATION
4                                  LOW VOLTAGE ALARM        PERMIT - ELECTRIC WIRING
5  ELECTRICAL INSTALLATIONS AND REVISIONS OF THE ...        PERMIT - ELECTRIC WIRING
6          INSTALLATION OF LOW VOLTAGE BURGLAR ALARM        PERMIT - ELECTRIC WIRING
7  ADDITION OF STAIR TO CONNECT UNITS 27A WITH 28...  PERMIT - RENOVATION/ALTERATION
8  RECONFIGURE INTERIOR STAIRWELL, RECONFIGURE EX...  PERMIT - RENOVATION/ALTERATION
9                                 AUGUST MAINTENANCE        PERMIT - ELECTRIC WIRING
# Replace missing work descriptions with the empty string.
permit_descriptions[
    "WORK_DESCRIPTION"
] = permit_descriptions["WORK_DESCRIPTION"].fillna("")
permit_descriptions["WORK_DESCRIPTION"].apply(len).min()
0
# Drop rows whose work description is shorter than 3 characters.
permit_descriptions = permit_descriptions.loc[
    permit_descriptions["WORK_DESCRIPTION"].apply(len) >= 3
]
permit_descriptions["WORK_DESCRIPTION"].apply(len).mean()
96.54883743589929
# Work with a 100k-row sample to keep fine-tuning tractable.
test_data = permit_descriptions.sample(100000)
# Map each permit type to an integer label (0 = most frequent type).
label_mapper = {
    l: i
    for i, l in enumerate(
        test_data["PERMIT_TYPE"].value_counts().index
    )
}
label_mapper
{'PERMIT - EASY PERMIT PROCESS': 1,
 'PERMIT - ELECTRIC WIRING': 0,
 'PERMIT - ELEVATOR EQUIPMENT': 5,
 'PERMIT - FOR EXTENSION OF PMT': 10,
 'PERMIT - NEW CONSTRUCTION': 4,
 'PERMIT - PORCH CONSTRUCTION': 7,
 'PERMIT - REINSTATE REVOKED PMT': 9,
 'PERMIT - RENOVATION/ALTERATION': 2,
 'PERMIT - SCAFFOLDING': 8,
 'PERMIT - SIGNS': 3,
 'PERMIT - WRECKING/DEMOLITION': 6}
test_data["task_label"] = test_data.PERMIT_TYPE.apply(
    lambda ptp: label_mapper[ptp]
)
from sklearn.model_selection import train_test_split
test_data.reset_index(inplace=True)
train, test = train_test_split(test_data, test_size=0.2)
train, test
(        index  ... task_label
 63889  242707  ...          2
 94022  329636  ...          0
 76860  488123  ...          0
 31005  338678  ...          2
 7511   459993  ...          1
 ...       ...  ...        ...
 68670  304684  ...          2
 92108    7650  ...          3
 44546  181511  ...          1
 49382  481054  ...          1
 28955  148422  ...          0
 
 [80000 rows x 4 columns],         index  ... task_label
 17619  199324  ...          0
 23807  470668  ...          3
 27166  153212  ...          0
 13058  294012  ...          2
 55666  402124  ...          2
 ...       ...  ...        ...
 11293    2099  ...          2
 8865   334439  ...          0
 1153   374233  ...          1
 85238  252232  ...          4
 54355  295877  ...          1
 
 [20000 rows x 4 columns])
train, val = train_test_split(train, test_size=0.2)
from transformers import AutoTokenizer

# Load the BERT tokenizer. (It shares its vocabulary with the
# DistilBERT model we fine-tune below, so the encodings are compatible.)
print("Loading BERT tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased", do_lower_case=True
)
Loading BERT tokenizer...

test_encodings = tokenizer(
    test["WORK_DESCRIPTION"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)
train_encodings = tokenizer(
    train["WORK_DESCRIPTION"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)
val_encodings = tokenizer(
    val["WORK_DESCRIPTION"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    return_dict=True,
    num_labels=11,  # one output per permit type
)
model.to(device)
model.train()
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5)
class PermitDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # The encodings are already tensors (return_tensors="pt"),
        # so we can index them directly.
        item = {
            key: val[idx]
            for key, val in self.encodings.items()
        }
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = PermitDataset(
    train_encodings, train["task_label"].to_list()
)
test_dataset = PermitDataset(
    test_encodings, test["task_label"].to_list()
)
val_dataset = PermitDataset(
    val_encodings, val["task_label"].to_list()
)
from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset, batch_size=16, shuffle=True
)
# Fine-tune for 3 epochs.
for epoch in range(3):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # With labels supplied, the output holds (loss, logits).
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs[0]
        loss.backward()
        optimizer.step()
# Sanity check: correct predictions in the final training batch.
(torch.argmax(outputs[1], 1) == labels).sum().item()
16
val_loader = DataLoader(
    val_dataset, batch_size=16, shuffle=True
)
model.eval()
correct = 0
total = 0
# Print the running validation accuracy after each batch.
for batch in val_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    outputs = model(
        input_ids,
        attention_mask=attention_mask,
        labels=labels,
    )

    total += outputs[1].size(0)
    correct += (
        (torch.argmax(outputs[1], 1) == labels).sum().item()
    )

    print(f"Val Accuracy {100 * correct / total}")

print(f"Total Val Accuracy {100 * correct / total}")
Val Accuracy 100.0
Val Accuracy 96.875
Val Accuracy 95.83333333333333
Val Accuracy 95.3125
Val Accuracy 93.75
Val Accuracy 93.75
Val Accuracy 94.64285714285714
Val Accuracy 92.96875
Val Accuracy 92.36111111111111
Val Accuracy 93.125
Val Accuracy 93.75
Val Accuracy 94.27083333333333
Val Accuracy 94.23076923076923
Val Accuracy 94.64285714285714
Val Accuracy 95.0
Val Accuracy 94.53125
Val Accuracy 94.48529411764706
Val Accuracy 94.79166666666667
...
Val Accuracy 96.7622745490982
Val Accuracy 96.75925925925925
Val Accuracy 96.75625
Total Val Accuracy 96.75625

John Lekberg

Quantifying the change in building permit processing times, year over year, in the City of Chicago

In the City of Chicago, most construction work requires a building permit. In my previous research, I created a list of rules that described the distribution of processing times, with the goal of predicting processing times for the future. But I assumed that the distributions did not change year over year. This assumption was proven wrong by later research, but I did not quantify the difference. This research quantifies the difference in distributions, and finds that in general the distributions do change -- but not by much -- year over year. As a result, the derived ruleset can still be used to predict processing times for the future. However, there are unexplained outliers: in some years, the distributions change more dramatically.

Introduction

In the City of Chicago, most construction work requires a building permit. If I need a permit, I want to know:

  1. How much will it cost? (Cost question.)
  2. How long will it take to be processed? (Time question.)

The City of Chicago answers the cost question by providing a calculator on their website. ("Calculate the Cost of a Building Permit", n.d.)

But I couldn't find an answer to the time question, so in my previous research I created a list of rules that describe the distribution of processing times based on permit type.

However, when I built this list, I assumed that the distribution of processing times stays the same, year over year. I tested this assumption in my research from last meeting and found that the distributions do change year over year.

The next question to answer is "by how much?", so I need to quantify the differences in the distributions: if the distributions are technically different but practically the same, then the rules that I derived may still be useful.

The goal of this research is to quantify the difference in processing time distributions for the different permit types, year over year.

Methods

The City of Chicago provides a dataset for building permits ("Building Permits", n.d.). I downloaded this data as a comma-separated value (CSV) file and imported it into a SQLite (Hipp et al. "SQLite") database, named permit.db:

$ sqlite3 permit.db
> .mode csv
> .import Building_Permits.csv Permit_Raw

I identified 4 columns of interest in Permit_Raw:

permit#: Tracking number assigned at the beginning of the permit application process.
permit_type: Type of permit.
processing_time: Number of days between application_start_date and issue_date.
issue_date: Date when the City determined the permit was ready to issue, subject to payment of permit fees.

Column descriptions are from "Building Permits" (n.d.).

Based on these columns, I cleaned the data into a new table with this schema:

CREATE TABLE Permit_Clean (
    id_pmt            TEXT     NOT NULL,
    cat_pmt_type      TEXT     NOT NULL,
    amt_pmt_proc_day  INTEGER  NOT NULL,
    dt_pmt_issue      TEXT     NOT NULL,
    
    PRIMARY KEY (id_pmt),

    CONSTRAINT 'cat_pmt_type non-empty'
        CHECK (cat_pmt_type <> ''),

    CONSTRAINT 'amt_pmt_proc_day non-negative'
        CHECK (amt_pmt_proc_day >= 0),

    CONSTRAINT 'dt_pmt_issue YYYY-MM-DD'
        CHECK (dt_pmt_issue LIKE '____-__-__')
);

I used all data from Permit_Raw except for records whose processing time was missing or negative (see the WHERE clause below).

Here's the SQL to do the cleaning:

INSERT INTO Permit_Clean
  (id_pmt, cat_pmt_type, amt_pmt_proc_day, dt_pmt_issue)
SELECT "permit#"
       , permit_type
       , CAST(processing_time AS INTEGER)
       , mdy_to_ymd(issue_date)
  FROM Permit_Raw
 WHERE CAST(processing_time AS INTEGER) >= 0
;

mdy_to_ymd is an application-defined function (Hipp et al. "Application-Defined SQL Functions") that I created that converts dates from MM/DD/YYYY to YYYY-MM-DD.
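
The original definition of mdy_to_ymd isn't shown. As a rough sketch, such a function could be written and registered through Python's sqlite3 module (the author may have used a different mechanism):

import sqlite3

def mdy_to_ymd(mdy):
    # Convert a date from MM/DD/YYYY to YYYY-MM-DD.
    month, day, year = mdy.split("/")
    return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

con = sqlite3.connect("permit.db")
# Make the function callable from SQL statements on this connection.
con.create_function("mdy_to_ymd", 1, mdy_to_ymd)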

I quantify the difference in processing time distributions year over year for the different permit types. To do this, I use the statistic from the Kolmogorov-Smirnov test ("Kolmogorov-Smirnov test", n.d.), defined as

D = sup_x | F(x) - G(x) |

where F and G are the empirical distribution functions of the two years of data.
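
The author computed the statistic in R (see below); as an illustration, the same quantity could be computed in Python with SciPy, reusing the Permit_Clean schema from above:

import sqlite3
from scipy.stats import ks_2samp

con = sqlite3.connect("permit.db")

def proc_days(year, pmt_type):
    # Processing times for one permit type, issued in one year.
    rows = con.execute(
        "SELECT amt_pmt_proc_day FROM Permit_Clean"
        " WHERE cat_pmt_type = ? AND dt_pmt_issue LIKE ?",
        (pmt_type, f"{year}-%"),
    )
    return [r[0] for r in rows]

# D for Signs permits, 2019 vs. 2018.
d = ks_2samp(
    proc_days(2019, "PERMIT - SIGNS"),
    proc_days(2018, "PERMIT - SIGNS"),
).statistic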

I calculate this statistic using the R programming language (R Core Team, 2018), along with these software packages:

Results

Here are the results of calculating the Kolmogorov-Smirnov statistic for nine permit types for the years 2015-2019, comparing each year with the prior year. The underlying data covers 2014-2019.

Table: Easy Permit Process.

  Year  Prior  D
  2015  2014    1.2%
  2016  2015    0.4%
  2017  2016    2.6%
  2018  2017    1.3%
  2019  2018    4.3%

Table: Signs.

  Year  Prior  D
  2015  2014    6.6%
  2016  2015    8.5%
  2017  2016    4.9%
  2018  2017    5.8%
  2019  2018   11.2%

Table: Electric Wiring.

  Year  Prior  D
  2015  2014   19.6%
  2016  2015    4.9%
  2017  2016    1.7%
  2018  2017    1.2%
  2019  2018    1.7%

Table: New Construction.

  Year  Prior  D
  2015  2014    3.6%
  2016  2015    8.1%
  2017  2016    4.4%
  2018  2017    5.1%
  2019  2018    3.8%

Table: Elevator Equipment.

  Year  Prior  D
  2015  2014    2.2%
  2016  2015    5.8%
  2017  2016    6.2%
  2018  2017    4.2%
  2019  2018    3.1%

Table: Renovation/Alteration.

  Year  Prior  D
  2015  2014    3.8%
  2016  2015    2.5%
  2017  2016    6.8%
  2018  2017   12.4%
  2019  2018    5.4%

Table: Scaffolding.

  Year  Prior  D
  2015  2014    1.6%
  2016  2015    1.2%
  2017  2016    0.6%
  2018  2017    1.4%
  2019  2018    0.6%

Table: Wrecking/Demolition.

  Year  Prior  D
  2015  2014   12.9%
  2016  2015    7.8%
  2017  2016    6.4%
  2018  2017    3.5%
  2019  2018    5.3%

Table: Reinstate Revoked Permit.

  Year  Prior  D
  2015  2014   30.1%
  2016  2015    2.0%
  2017  2016    2.1%
  2018  2017    2.2%
  2019  2018    2.5%

Discussion

In most cases, the year-over-year change in distribution, measured using the Kolmogorov-Smirnov statistic, is less than 10%.

5 of the 45 tests (11% of tests) measured a statistic greater than or equal to 10%:

  1. Signs in year 2019. Statistic = 11.2%.
  2. Electric Wiring in year 2015. Statistic = 19.6%.
  3. Renovation/Alteration in year 2018. Statistic = 12.4%.
  4. Wrecking/Demolition in year 2015. Statistic = 12.9%.
  5. Reinstate Revoked Permit in year 2015. Statistic = 30.1%.

I don't have an explanation for why these five are such outliers. Future research should try to explain these.

How should you interpret the Kolmogorov-Smirnov statistic? Here are some examples:

As a result, the fact that 89% of tests (40 of 45) produced a statistic less than 10% indicates that the distributions do not change much, year over year. This answers the question from my previous research: "even though the distributions are technically different, are they practically the same?".

So, I think that the rules that I derived in my previous research are still useful. However, users should add a "fudge factor" that takes into account the slight changes in distribution year-over-year.
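
To make the "fudge factor" concrete (my illustration, not part of the original analysis): D bounds how far apart the two years' empirical CDFs can be at any point, so a rule like "90% of permits are processed within X days" derived from one year's data must hold for at least (90 - 100*D)% of permits in a year whose distribution differs by D. A quick check in Python with toy data:

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
# Toy processing times: the second "year" is mildly shifted.
year_a = rng.exponential(scale=30.0, size=5000)
year_b = rng.exponential(scale=33.0, size=5000)

d = ks_2samp(year_a, year_b).statistic
x90 = np.quantile(year_a, 0.90)      # rule derived from year A
coverage_b = np.mean(year_b <= x90)  # how the rule holds in year B
print(f"D = {d:.3f}")
print(f"year-B coverage of the 90% rule: {coverage_b:.3f}")
print(f"guaranteed lower bound: {0.90 - d:.3f}")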

References

Daniel Bassett

Chicago Building Permits Neural Network

Introduction

Given our dataset on Chicago building permits, I have decided to build a neural network using PyTorch that can estimate the processing time given variables such as the amount of the fee paid as well as the reported cost of the project. Further variables will be added to the network based on our early versions and tests. We will then seek to use these findings to predict the likelihood that someone is in violation of their permit.

1.0: Organization & Torchization

Our first step in the project will be to organize our data and filter out the “junk”, such as rows with empty or erroneous values. We will also convert our data from its simple CSV format into PyTorch tensors: NumPy-like arrays (here holding float32 values). This will allow us to perform our more complex neural network operations later on.

import pandas as pd
import numpy as np
import torch
from matplotlib import pyplot as plt

permits = pd.read_csv("Building_Permits.csv", thousands=",")

clean = permits.dropna(axis=0)  # drop rows with any missing values

subtotal_paid = clean["SUBTOTAL_PAID"]
total_fee = clean["TOTAL_FEE"]
processing_time = clean["PROCESSING_TIME"]
violation_id = clean["VIOLATION_ID"]
reported_cost = clean["REPORTED_COST"]

fee_ratio = subtotal_paid / total_fee

permits_numpy = np.array(
    [
        fee_ratio,
        subtotal_paid,
        total_fee,
        reported_cost,
        processing_time,
        violation_id,
    ],
    dtype=np.float32,
)

permits_torch = torch.from_numpy(permits_numpy)

# Index masks for plausible values: fee ratio in [0, 1] and processing
# time of at most 5000 days (computed here but not used below).
fee_ratio_actual = np.where(
    np.logical_and(fee_ratio >= 0, fee_ratio <= 1)
)

processing_time_actual = np.where(
    np.less_equal(processing_time, 5000)
)
C:\Users\danie\Anaconda\lib\site-packages\IPython\core\interactiveshell.py:3071: DtypeWarning: Columns (1,2,3,4,5,8,9,10,11,25,26,27,28) have mixed types.Specify dtype option on import or set low_memory=False.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,

1.1: Some Simple Plots

Now that we have organized our data and cleaned out some of the junk, let’s do some basic plotting to get a sense of whether our proposed theory is plausible. We will plot our ‘fee ratio’ against the ‘processing time’ to see if there is any sort of relation.

fig = plt.figure(dpi=600)
plt.xlabel("Paid Fee Ratio")
plt.ylabel("Processing Time")
plt.plot(permits_numpy[0], permits_numpy[4], "bo")
[<matplotlib.lines.Line2D at 0x22730c9f160>]

2.0: Torch Net Fundamentals

After running some simple plots, we want to discover the deeper (no pun intended) relationship between our variables and expected processing time. If we can find a functional and robust relation, perhaps we can achieve our overall goal: a neural net predictor of processing time as a function of our given variables.

import torch.optim as optim
import torch.nn as nn
from sklearn import preprocessing

# NOTE: index 2 of permits_torch is TOTAL_FEE, not the fee ratio
# (which sits at index 0), so frt actually holds total fees; this
# matches the "Total Fees" axis label on the plot further below.
frt = permits_torch[2]
rct = permits_torch[3]  # reported cost tensor
ptt = permits_torch[4]  # processing time tensor

# Add a feature dimension: shape (n,) -> (n, 1).
frt = frt.clone().detach().unsqueeze(1)
ptt = ptt.clone().detach().unsqueeze(1)
rct = rct.clone().detach().unsqueeze(1)

n_samples = ptt.shape[0]
n_val = int(0.2 * n_samples)

shuffled_indices = torch.randperm(n_samples)

train_indices = shuffled_indices[:-n_val]
val_indices = shuffled_indices[-n_val:]

ptt_train = ptt[train_indices]
frt_train = frt[train_indices]
rct_train = rct[train_indices]

ptt_value = ptt[val_indices]
frt_value = frt[val_indices]
rct_value = rct[val_indices]

# Crude input scaling: shrink the processing times by a factor of 10.
ptt_untrain = 0.1 * ptt_train
ptt_unval = 0.1 * ptt_value

# Quick smoke test: run an untrained linear model on the validation inputs.
linear_model = nn.Linear(1, 1)
linear_model(ptt_unval)

linear_model = nn.Linear(1, 1)
optimizer = optim.SGD(linear_model.parameters(), lr=1e-2)

list(linear_model.parameters())
[Parameter containing:
 tensor([[0.5401]], requires_grad=True),
 Parameter containing:
 tensor([-0.8294], requires_grad=True)]

We organize our data and parameters into appropriate types. In addition, we define our key variables by their position in our original torch tensor:

  * frt = fee tensor (index 2; as noted in the code, this is the total fee rather than the computed fee ratio, which sits at index 0)
  * rct = reported cost tensor
  * ptt = processing time tensor

def training_loop(
    n_epochs,
    optimizer,
    model,
    loss_fn,
    ptt_train,
    ptt_value,
    frt_train,
    frt_value,
):
    for epoch in range(1, n_epochs + 1):
        # Forward pass on the training and validation splits.
        pt_train = model(ptt_train)
        loss_train = loss_fn(pt_train, frt_train)

        pt_val = model(ptt_value)
        loss_val = loss_fn(pt_val, frt_value)

        # Backpropagate the training loss only.
        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()

        if epoch == 1 or epoch % 1000 == 0:
            print(
                f"Epoch {epoch}, Training loss {loss_train.item():.4f},"
                f" Validation loss {loss_val.item():.4f}"
            )

def loss_fn(pt, frt):
    squared_diffs = (pt - frt) ** 2
    return squared_diffs.mean()


linear_model = nn.Linear(1, 1)
optimizer = optim.Adam(linear_model.parameters(), lr=1e-2)

training_loop(
    n_epochs=3000,
    optimizer=optimizer,
    model=linear_model,
    loss_fn=loss_fn,
    ptt_train=ptt_untrain,
    ptt_value=ptt_unval,
    frt_train=frt_train,
    frt_value=frt_value,
)

print()
print(linear_model.weight)
print(linear_model.bias)
Epoch 1, Training loss 78878864.0000, Validation loss 59914564.0000
Epoch 1000, Training loss 78812888.0000, Validation loss 59848468.0000
Epoch 2000, Training loss 78761344.0000, Validation loss 59796900.0000
Epoch 3000, Training loss 78721384.0000, Validation loss 59756920.0000

Parameter containing:
tensor([[25.6018]], requires_grad=True)
Parameter containing:
tensor([28.6782], requires_grad=True)

Analysis

After defining our training loop and loss function, we execute our first training loop using the ‘Adam’ optimizer. While the relative difference between the training loss and validation loss looks okay, the extremely large values are a bit concerning. Furthermore, it seems our neural network is learning very slowly. This could indicate that:

  * we do not have enough variables, OR
  * our overall thesis about some kind of relationship is wrong.

Because we don’t want to abandon the thesis without further testing, let’s alter some of our parameters and then add more variables to see if there is a difference. We will switch the optimizer to SGD to see if that makes a difference in our loss values.

linear_model = nn.Linear(1, 1)
optimizer = optim.SGD(linear_model.parameters(), lr=1e-2)

training_loop(
    n_epochs=3000,
    optimizer=optimizer,
    model=linear_model,
    loss_fn=nn.MSELoss(),  # equivalent to the hand-written loss_fn above
    ptt_train=ptt_untrain,
    ptt_value=ptt_unval,
    frt_train=frt_train,
    frt_value=frt_value,
)

print()
print(linear_model.weight)
print(linear_model.bias)
Epoch 1, Training loss 78870432.0000, Validation loss 59906128.0000
Epoch 1000, Training loss 78098944.0000, Validation loss 59167068.0000
Epoch 2000, Training loss 78098944.0000, Validation loss 59167068.0000
Epoch 3000, Training loss 78098944.0000, Validation loss 59167068.0000

Parameter containing:
tensor([[23.6578]], requires_grad=True)
Parameter containing:
tensor([821.0441], requires_grad=True)

2.1: Adjustments

Changing the optimizer did not make a large difference in our values. We shall next look at relaxing our linearity assumption.

# A small nonlinear model: one hidden layer of 13 tanh units.
seq_model = nn.Sequential(
    nn.Linear(1, 13), nn.Tanh(), nn.Linear(13, 1)
)
seq_model

[param.shape for param in seq_model.parameters()]

for name, param in seq_model.named_parameters():
    print(name, param.shape)
Sequential(
  (0): Linear(in_features=1, out_features=13, bias=True)
  (1): Tanh()
  (2): Linear(in_features=13, out_features=1, bias=True)
)
from collections import OrderedDict

seq_model = nn.Sequential(
    OrderedDict(
        [
            ("hidden_linear", nn.Linear(1, 8)),
            ("hidden_activation", nn.Tanh()),
            ("output_linear", nn.Linear(8, 1)),
        ]
    )
)

seq_model
Sequential(
  (hidden_linear): Linear(in_features=1, out_features=8, bias=True)
  (hidden_activation): Tanh()
  (output_linear): Linear(in_features=8, out_features=1, bias=True)
)
for name, param in seq_model.named_parameters():
    print(name, param.shape)
hidden_linear.weight torch.Size([8, 1])
hidden_linear.bias torch.Size([8])
output_linear.weight torch.Size([1, 8])
output_linear.bias torch.Size([1])
seq_model.output_linear.bias
Parameter containing:
tensor([-0.2399], requires_grad=True)
optimizer = optim.SGD(
    seq_model.parameters(), lr=1e-3  # smaller lr for the deeper model
)

training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    model=seq_model,
    loss_fn=nn.MSELoss(),
    ptt_train=ptt_untrain,
    ptt_value=ptt_unval,
    frt_train=frt_train,
    frt_value=frt_value,
)

print("output", seq_model(ptt_unval))
print("answer", frt_value)
print("hidden", seq_model.hidden_linear.weight.grad)
Epoch 1, Training loss 77971072.0000, Validation loss 59061600.0000
Epoch 1000, Training loss 77970968.0000, Validation loss 59060816.0000
Epoch 2000, Training loss 77970632.0000, Validation loss 59060944.0000
Epoch 3000, Training loss 77970728.0000, Validation loss 59060132.0000
Epoch 4000, Training loss 77975160.0000, Validation loss 59055040.0000
Epoch 5000, Training loss 77970688.0000, Validation loss 59059828.0000
output tensor([[1004.1236],
        [ 654.4979],
        [1004.1236],
        ...,
        [ 654.4979],
        [ 654.4979],
        [ 654.5068]], grad_fn=<AddmmBackward>)
answer tensor([[175.],
        [ 50.],
        [240.],
        ...,
        [150.],
        [300.],
        [575.]])
hidden tensor([[ 1.4657e+02],
        [-5.6897e-02],
        [ 6.7035e-02],
        [ 4.1606e-01],
        [-4.1232e-02],
        [ 9.1959e-03],
        [-4.4917e-01],
        [ 4.8263e-01]])
from matplotlib import pyplot as plt

t_range = torch.arange(20.0, 90.0).unsqueeze(1)

fig = plt.figure(dpi=600)
plt.xlabel("Total Fees")
plt.ylabel("Processing Time")
plt.plot(ptt.numpy(), frt.numpy(), "o")
plt.plot(
    t_range.numpy(),
    seq_model(0.1 * t_range).detach().numpy(),
    "c-",
)
plt.plot(
    ptt.numpy(), seq_model(0.1 * ptt).detach().numpy(), "kx"
)
[<matplotlib.lines.Line2D at 0x22712a873d0>]

2.2: Changing Variables

Because our ‘fee ratio’ variable did not yield a robust fit, we are now going to run a similar neural network using reported cost instead. Let’s redefine our loss function and the other parts of the network.

def training_loop_two(
    n_epochs,
    optimizer,
    model,
    loss_fn,
    ptt_train,
    ptt_value,
    rct_train,
    rct_value,
):
    for epoch in range(1, n_epochs + 1):
        # Forward pass; use the same loss_fn for training and validation.
        pt_train = model(ptt_train)
        loss_train = loss_fn(pt_train, rct_train)

        pt_val = model(ptt_value)
        loss_val = loss_fn(pt_val, rct_value)

        # Backpropagate the training loss only.
        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()

        if epoch == 1 or epoch % 1000 == 0:
            print(
                f"Epoch {epoch}, Training loss {loss_train.item():.4f},"
                f" Validation loss {loss_val.item():.4f}"
            )
def loss_fn_two(pt, rct):
    squared_diffs = (pt - rct) ** 2
    return squared_diffs.mean()


linear_model = nn.Linear(1, 1)
optimizer = optim.Adam(linear_model.parameters(), lr=1e-2)

training_loop_two(
    n_epochs=3000,
    optimizer=optimizer,
    model=linear_model,
    loss_fn=loss_fn_two,
    ptt_train=ptt_untrain,
    ptt_value=ptt_unval,
    rct_train=rct_train,
    rct_value=rct_value,
)

print()
print(linear_model.weight)
print(linear_model.bias)
Epoch 1, Training loss 225899442077696.0000, Validation loss 1398942386356224.0000
Epoch 1000, Training loss 225899425300480.0000, Validation loss 1398942386356224.0000
Epoch 2000, Training loss 225899374968832.0000, Validation loss 1398942386356224.0000
Epoch 3000, Training loss 225899374968832.0000, Validation loss 1398942386356224.0000

Parameter containing:
tensor([[29.9907]], requires_grad=True)
Parameter containing:
tensor([30.3575], requires_grad=True)

Analysis

This is not good news. Our training and validation losses became larger, and our model again seems to be learning very slowly. This means that there may not be a strong relation between reported cost and processing time, at least with those two variables in isolation. It follows that we must build a more robust neural network in order to find a solid relationship between our given variables and processing time. If that does not work, then the given variables may not be good estimators of our dependent variable.

3.0: Expanding the Network

Seeing that our model was learning very slowly with just some basic parameters and variables, we now seek to make our neural net more robust by adding more variables. This will help us to assess the accuracy of our thesis that the given dataset variables have a relationship with the processing time.

We will first try to integrate our original fee variables rather than using our calculated ‘fee ratio’. Perhaps using a ‘made up’ ratio was affecting our accuracy.

print(total_fee.mean())
print(subtotal_paid.mean())
859.2772863504475
744.8584693186931

Interestingly, the mean subtotal paid does not match the mean total fee, which means that on average someone applying for a permit does not pay the full fee due. It could still be the case that our ‘fee ratio’ value was causing erroneous calculations, so from here on we will use the subtotal paid and total fee values separately.

My initial thought is to build a multi-layer perceptron using all of the tabular variables to predict processing time. We are going to try a slightly different neural network structure in addition to adding our new variables.

import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks.early_stopping import (
    EarlyStopping,
)
from torch.utils.data import (
    Dataset,
    DataLoader,
    random_split,
)


class Chi_Dataset(Dataset):
    def __init__(self):
        self.columns = clean  # the cleaned DataFrame from above

    def __len__(self):
        return len(self.columns)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        columns = self.columns.iloc[idx, 0:]

        # Target: the processing time for this permit.
        y = columns["PROCESSING_TIME"]

        columns = columns[
            [
                "SUBTOTAL_PAID",
                "TOTAL_FEE",
                "REPORTED_COST",
                "COMMUNITY_AREA",
                "WARD",
            ]
        ]
        columns = columns.tolist()
        columns = torch.FloatTensor(columns)

        return columns, y
class LitClassifier(pl.LightningModule):
    def __init__(
        self,
        lr: float = 1e-3,
        num_workers: int = 4,
        batch_size: int = 32,
    ):
        super().__init__()
        self.lr = lr
        self.num_workers = num_workers
        self.batch_size = batch_size

        # Note: ln1, batchnorm, dropout, and ln2 are defined but never
        # used in forward(); only the ln4-ln7 stack below is active.
        self.ln1 = nn.Linear(64 * 26 * 26, 16)
        self.relu = nn.ReLU()
        self.batchnorm = nn.BatchNorm1d(16)
        self.dropout = nn.Dropout2d(0.5)
        self.ln2 = nn.Linear(16, 5)

        self.ln4 = nn.Linear(5, 10)
        self.ln5 = nn.Linear(10, 10)
        self.ln6 = nn.Linear(10, 5)
        # Bug: ln6 outputs 5 features, so this should be nn.Linear(5, 1);
        # the run below fails before forward() is ever called, so the
        # mismatch goes unnoticed.
        self.ln7 = nn.Linear(10, 1)

    def forward(self, col):
        # Only the ln4-ln6 stack (with ReLU activations) feeds ln7.
        col = self.ln4(col)
        col = self.relu(col)
        col = self.ln5(col)
        col = self.relu(col)
        col = self.ln6(col)
        x = self.relu(col)

        return self.ln7(x)

    def training_step(self, batch, batch_idx):
        columns, y = batch

        criterion = torch.nn.L1Loss()
        y_pred = torch.flatten(self(columns))
        y_pred = y_pred.double()

        loss = criterion(y_pred, y)

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        columns, y = batch

        criterion = torch.nn.L1Loss()
        y_pred = torch.flatten(self(columns))
        y_pred = y_pred.double()

        val_loss = criterion(y_pred, y)

        return {"val_loss": val_loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack(
            [x["val_loss"] for x in outputs]
        ).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {
            "val_loss": avg_loss,
            "log": tensorboard_logs,
        }

    def test_step(self, batch, batch_idx):
        columns, y = batch

        criterion = torch.nn.L1Loss()
        y_pred = torch.flatten(self(columns))
        y_pred = y_pred.double()

        test_loss = criterion(y_pred, y)

        return {"test_loss": test_loss}

    def test_epoch_end(self, outputs):
        avg_loss = torch.stack(
            [x["test_loss"] for x in outputs]
        ).mean()
        logs = {"test_loss": avg_loss}
        return {
            "test_loss": avg_loss,
            "log": logs,
            "progress_bar": logs,
        }

    def configure_optimizers(self):
        return torch.optim.Adam(
            self.parameters(), lr=(self.lr)
        )

    def train_dataloader(self):
        # train_set / val_set / test_set are never assigned anywhere,
        # which is what triggers the AttributeError below.
        return DataLoader(
            self.train_set, batch_size=self.batch_size
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_set, batch_size=self.batch_size
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_set, batch_size=self.batch_size
        )
if __name__ == "__main__":
    logger = pl_loggers.TensorBoardLogger(
        "lightning_logs", name="multi_input"
    )
    early_stop_callback = EarlyStopping(
        monitor="val_loss",
        min_delta=5000,
        patience=7,
        verbose=False,
        mode="min",
    )
model = LitClassifier()
trainer = pl.Trainer(
    logger=logger, early_stop_callback=early_stop_callback
)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
lr_finder = trainer.lr_find(model)
fig = lr_finder.plot(suggest=True, show=True)
new_lr = lr_finder.suggestion()
model.hparams.lr = new_lr

  | Name      | Type        | Params
------------------------------------------
0 | ln1       | Linear      | 692 K 
1 | relu      | ReLU        | 0     
2 | batchnorm | BatchNorm1d | 32    
3 | dropout   | Dropout2d   | 0     
4 | ln2       | Linear      | 85    
5 | ln4       | Linear      | 60    
6 | ln5       | Linear      | 110   
7 | ln6       | Linear      | 55    
8 | ln7       | Linear      | 11
AttributeError: 'LitClassifier' object has no attribute 'val_set'
trainer.fit(model)
trainer.test(model)

  | Name      | Type        | Params
------------------------------------------
0 | ln1       | Linear      | 692 K 
1 | relu      | ReLU        | 0     
2 | batchnorm | BatchNorm1d | 32    
3 | dropout   | Dropout2d   | 0     
4 | ln2       | Linear      | 85    
5 | ln4       | Linear      | 60    
6 | ln5       | Linear      | 110   
7 | ln6       | Linear      | 55    
8 | ln7       | Linear      | 11
AttributeError: 'LitClassifier' object has no attribute 'val_set'
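
The run fails because the LightningModule never receives any datasets (its dataloader methods reference train_set, val_set, and test_set attributes that are never assigned), and ln7's input size does not match ln6's output. A minimal sketch of one way to get training to start, using the Chi_Dataset defined above; the split fractions and attribute names here are my choices, not the original author's:

from torch.utils.data import random_split

full = Chi_Dataset()
n_val = int(0.2 * len(full))
n_test = int(0.1 * len(full))
n_train = len(full) - n_val - n_test

model = LitClassifier()
# Attach the splits that the dataloader methods expect.
model.train_set, model.val_set, model.test_set = random_split(
    full, [n_train, n_val, n_test]
)
# ln6 outputs 5 features, so the output layer must accept 5 inputs.
model.ln7 = nn.Linear(5, 1)

trainer = pl.Trainer(max_epochs=10)
trainer.fit(model)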