from fastai.vision.all import *
Introduction
I am following along with Chapter 4 of [1].
Download dataset
#collapse-output
# Fetch the MNIST sample dataset and keep the returned location.
path = untar_data(URLs.MNIST_SAMPLE)
So what got downloaded?
# List the top-level entries of the downloaded dataset.
path.ls()
(#3) [Path('/data/kaushik/.fastai/data/mnist_sample/valid'),Path('/data/kaushik/.fastai/data/mnist_sample/train'),Path('/data/kaushik/.fastai/data/mnist_sample/labels.csv')]
That huge path is a pain to look at so shorten it by setting the BASE_PATH.
# Display paths relative to the dataset root instead of the full absolute path.
Path.BASE_PATH = path
path.ls()
(#3) [Path('valid'),Path('train'),Path('labels.csv')]
What do we have under train?
(path/'train').ls()
(#2) [Path('train/3'),Path('train/7')]
What do we have under the 7?
(path/'train'/'7').ls().sorted()
(#6265) [Path('train/7/10002.png'),Path('train/7/1001.png'),Path('train/7/10014.png'),Path('train/7/10019.png'),Path('train/7/10039.png'),Path('train/7/10046.png'),Path('train/7/10050.png'),Path('train/7/10063.png'),Path('train/7/10077.png'),Path('train/7/10086.png')...]
We have 6,265 images of sevens. Look at one using the PIL library.
# Open the first (sorted) training image of a seven with PIL.
Image.open((path/'train'/'7').ls().sorted()[0])
# Load every training image as a float tensor scaled by 1/255.
seven_tensors = [tensor(Image.open(pic_path)).float()/255. for pic_path in (path/'train'/'7').ls().sorted()]
three_tensors = [tensor(Image.open(pic_path)).float()/255. for pic_path in (path/'train'/'3').ls().sorted()]
len(seven_tensors), len(three_tensors)
(6265, 6131)
Use the fastai convenience function show_image to display the tensor.
show_image(seven_tensors[0])
# Stack the per-image tensors into one rank-3 tensor per digit.
stacked_sevens = torch.stack(seven_tensors)
stacked_threes = torch.stack(three_tensors)
stacked_sevens.shape, stacked_threes.shape
(torch.Size([6265, 28, 28]), torch.Size([6131, 28, 28]))
Assemble train and validation set
Assemble the training data. Each input will be a vector of 784 values.
# Threes come first, then sevens; each 28x28 image is flattened to 784 values.
train_x = torch.cat([stacked_threes, stacked_sevens]).view(-1,28*28)
# Label 1 marks a three and 0 a seven; unsqueeze gives one label column per row.
train_y = tensor([1]*len(three_tensors) + [0]*len(seven_tensors)).unsqueeze(1)
train_x.shape, train_y.shape
(torch.Size([12396, 784]), torch.Size([12396, 1]))
Assemble the validation data
# Load and stack the validation images the same way as the training images.
valid_7_tensors = torch.stack([tensor(Image.open(pic_path)).float()/255. for pic_path in (path/'valid'/'7').ls().sorted()])
valid_3_tensors = torch.stack([tensor(Image.open(pic_path)).float()/255. for pic_path in (path/'valid'/'3').ls().sorted()])
valid_7_tensors.shape, valid_3_tensors.shape
(torch.Size([1028, 28, 28]), torch.Size([1010, 28, 28]))
# Same layout as the training set: threes (label 1) first, then sevens (label 0).
valid_x = torch.cat([valid_3_tensors, valid_7_tensors]).view(-1,28*28)
valid_y = tensor([1]*len(valid_3_tensors) + [0]*len(valid_7_tensors)).unsqueeze(1)
valid_x.shape, valid_y.shape
(torch.Size([2038, 784]), torch.Size([2038, 1]))
Initialize Parameters
def init_params(size, std=1.0):
    """Return a tensor of the given size drawn from a normal distribution.

    The samples are scaled by `std`, and the result has gradient tracking
    switched on so it can be used as a trainable parameter.
    """
    params = torch.randn(size) * std
    params.requires_grad_()
    return params
We have one weight for each pixel in the image. We will also have a bias term.
# One weight per pixel (784 x 1) plus a single bias value.
weights = init_params((28*28,1))
bias = init_params(1)
weights.shape, bias.shape
(torch.Size([784, 1]), torch.Size([1]))
Get Predictions
Calculate the prediction for a single image
# weights is (784, 1), so transpose it before multiplying element-wise with one image.
(train_x[0]*weights.T).sum() + bias
tensor([3.9740], grad_fn=<AddBackward0>)
# Without the transpose, broadcasting (784,) against (784, 1) gives a (784, 784) result.
train_x[0].shape, weights.shape, weights.T.shape, (train_x[0]*weights).shape
(torch.Size([784]),
torch.Size([784, 1]),
torch.Size([1, 784]),
torch.Size([784, 784]))
def linear1(xb):
    """Apply the linear model to a batch: matrix-multiply by `weights`, add `bias`.

    `weights` and `bias` are the module-level parameter tensors.
    """
    return torch.matmul(xb, weights) + bias
# Predictions for the whole training set in a single matrix multiplication.
preds = linear1(train_x)
preds
tensor([[ 3.9740],
[-0.7695],
[ 2.8669],
...,
[-2.1245],
[-6.0483],
[-9.3377]], grad_fn=<AddBackward0>)
Compute Loss
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), computed element-wise."""
    denominator = 1. + torch.exp(-x)
    return 1. / denominator
def mnist_loss(predictions, targets):
    """Return the mean distance between sigmoid(predictions) and the targets.

    `predictions` are raw model outputs and are squashed through a sigmoid
    here. Where the target is 1 the loss is `1 - p`; elsewhere it is `p`.
    """
    preds = predictions.sigmoid()
    return torch.where(targets==1, 1-preds, preds).mean()
# Loss of the freshly initialised linear model over the full training set.
mnist_loss(preds, train_y)
tensor(0.3764, grad_fn=<MeanBackward0>)
Mini-Batches
# Toy dataset of (index, letter) pairs used to illustrate mini-batching.
ds = L(enumerate(string.ascii_lowercase))
ds
(#26) [(0, 'a'),(1, 'b'),(2, 'c'),(3, 'd'),(4, 'e'),(5, 'f'),(6, 'g'),(7, 'h'),(8, 'i'),(9, 'j')...]
Each mini-batch is a tuple of some number of training examples and their corresponding labels
dl = DataLoader(ds,batch_size=5,shuffle=True)
# Show the first batch on its own, then every batch from a full pass.
first(dl), list(dl)
((tensor([ 5, 20, 14, 11, 13]), ('f', 'u', 'o', 'l', 'n')),
[(tensor([ 1, 13, 14, 7, 0]), ('b', 'n', 'o', 'h', 'a')),
(tensor([ 8, 16, 4, 17, 19]), ('i', 'q', 'e', 'r', 't')),
(tensor([11, 6, 2, 20, 15]), ('l', 'g', 'c', 'u', 'p')),
(tensor([ 9, 18, 21, 12, 22]), ('j', 's', 'v', 'm', 'w')),
(tensor([24, 25, 10, 3, 5]), ('y', 'z', 'k', 'd', 'f')),
(tensor([23]), ('x',))])
# Pair each flattened image with its label and batch the pairs 256 at a time.
dset = list(zip(train_x, train_y))
dl = DataLoader(dset, batch_size=256)

valid_dset = list(zip(valid_x, valid_y))
valid_dl = DataLoader(valid_dset, batch_size=256)
Simulate a batch of training examples.
batch = train_x[:4]
batch.shape
torch.Size([4, 784])
preds = linear1(batch)
preds
tensor([[ 3.9740],
[-0.7695],
[ 2.8669],
[-6.0669]], grad_fn=<AddBackward0>)
loss = mnist_loss(preds, train_y[:4])
loss
tensor(0.4383, grad_fn=<MeanBackward0>)
# Backpropagate, then inspect the gradients stored on the parameters.
loss.backward()
weights.grad.shape, weights.grad.mean(), bias.grad
(torch.Size([784, 1]), tensor(-0.0103), tensor([-0.0719]))
Gradient computation
def calc_grad(xb,yb,model):
    """Run `model` on the batch `xb`, score it against `yb`, and backpropagate.

    Returns nothing; the gradients are left on the tensors that `model`
    used to produce its output.
    """
    preds = model(xb)
    loss = mnist_loss(preds, yb)
    loss.backward()
Train
def train_epoch(model, lr, params):
    """Run one pass over the training batches with a hand-written SGD step.

    `model` is evaluated on each batch by `calc_grad`, `params` are the
    tensors updated in place, and `lr` is the step size. Batches are read
    from the module-level `dl`.
    """
    for xb, yb in dl:
        calc_grad(xb,yb,model)
        for p in params:
            # Update through .data so the step itself is not tracked by autograd.
            p.data -= p.grad*lr
            # backward() accumulates into .grad, so clear it before the next batch.
            p.grad.zero_()
Batch Accuracy
def batch_accuracy(preds, yb):
    """Return the fraction of predictions in the batch that match the labels.

    `preds` are raw model outputs. They are passed through a sigmoid here
    and thresholded at 0.5 before being compared with `yb`.
    """
    preds = preds.sigmoid() #note
    correct = (preds > 0.5).float() == yb
    return correct.float().mean()
batch_accuracy(linear1(batch), train_y[:4])
tensor(0.5000)
def validate_epoch(model):
    """Return the validation accuracy of `model`, rounded to 4 decimal places.

    Accuracy is computed per batch of the module-level `valid_dl` and the
    per-batch values are then averaged, so every batch carries equal weight
    regardless of how many rows it holds.
    """
    accs = [batch_accuracy(model(xb), yb) for xb, yb in valid_dl]
    return round(torch.stack(accs).mean().item(), 4)
# Validation accuracy of the linear model with its current weights and bias.
validate_epoch(linear1)
0.6533
Train one epoch
lr = 1

# Start from fresh random parameters so the run is independent of earlier cells.
weights = init_params((28*28,1))
bias = init_params(1)
params = weights, bias

train_epoch(linear1, lr, params)
validate_epoch(linear1)
0.6836
Train multiple epochs
lr = 1

weights = init_params((28*28,1))
bias = init_params(1)
params = weights, bias

# Train for 20 epochs, printing the validation accuracy after each one.
for i in range(20):
    train_epoch(linear1, lr, params)
    print(validate_epoch(linear1), end=' ')
0.6765 0.8387 0.8988 0.9267 0.9374 0.9462 0.954 0.9565 0.9599 0.9618 0.9638 0.9653 0.9667 0.9682 0.9682 0.9706 0.9711 0.9721 0.9721 0.9721
Refactor
Replace the gradient update with a home-grown optimizer
class BasicOptim(nn.Module):
    """Minimal SGD optimizer: steps each parameter against its gradient.

    `params` is any iterable of tensors to optimise; it is materialised into
    a list so it can be walked on every step. `lr` is the step size.
    """

    def __init__(self,params,lr):
        # nn.Module requires its own initialiser to run before the instance is used.
        super().__init__()
        self.params, self.lr = list(params), lr

    def step(self,*args,**kwargs):
        """Apply `p -= p.grad * lr` to every parameter that has a gradient."""
        for p in self.params:
            # zero_grad() sets grads to None; skip those instead of crashing.
            if p.grad is None: continue
            p.data -= p.grad*self.lr

    def zero_grad(self,*args,**kwargs):
        """Drop the stored gradients so the next backward pass starts clean."""
        for p in self.params: p.grad = None
lr = 1

weights = init_params((28*28,1))
bias = init_params(1)
params = weights, bias

# The optimizer now owns the parameter update that train_epoch used to do by hand.
opt = BasicOptim(params, lr)
def train_epoch(model, lr, params):
    """Run one pass over the training batches, stepping the module-level `opt`.

    `lr` and `params` are kept so existing calls still work, but they are
    not used here: the optimizer already holds both.
    """
    for xb, yb in dl:
        calc_grad(xb,yb,model)
        opt.step()
        opt.zero_grad()
for i in range(20):
    train_epoch(linear1, lr, params)
    print(validate_epoch(linear1), end=' ')
0.6269 0.8379 0.9174 0.9409 0.9496 0.9545 0.9589 0.9623 0.9658 0.9672 0.9687 0.9697 0.9692 0.9697 0.9721 0.9721 0.9721 0.9726 0.9726 0.9731
Replace init_params and linear1 with PyTorch nn.Linear
nn.Linear holds both the weights and bias and takes care of initializing the parameters.
linear_model = nn.Linear(28*28,1)
# Unpack the two values yielded by parameters() as the weights and the bias.
weights, bias = linear_model.parameters()
weights.shape, bias.shape
(torch.Size([1, 784]), torch.Size([1]))
lr = 1

linear_model = nn.Linear(28*28,1)
opt = BasicOptim(linear_model.parameters(), lr)
def train_epoch(model, lr, params):
    """Train `model` for one epoch over `dl` using the module-level `opt`.

    The `lr` and `params` arguments are accepted for call compatibility
    only; the optimizer carries the learning rate and the parameters.
    """
    for xb, yb in dl:
        calc_grad(xb,yb,model)
        opt.step()
        opt.zero_grad()
for i in range(20):
    train_epoch(linear_model, lr, params)
    print(validate_epoch(linear_model), end=' ')
0.4932 0.4932 0.6816 0.8687 0.9185 0.936 0.9502 0.958 0.9638 0.9658 0.9678 0.9697 0.9712 0.9741 0.9746 0.9761 0.9765 0.9775 0.9785 0.9785
Replace home grown optimizer with fastai SGD
lr = 1

linear_model = nn.Linear(28*28,1)
# Swap the home-grown optimizer for the SGD provided by fastai.
opt = SGD(linear_model.parameters(), lr)
def train_epoch(model, lr, params):
    """Do one epoch of training on `dl`, delegating the update to the global `opt`.

    `lr` and `params` go unused in the body and remain only so that
    existing call sites keep working.
    """
    for xb, yb in dl:
        calc_grad(xb,yb,model)
        opt.step()
        opt.zero_grad()
for i in range(20):
    train_epoch(linear_model, lr, params)
    print(validate_epoch(linear_model), end=' ')
0.4932 0.8447 0.8398 0.9126 0.9336 0.9478 0.9551 0.9629 0.9658 0.9678 0.9692 0.9717 0.9741 0.9751 0.9761 0.9761 0.977 0.978 0.9785 0.9785
Replace training loop with Fastai Learner.fit
Observe how we are able to pass in the mnist_loss and batch_accuracy functions. Recall that within these functions we pass the predictions through the sigmoid function.
#collapse-output
dls = DataLoaders(dl, valid_dl)
# The Learner ties together the data, model, loss, optimizer and metric.
learn = Learner(dls, nn.Linear(28*28,1), loss_func=mnist_loss
                , opt_func=SGD
                , metrics=batch_accuracy)
learn.fit(20,lr=1)
epoch | train_loss | valid_loss | batch_accuracy | time |
---|---|---|---|---|
0 | 0.637023 | 0.503487 | 0.495584 | 00:00 |
1 | 0.524831 | 0.192524 | 0.839058 | 00:00 |
2 | 0.192777 | 0.178215 | 0.840039 | 00:00 |
3 | 0.084406 | 0.105942 | 0.912169 | 00:00 |
4 | 0.044523 | 0.077318 | 0.933268 | 00:00 |
5 | 0.028958 | 0.061924 | 0.947988 | 00:00 |
6 | 0.022560 | 0.052394 | 0.955839 | 00:00 |
7 | 0.019714 | 0.046067 | 0.962709 | 00:00 |
8 | 0.018272 | 0.041615 | 0.966143 | 00:00 |
9 | 0.017409 | 0.038327 | 0.967125 | 00:00 |
10 | 0.016804 | 0.035799 | 0.969578 | 00:00 |
11 | 0.016330 | 0.033789 | 0.972031 | 00:00 |
12 | 0.015934 | 0.032144 | 0.973503 | 00:00 |
13 | 0.015597 | 0.030771 | 0.974975 | 00:00 |
14 | 0.015305 | 0.029609 | 0.975957 | 00:00 |
15 | 0.015053 | 0.028614 | 0.976938 | 00:00 |
16 | 0.014832 | 0.027755 | 0.977429 | 00:00 |
17 | 0.014637 | 0.027009 | 0.977920 | 00:00 |
18 | 0.014463 | 0.026354 | 0.978410 | 00:00 |
19 | 0.014305 | 0.025776 | 0.978901 | 00:00 |
Add a rectified linear unit
#collapse-output
# Two linear layers with a ReLU between them; the hidden layer has 30 units.
simple_net = nn.Sequential(nn.Linear(28*28,30),
                           nn.ReLU(),
                           nn.Linear(30,1))

learn = Learner(dls, simple_net, loss_func=mnist_loss
                , opt_func=SGD
                , metrics=batch_accuracy)

learn.fit(n_epoch=40, lr=0.1)
epoch | train_loss | valid_loss | batch_accuracy | time |
---|---|---|---|---|
0 | 0.334276 | 0.397611 | 0.510304 | 00:00 |
1 | 0.153756 | 0.235990 | 0.795878 | 00:00 |
2 | 0.084339 | 0.117733 | 0.915113 | 00:00 |
3 | 0.054809 | 0.079118 | 0.940137 | 00:00 |
4 | 0.041189 | 0.061640 | 0.954367 | 00:00 |
5 | 0.034297 | 0.051851 | 0.963690 | 00:00 |
6 | 0.030381 | 0.045687 | 0.965653 | 00:00 |
7 | 0.027859 | 0.041476 | 0.966634 | 00:00 |
8 | 0.026048 | 0.038415 | 0.968106 | 00:00 |
9 | 0.024647 | 0.036075 | 0.969578 | 00:00 |
10 | 0.023509 | 0.034219 | 0.971541 | 00:00 |
11 | 0.022557 | 0.032701 | 0.973013 | 00:00 |
12 | 0.021745 | 0.031426 | 0.973503 | 00:00 |
13 | 0.021041 | 0.030334 | 0.973503 | 00:00 |
14 | 0.020424 | 0.029385 | 0.973994 | 00:00 |
15 | 0.019877 | 0.028547 | 0.975466 | 00:00 |
16 | 0.019388 | 0.027802 | 0.976938 | 00:00 |
17 | 0.018946 | 0.027133 | 0.977920 | 00:00 |
18 | 0.018545 | 0.026529 | 0.977920 | 00:00 |
19 | 0.018177 | 0.025982 | 0.977920 | 00:00 |
20 | 0.017839 | 0.025482 | 0.978901 | 00:00 |
21 | 0.017526 | 0.025026 | 0.978901 | 00:00 |
22 | 0.017234 | 0.024606 | 0.979882 | 00:00 |
23 | 0.016962 | 0.024220 | 0.980373 | 00:00 |
24 | 0.016708 | 0.023863 | 0.981354 | 00:00 |
25 | 0.016468 | 0.023533 | 0.981354 | 00:00 |
26 | 0.016243 | 0.023226 | 0.981354 | 00:00 |
27 | 0.016030 | 0.022941 | 0.981354 | 00:00 |
28 | 0.015829 | 0.022676 | 0.981845 | 00:00 |
29 | 0.015638 | 0.022429 | 0.981845 | 00:00 |
30 | 0.015457 | 0.022199 | 0.982826 | 00:00 |
31 | 0.015285 | 0.021983 | 0.982826 | 00:00 |
32 | 0.015121 | 0.021782 | 0.982826 | 00:00 |
33 | 0.014964 | 0.021593 | 0.983317 | 00:00 |
34 | 0.014815 | 0.021416 | 0.982826 | 00:00 |
35 | 0.014672 | 0.021249 | 0.982826 | 00:00 |
36 | 0.014535 | 0.021091 | 0.982336 | 00:00 |
37 | 0.014403 | 0.020943 | 0.982336 | 00:00 |
38 | 0.014276 | 0.020802 | 0.982336 | 00:00 |
39 | 0.014154 | 0.020669 | 0.982336 | 00:00 |
learn.recorder records the output from the training process. The three items recorded here are train_loss, valid_loss and batch_accuracy
learn.recorder.values[:2]
[(#3) [0.334276407957077,0.39761149883270264,0.5103042125701904],
(#3) [0.1537560522556305,0.2359904646873474,0.7958782911300659]]
Plot how the accuracy evolved during the training.
# Index 2 of each recorded row is batch_accuracy.
plt.plot(L(learn.recorder.values).itemgot(2))
The final accuracy is as below:
learn.recorder.values[-1][2]
0.98233562707901