AML: 01 Deep Neural Network for MNIST

Based on https://github.com/Atcold/pytorch-Deep-Learning

Data and Libraries

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy
import random

# Seed the pseudo-random generators of PyTorch, Python's `random` module and
# NumPy with the same value for better reproducibility.
# Background: https://pytorch.org/docs/stable/notes/randomness.html
for _seed_fn in (torch.manual_seed, random.seed, numpy.random.seed):
    _seed_fn(99)

# 'device' is where the models are trained: the first CUDA GPU when one is
# available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0

Load the MNIST dataset

Observe that we set shuffle=True for both loaders, so the samples are served in a new random order on every pass over the data.

input_size = 28 * 28   # each image has 28x28 pixels
output_size = 10       # ten classes


def _mnist_transform():
    """Build the preprocessing pipeline used by both data splits."""
    return transforms.Compose([
        transforms.ToTensor(),
        # Normalize(mean, std) with the same constants for train and test
        transforms.Normalize((0.1307,), (0.3081,)),
    ])


# Training split: downloaded if missing, served in shuffled batches of 64.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=_mnist_transform()),
    batch_size=64,
    shuffle=True,
)

# Test split: served in shuffled batches of 1000.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=_mnist_transform()),
    batch_size=1000,
    shuffle=True,
)

# Display name of each class index (the digits 0..9).
classNames = list(range(10))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz

Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz

Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz

Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz

Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw
Processing...
Done!

/opt/conda/lib/python3.7/site-packages/torchvision/datasets/mnist.py:480: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at  /pytorch/torch/csrc/utils/tensor_numpy.cpp:141.)
  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)

# draw a normalized image tensor on a matplotlib target
def imshow(img, plot, mean=0.1307, std=0.3081):
    """Undo the input normalization and draw an image tensor.

    Args:
        img: image tensor indexed as (channels, height, width); see the
            transpose below.
        plot: any object exposing an ``imshow`` method (``plt`` or an Axes).
        mean: mean subtracted by the normalization transform. The default
            matches the ``Normalize((0.1307,), (0.3081,))`` used by the
            data loaders in this file.
        std: standard deviation the data was divided by.
    """
    # Invert x_norm = (x - mean) / std.
    img = img * std + mean
    # detach/cpu so tensors that track gradients or live on a GPU also work
    npimg = img.detach().cpu().numpy()
    # move the channel axis last: (C, H, W) -> (H, W, C)
    plot.imshow(numpy.transpose(npimg, (1, 2, 0)))
    
# Preview the first images of one training batch in a 2 x 10 grid.
plt.figure(figsize=(16, 4))

# The loader shuffles, so this batch is a RANDOM sample of training images.
image_batch, label_batch = next(iter(train_loader))

n_rows, n_cols = 2, 10
for i in range(n_rows * n_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    image, label = image_batch[i], label_batch[i].item()
    # the pyplot module is passed as the drawing target
    imshow(image, plt)
    plt.axis('off')
    plt.title(classNames[label])

A Fully Connected Neural Network with 3 hidden layers

Helper functions for training and testing

# function to count number of parameters
def get_n_params(model):
    """Return the total number of scalar entries over all parameters of ``model``."""
    return sum(tensor.nelement() for tensor in model.parameters())

# test accuracies (in percent) are appended here by test()
accuracy_list = []


# The trainer receives a model object and trains it for one epoch.
def train(epoch, model):
    """Train ``model`` for one pass over ``train_loader``.

    Relies on the module-level ``optimizer``, ``train_loader`` and
    ``device``. ``epoch`` is only used to label the progress messages.
    """
    model.train()
    n_batches = len(train_loader)
    n_samples = len(train_loader.dataset)
    for step, (inputs, labels) in enumerate(train_loader):
        # send to device
        inputs = inputs.to(device)
        labels = labels.to(device)

        # clear old gradients, forward pass, backward pass, parameter update
        optimizer.zero_grad()
        loss = F.nll_loss(model(inputs), labels)
        loss.backward()
        optimizer.step()

        # report progress every 100 batches only
        if step % 100 != 0:
            continue
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(inputs), n_samples,
            100. * step / n_batches, loss.item()))
            
def test(model):
    """Evaluate ``model`` on ``test_loader`` and record its accuracy.

    Switches the model to evaluation mode, accumulates the summed negative
    log-likelihood and the number of correct predictions over all test
    batches, appends the accuracy (in percent) to the module-level
    ``accuracy_list`` and prints a summary line. Relies on the module-level
    ``test_loader`` and ``device``.
    """
    model.eval()
    test_loss = 0
    correct = 0
    # Evaluation needs no gradients; disabling autograd avoids building the
    # graph for every batch.
    with torch.no_grad():
        for data, target in test_loader:
            # send to device
            data, target = data.to(device), target.to(device)

            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    accuracy_list.append(accuracy)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        accuracy))

Defining the Fully Connected Network

class FC2Layer(nn.Module):
    """Fully connected classifier that outputs per-class log-probabilities.

    The input is flattened to ``input_size`` features and passed through
    Linear layers of width 50, 50 and 30, each followed by ReLU, then a final
    Linear layer to ``output_size`` and a LogSoftmax over dimension 1.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.input_size = input_size
        widths = (input_size, 50, 50, 30)
        layers = []
        # hidden stack: Linear + ReLU for each consecutive pair of widths
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # output head
        layers.append(nn.Linear(widths[-1], output_size))
        layers.append(nn.LogSoftmax(dim=1))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` into rows of ``input_size`` features and classify them."""
        flat = x.view(-1, self.input_size)
        return self.network(flat)

Train the Network

# Build the plain fully connected model on the training device.
print("Training on ", device)
model_fnn = FC2Layer(input_size, output_size).to(device)

# train() picks up this module-level optimizer.
optimizer = optim.SGD(model_fnn.parameters(), lr=0.1)
print('Number of parameters: {}'.format(get_n_params(model_fnn)))

# Alternate one epoch of training with one pass over the test set.
n_epochs = 5
for epoch in range(n_epochs):
    train(epoch, model_fnn)
    test(model_fnn)

Training on  cuda:0
Number of parameters: 43640
Train Epoch: 0 [0/60000 (0%)]	Loss: 2.313424
Train Epoch: 0 [6400/60000 (11%)]	Loss: 0.470507
Train Epoch: 0 [12800/60000 (21%)]	Loss: 0.653937
Train Epoch: 0 [19200/60000 (32%)]	Loss: 0.937303
Train Epoch: 0 [25600/60000 (43%)]	Loss: 0.222327
Train Epoch: 0 [32000/60000 (53%)]	Loss: 0.112687
Train Epoch: 0 [38400/60000 (64%)]	Loss: 0.167305
Train Epoch: 0 [44800/60000 (75%)]	Loss: 0.044606
Train Epoch: 0 [51200/60000 (85%)]	Loss: 0.070055
Train Epoch: 0 [57600/60000 (96%)]	Loss: 0.152104

Test set: Average loss: 0.1709, Accuracy: 9470/10000 (95%)

Train Epoch: 1 [0/60000 (0%)]	Loss: 0.225331
Train Epoch: 1 [6400/60000 (11%)]	Loss: 0.059543
Train Epoch: 1 [12800/60000 (21%)]	Loss: 0.216941
Train Epoch: 1 [19200/60000 (32%)]	Loss: 0.212808
Train Epoch: 1 [25600/60000 (43%)]	Loss: 0.190679
Train Epoch: 1 [32000/60000 (53%)]	Loss: 0.044835
Train Epoch: 1 [38400/60000 (64%)]	Loss: 0.084212
Train Epoch: 1 [44800/60000 (75%)]	Loss: 0.049855
Train Epoch: 1 [51200/60000 (85%)]	Loss: 0.067796
Train Epoch: 1 [57600/60000 (96%)]	Loss: 0.032366

Test set: Average loss: 0.1239, Accuracy: 9630/10000 (96%)

Train Epoch: 2 [0/60000 (0%)]	Loss: 0.085940
Train Epoch: 2 [6400/60000 (11%)]	Loss: 0.034014
Train Epoch: 2 [12800/60000 (21%)]	Loss: 0.131373
Train Epoch: 2 [19200/60000 (32%)]	Loss: 0.037048
Train Epoch: 2 [25600/60000 (43%)]	Loss: 0.020907
Train Epoch: 2 [32000/60000 (53%)]	Loss: 0.123831
Train Epoch: 2 [38400/60000 (64%)]	Loss: 0.064939
Train Epoch: 2 [44800/60000 (75%)]	Loss: 0.080890
Train Epoch: 2 [51200/60000 (85%)]	Loss: 0.047017
Train Epoch: 2 [57600/60000 (96%)]	Loss: 0.149251

Test set: Average loss: 0.1010, Accuracy: 9696/10000 (97%)

Train Epoch: 3 [0/60000 (0%)]	Loss: 0.085104
Train Epoch: 3 [6400/60000 (11%)]	Loss: 0.135341
Train Epoch: 3 [12800/60000 (21%)]	Loss: 0.097336
Train Epoch: 3 [19200/60000 (32%)]	Loss: 0.057110
Train Epoch: 3 [25600/60000 (43%)]	Loss: 0.034733
Train Epoch: 3 [32000/60000 (53%)]	Loss: 0.060860
Train Epoch: 3 [38400/60000 (64%)]	Loss: 0.051481
Train Epoch: 3 [44800/60000 (75%)]	Loss: 0.073642
Train Epoch: 3 [51200/60000 (85%)]	Loss: 0.124473
Train Epoch: 3 [57600/60000 (96%)]	Loss: 0.043673

Test set: Average loss: 0.0898, Accuracy: 9718/10000 (97%)

Train Epoch: 4 [0/60000 (0%)]	Loss: 0.041336
Train Epoch: 4 [6400/60000 (11%)]	Loss: 0.076606
Train Epoch: 4 [12800/60000 (21%)]	Loss: 0.072943
Train Epoch: 4 [19200/60000 (32%)]	Loss: 0.087108
Train Epoch: 4 [25600/60000 (43%)]	Loss: 0.024005
Train Epoch: 4 [32000/60000 (53%)]	Loss: 0.080626
Train Epoch: 4 [38400/60000 (64%)]	Loss: 0.029655
Train Epoch: 4 [44800/60000 (75%)]	Loss: 0.061492
Train Epoch: 4 [51200/60000 (85%)]	Loss: 0.272095
Train Epoch: 4 [57600/60000 (96%)]	Loss: 0.047510

Test set: Average loss: 0.0846, Accuracy: 9742/10000 (97%)

Show some predictions of the test network

def visualize_pred(img, pred_prob, real_label):
    """Show an image next to a bar chart of its predicted class probabilities.

    Args:
        img: image tensor, drawn with the module-level ``imshow`` helper.
        pred_prob: 1-D array-like holding one probability per class.
        real_label: index of the true class in ``classNames``.
    """
    # derive the number of classes from the input instead of assuming 10
    n_classes = len(pred_prob)
    class_ids = numpy.arange(n_classes)
    pred_label = int(numpy.argmax(pred_prob))

    _, (ax1, ax2) = plt.subplots(figsize=(6, 9), ncols=2)

    # left panel: the image, titled with the true and the predicted class
    imshow(img, ax1)
    ax1.axis('off')
    ax1.set_title('true: {} / predicted: {}'.format(
        classNames[real_label], classNames[pred_label]))

    # right panel: horizontal bars, one per class
    ax2.barh(class_ids, pred_prob)
    ax2.set_aspect(0.1)
    ax2.set_yticks(class_ids)
    ax2.set_yticklabels(class_ids)
    ax2.set_title('Prediction Probability')
    ax2.set_xlim(0, 1.1)
    plt.tight_layout()

# The test batch below stays on the CPU, so move the model there as well.
model_fnn.to('cpu')

# fetch one batch of test images
image_batch, label_batch = next(iter(test_loader))

# Gradients are not needed for inference; turning them off speeds this up.
with torch.no_grad():
    log_pred_prob_batch = model_fnn(image_batch)

# Visualize the first 10 samples of the batch.
for i in range(10):
    img, real_label = image_batch[i], label_batch[i].item()
    log_pred_prob = log_pred_prob_batch[i]
    # The network outputs log-probabilities; exponentiate to get probabilities.
    pred_prob = log_pred_prob.exp().numpy().squeeze()
    visualize_pred(img, pred_prob, real_label)

Network with Dropout

class FC2LayerDropout(nn.Module):
    """Fully connected classifier with dropout after its first two Linear layers.

    Same layer widths as ``FC2Layer`` (50, 50, 30, then ``output_size``); the
    first two Linear layers are each followed by ``Dropout(0.2)`` and ReLU.
    The output is a LogSoftmax over dimension 1.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.input_size = input_size
        layers = []
        # two blocks of Linear -> Dropout -> ReLU
        for fan_in, fan_out in ((input_size, 50), (50, 50)):
            layers += [nn.Linear(fan_in, fan_out), nn.Dropout(0.2), nn.ReLU()]
        # last hidden layer (no dropout) and the output head
        layers += [
            nn.Linear(50, 30),
            nn.ReLU(),
            nn.Linear(30, output_size),
            nn.LogSoftmax(dim=1),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` into rows of ``input_size`` features and classify them."""
        flat = x.view(-1, self.input_size)
        return self.network(flat)

# Build the dropout variant on the training device.
print("With Dropout Training on ", device)
model = FC2LayerDropout(input_size, output_size).to(device)

# train() picks up this module-level optimizer.
optimizer = optim.SGD(model.parameters(), lr=0.1)
print('Number of parameters: {}'.format(get_n_params(model)))

n_epochs = 5
for epoch in range(n_epochs):
    # training mode: turns on dropout, batch-norm etc. during training
    model.train()
    train(epoch, model)
    # evaluation mode: turns off dropout, batch-norm etc. during validation/testing
    model.eval()
    test(model)

With Dropout Training on  cuda:0
Number of parameters: 43640
Train Epoch: 0 [0/60000 (0%)]	Loss: 2.305173
Train Epoch: 0 [6400/60000 (11%)]	Loss: 0.993270
Train Epoch: 0 [12800/60000 (21%)]	Loss: 0.736671
Train Epoch: 0 [19200/60000 (32%)]	Loss: 0.361618
Train Epoch: 0 [25600/60000 (43%)]	Loss: 0.357337
Train Epoch: 0 [32000/60000 (53%)]	Loss: 0.377909
Train Epoch: 0 [38400/60000 (64%)]	Loss: 0.311607
Train Epoch: 0 [44800/60000 (75%)]	Loss: 0.349206
Train Epoch: 0 [51200/60000 (85%)]	Loss: 0.239577
Train Epoch: 0 [57600/60000 (96%)]	Loss: 0.118508

Test set: Average loss: 0.2089, Accuracy: 9356/10000 (94%)

Train Epoch: 1 [0/60000 (0%)]	Loss: 0.268046
Train Epoch: 1 [6400/60000 (11%)]	Loss: 0.218968
Train Epoch: 1 [12800/60000 (21%)]	Loss: 0.348738
Train Epoch: 1 [19200/60000 (32%)]	Loss: 0.297094
Train Epoch: 1 [25600/60000 (43%)]	Loss: 0.195412
Train Epoch: 1 [32000/60000 (53%)]	Loss: 0.168145
Train Epoch: 1 [38400/60000 (64%)]	Loss: 0.320052
Train Epoch: 1 [44800/60000 (75%)]	Loss: 0.223455
Train Epoch: 1 [51200/60000 (85%)]	Loss: 0.063304
Train Epoch: 1 [57600/60000 (96%)]	Loss: 0.257579

Test set: Average loss: 0.1411, Accuracy: 9572/10000 (96%)

Train Epoch: 2 [0/60000 (0%)]	Loss: 0.252719
Train Epoch: 2 [6400/60000 (11%)]	Loss: 0.267130
Train Epoch: 2 [12800/60000 (21%)]	Loss: 0.042713
Train Epoch: 2 [19200/60000 (32%)]	Loss: 0.120674
Train Epoch: 2 [25600/60000 (43%)]	Loss: 0.289570
Train Epoch: 2 [32000/60000 (53%)]	Loss: 0.403999
Train Epoch: 2 [38400/60000 (64%)]	Loss: 0.234887
Train Epoch: 2 [44800/60000 (75%)]	Loss: 0.281310
Train Epoch: 2 [51200/60000 (85%)]	Loss: 0.232480
Train Epoch: 2 [57600/60000 (96%)]	Loss: 0.308880

Test set: Average loss: 0.1443, Accuracy: 9564/10000 (96%)

Train Epoch: 3 [0/60000 (0%)]	Loss: 0.155252
Train Epoch: 3 [6400/60000 (11%)]	Loss: 0.048896
Train Epoch: 3 [12800/60000 (21%)]	Loss: 0.104077
Train Epoch: 3 [19200/60000 (32%)]	Loss: 0.078637
Train Epoch: 3 [25600/60000 (43%)]	Loss: 0.312654
Train Epoch: 3 [32000/60000 (53%)]	Loss: 0.683030
Train Epoch: 3 [38400/60000 (64%)]	Loss: 0.134421
Train Epoch: 3 [44800/60000 (75%)]	Loss: 0.135105
Train Epoch: 3 [51200/60000 (85%)]	Loss: 0.120443
Train Epoch: 3 [57600/60000 (96%)]	Loss: 0.172836

Test set: Average loss: 0.1236, Accuracy: 9633/10000 (96%)

Train Epoch: 4 [0/60000 (0%)]	Loss: 0.231828
Train Epoch: 4 [6400/60000 (11%)]	Loss: 0.155570
Train Epoch: 4 [12800/60000 (21%)]	Loss: 0.168062
Train Epoch: 4 [19200/60000 (32%)]	Loss: 0.273018
Train Epoch: 4 [25600/60000 (43%)]	Loss: 0.040724
Train Epoch: 4 [32000/60000 (53%)]	Loss: 0.279445
Train Epoch: 4 [38400/60000 (64%)]	Loss: 0.135829
Train Epoch: 4 [44800/60000 (75%)]	Loss: 0.172820
Train Epoch: 4 [51200/60000 (85%)]	Loss: 0.085956
Train Epoch: 4 [57600/60000 (96%)]	Loss: 0.076415

Test set: Average loss: 0.1232, Accuracy: 9630/10000 (96%)

# The test batch below stays on the CPU, so move the model there as well.
model.to('cpu')

# fetch one batch of test images
image_batch, label_batch = next(iter(test_loader))

# Gradients are not needed for inference; turning them off speeds this up.
with torch.no_grad():
    log_pred_prob_batch = model(image_batch)

# Visualize the first 10 samples of the batch.
for i in range(10):
    img, real_label = image_batch[i], label_batch[i].item()
    log_pred_prob = log_pred_prob_batch[i]
    # The network outputs log-probabilities; exponentiate to get probabilities.
    pred_prob = log_pred_prob.exp().numpy().squeeze()
    visualize_pred(img, pred_prob, real_label)

Does the Fully Connected Network use "Visual Information"?

# One fixed random permutation of the 784 pixel positions; the same
# permutation is applied to all images.
fixed_perm = torch.randperm(784)

plt.figure(figsize=(8, 8))

# fetch a batch of train images; RANDOM because the loader shuffles
image_batch, label_batch = next(iter(train_loader))

# 3 x 4 grid: each original image is followed by its scrambled version.
for i in range(6):
    image = image_batch[i]
    label = label_batch[i].item()

    # flatten, reorder the pixels, restore the image shape
    flat = image.view(-1, 28 * 28).clone()
    image_perm = flat[:, fixed_perm].view(-1, 1, 28, 28)

    for offset, picture in ((1, image), (2, image_perm)):
        plt.subplot(3, 4, 2 * i + offset)
        plt.imshow(picture.squeeze().numpy())
        plt.axis('off')
        plt.title(label)

# start a fresh accuracy history for the scrambled-pixel experiment
accuracy_list = []


def scramble_train(epoch, model, perm=None):
    """Train ``model`` for one epoch on pixel-permuted images.

    Relies on the module-level ``optimizer``, ``train_loader`` and
    ``device``.

    Args:
        epoch: epoch number, only used to label the progress messages.
        model: the network to train.
        perm: 1-D index tensor of length 784 giving the new pixel order.
            ``None`` (the default) means the identity permutation.
    """
    if perm is None:
        perm = torch.arange(0, 784).long()
    # Move the index tensor to the device once rather than letting every
    # batch trigger an implicit transfer.
    perm = perm.to(device)

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        # send to device
        data, target = data.to(device), target.to(device)

        # permute pixels: flatten, reorder columns, restore the image shape
        data = data.view(-1, 28*28)
        data = data[:, perm]
        data = data.view(-1, 1, 28, 28)

        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        # report progress every 100 batches
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            
def scramble_test(model, perm=None):
    """Evaluate ``model`` on pixel-permuted test images and record its accuracy.

    Appends the accuracy (in percent) to the module-level ``accuracy_list``
    and prints a summary line. Relies on the module-level ``test_loader``
    and ``device``.

    Args:
        model: the network to evaluate.
        perm: 1-D index tensor of length 784 giving the new pixel order.
            ``None`` (the default) means the identity permutation.
    """
    if perm is None:
        perm = torch.arange(0, 784).long()
    # Move the index tensor to the device once rather than per batch.
    perm = perm.to(device)

    model.eval()
    test_loss = 0
    correct = 0
    # Evaluation needs no gradients; disabling autograd avoids building the
    # graph for every batch.
    with torch.no_grad():
        for data, target in test_loader:
            # send to device
            data, target = data.to(device), target.to(device)

            # permute pixels: flatten, reorder columns, restore the image shape
            data = data.view(-1, 28*28)
            data = data[:, perm]
            data = data.view(-1, 1, 28, 28)

            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    accuracy_list.append(accuracy)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        accuracy))

# Train a fresh fully connected model on the scrambled images.
print("Training on ", device)
model_fnn_2 = FC2Layer(input_size, output_size).to(device)

# scramble_train() picks up this module-level optimizer.
optimizer = optim.SGD(model_fnn_2.parameters(), lr=0.1)
print('Number of parameters: {}'.format(get_n_params(model_fnn_2)))

# The same fixed permutation is used for training and for testing.
n_epochs = 5
for epoch in range(n_epochs):
    scramble_train(epoch, model_fnn_2, fixed_perm)
    scramble_test(model_fnn_2, fixed_perm)

Training on  cuda:0
Number of parameters: 43640
Train Epoch: 0 [0/60000 (0%)]	Loss: 2.301478
Train Epoch: 0 [6400/60000 (11%)]	Loss: 0.459582
Train Epoch: 0 [12800/60000 (21%)]	Loss: 0.339760
Train Epoch: 0 [19200/60000 (32%)]	Loss: 0.477881
Train Epoch: 0 [25600/60000 (43%)]	Loss: 0.221891
Train Epoch: 0 [32000/60000 (53%)]	Loss: 0.118870
Train Epoch: 0 [38400/60000 (64%)]	Loss: 0.116811
Train Epoch: 0 [44800/60000 (75%)]	Loss: 0.163043
Train Epoch: 0 [51200/60000 (85%)]	Loss: 0.183871
Train Epoch: 0 [57600/60000 (96%)]	Loss: 0.139158

Test set: Average loss: 0.1798, Accuracy: 9421/10000 (94%)

Train Epoch: 1 [0/60000 (0%)]	Loss: 0.237787
Train Epoch: 1 [6400/60000 (11%)]	Loss: 0.094658
Train Epoch: 1 [12800/60000 (21%)]	Loss: 0.180873
Train Epoch: 1 [19200/60000 (32%)]	Loss: 0.081074
Train Epoch: 1 [25600/60000 (43%)]	Loss: 0.128215
Train Epoch: 1 [32000/60000 (53%)]	Loss: 0.112602
Train Epoch: 1 [38400/60000 (64%)]	Loss: 0.051747
Train Epoch: 1 [44800/60000 (75%)]	Loss: 0.227443
Train Epoch: 1 [51200/60000 (85%)]	Loss: 0.116866
Train Epoch: 1 [57600/60000 (96%)]	Loss: 0.078934

Test set: Average loss: 0.1331, Accuracy: 9600/10000 (96%)

Train Epoch: 2 [0/60000 (0%)]	Loss: 0.070105
Train Epoch: 2 [6400/60000 (11%)]	Loss: 0.104417
Train Epoch: 2 [12800/60000 (21%)]	Loss: 0.132643
Train Epoch: 2 [19200/60000 (32%)]	Loss: 0.098219
Train Epoch: 2 [25600/60000 (43%)]	Loss: 0.036265
Train Epoch: 2 [32000/60000 (53%)]	Loss: 0.077203
Train Epoch: 2 [38400/60000 (64%)]	Loss: 0.118948
Train Epoch: 2 [44800/60000 (75%)]	Loss: 0.117086
Train Epoch: 2 [51200/60000 (85%)]	Loss: 0.085482
Train Epoch: 2 [57600/60000 (96%)]	Loss: 0.158125

Test set: Average loss: 0.1035, Accuracy: 9669/10000 (97%)

Train Epoch: 3 [0/60000 (0%)]	Loss: 0.065813
Train Epoch: 3 [6400/60000 (11%)]	Loss: 0.035037
Train Epoch: 3 [12800/60000 (21%)]	Loss: 0.036673
Train Epoch: 3 [19200/60000 (32%)]	Loss: 0.031205
Train Epoch: 3 [25600/60000 (43%)]	Loss: 0.050951
Train Epoch: 3 [32000/60000 (53%)]	Loss: 0.067727
Train Epoch: 3 [38400/60000 (64%)]	Loss: 0.040516
Train Epoch: 3 [44800/60000 (75%)]	Loss: 0.128573
Train Epoch: 3 [51200/60000 (85%)]	Loss: 0.023336
Train Epoch: 3 [57600/60000 (96%)]	Loss: 0.088950

Test set: Average loss: 0.1183, Accuracy: 9638/10000 (96%)

Train Epoch: 4 [0/60000 (0%)]	Loss: 0.024631
Train Epoch: 4 [6400/60000 (11%)]	Loss: 0.078783
Train Epoch: 4 [12800/60000 (21%)]	Loss: 0.018310
Train Epoch: 4 [19200/60000 (32%)]	Loss: 0.069943
Train Epoch: 4 [25600/60000 (43%)]	Loss: 0.116712
Train Epoch: 4 [32000/60000 (53%)]	Loss: 0.029171
Train Epoch: 4 [38400/60000 (64%)]	Loss: 0.020269
Train Epoch: 4 [44800/60000 (75%)]	Loss: 0.041994
Train Epoch: 4 [51200/60000 (85%)]	Loss: 0.107654
Train Epoch: 4 [57600/60000 (96%)]	Loss: 0.042692

Test set: Average loss: 0.1173, Accuracy: 9635/10000 (96%)

# The test batch below stays on the CPU, so move the model there as well.
model_fnn_2.to('cpu')

# fetch one batch of test images and scramble it with the fixed permutation
image_batch, label_batch = next(iter(test_loader))
flat_batch = image_batch.view(-1, 28 * 28)
image_batch_scramble = flat_batch[:, fixed_perm].view(-1, 1, 28, 28)

# Gradients are not needed for inference; turning them off speeds this up.
with torch.no_grad():
    log_pred_prob_batch = model_fnn_2(image_batch_scramble)

# Visualize the first 10 samples; the scrambled image is the one displayed.
for i in range(10):
    img = image_batch[i]  # unscrambled original, not displayed
    img_perm = image_batch_scramble[i]
    real_label = label_batch[i].item()
    log_pred_prob = log_pred_prob_batch[i]
    # The network outputs log-probabilities; exponentiate to get probabilities.
    pred_prob = log_pred_prob.exp().numpy().squeeze()
    visualize_pred(img_perm, pred_prob, real_label)