makemore_WaveNet¶


  • Inspired by Andrej Karpathy's "Building makemore Part 5: Building a WaveNet"
  • Supplementary links
    • "WaveNet" 2016 paper from DeepMind
    • Bengio et al. 2003 MLP language model paper (pdf)
    • Makemore Part 5 - Building a WaveNet - Anson Chu
      • excellent explanation for the premise behind FlattenConsecutive
    • Shout-out to Elena Voita's blog on convolutional neural networks for text.

Table of Contents¶


  • 0. Makemore: Introduction
  • 1. Multilayer Perceptron (MLP) Internals
    • 1.1. Starter Code
    • 1.2. Fixing Learning Rate Plot
    • 1.3. PyTorch-ifying our Code
  • 2. WaveNet
    • 2.1. Overview
    • 2.2. Case 1: Set Context Size to 8
    • 2.3. WaveNet Implementation: Hierarchical Tree
    • 2.4. WaveNet: Training --> 1st Pass
    • 2.5. Fixing BatchNorm1d Bug
    • 2.6. WaveNet: Training --> 2nd Pass
    • 2.7. WaveNet: Scaling Up
  • 3. Summary
    • 3.1. Experimental Harness
    • 3.2. WaveNet but with "Dilated Causal Convolutions"
    • 3.3. torch.nn
    • 3.4. Development Process of Building Deep Neural Nets (DNNs)
    • 3.5. Going Forward: Improvements !!!
  • 4. Conclusion

Appendix¶


Figures¶

  • A1. WaveNet Architecture: Visualization of a Stack of Dilated Causal Convolutional Layers.

Tables¶

  • B1. Model Performance Log for Different Scenarios.

Definitions/Explanations¶

  • C1. PyTorch Containers
  • C2. Dilated Convolutions
  • C3. Convolutions: Kernel, Stride, Padding, Bias

Exercises¶

References¶




0. Makemore: Introduction¶


Makemore takes one text file as input, where each line is assumed to be one training thing, and generates more things like it. Under the hood, it is an autoregressive character-level language model, with a wide choice of models from bigrams all the way to a Transformer (exactly as seen in GPT). Autoregressive here means that each output character is predicted conditioned on the characters that came before it, one step at a time. For example, we can feed it a database of names, and makemore will generate cool baby name ideas that all sound name-like, but are not already existing names. Or if we feed it a database of company names, then we can generate new ideas for a company name. Or we can just feed it valid Scrabble words and generate English-like babble.

"As the name suggests, makemore makes more."

This is not meant to be too heavyweight of a library with a billion switches and knobs. It is one hackable file, and is mostly intended for educational purposes. PyTorch is the only requirement.

Current implementation follows a few key papers:

  • Bigram (one character predicts the next one with a lookup table of counts)
  • MLP, following Bengio et al. 2003
  • CNN, following DeepMind WaveNet 2016 (in progress...)
  • RNN, following Mikolov et al. 2010
  • LSTM, following Graves et al. 2014
  • GRU, following Kyunghyun Cho et al. 2014
  • Transformer, following Vaswani et al. 2017

In the 3rd makemore tutorial notebook, we covered key concepts for training neural networks, including forward pass activations, backward pass gradients, and batch normalization, with a focus on implementing and optimizing MLPs and ResNets in PyTorch. We also introduced diagnostic tools to monitor neural network health and highlighted important considerations like proper weight initialization and learning rate selection.

In the 4th makemore notebook, which was an aside/detour, we implemented backpropagation manually to calculate the gradients for all the parameters of a 2-layer MLP neural network. Essentially, we built the full backpropagation through the neural network by hand, step by step (from the cross entropy loss, through the 2nd Linear layer, tanh layer, batchnorm layer, and 1st Linear layer, down to the embedding table), instead of calling PyTorch autograd's loss.backward() function. This exercise was meant to deepen our understanding of the magic of loss.backward() and of backpropagation in general.

In this notebook, we will continue from where we left off in the 3rd notebook of the makemore series. We will take the 2-layer MLP from part 3 of this series and make it deeper with a tree-like structure, arriving at a convolutional neural network (CNN) architecture similar to the one employed in the WaveNet paper (2016) from Google DeepMind. In the WaveNet paper, the same hierarchical architecture is implemented more efficiently using causal dilated convolutions (not yet covered). Along the way, we get a better sense of torch.nn and what it is and how it works under the hood, and what a typical deep learning development process looks like (a lot of reading of documentation, keeping track of multidimensional tensor shapes, moving between jupyter notebooks and repository code, ...).




1. Multilayer Perceptron (MLP) Internals¶


1.1. Starter Code¶


Let's recap the MLP model we implemented in part 2 (MLP) of the makemore series.

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
In [2]:
# read in all the words
words = open('../data/names.txt', 'r').read().splitlines()
print(len(words))
print(max(len(w) for w in words))
print(words[:8])
32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
In [3]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27
In [4]:
# shuffle up the words
import random
random.seed(42)
random.shuffle(words)
In [5]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    X, Y = [], []

    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix] # crop and append

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y


n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%
torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])
In [6]:
for x,y in zip(Xtr[:20], Ytr[:20]):
    print(''.join(itos[ix.item()] for ix in x), '-->', itos[y.item()])
... --> y
..y --> u
.yu --> h
yuh --> e
uhe --> n
hen --> g
eng --> .
... --> d
..d --> i
.di --> o
dio --> n
ion --> d
ond --> r
ndr --> e
dre --> .
... --> x
..x --> a
.xa --> v
xav --> i
avi --> e
In [7]:
# Near copy paste of the layers we have developed in Part 3

# -----------------------------------------------------------------------------------------------
class Linear:
  
    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out)) / fan_in**0.5 # note: kaiming init
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])

# -----------------------------------------------------------------------------------------------
class BatchNorm1d:

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (trained with a running 'momentum update')
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        # calculate the forward pass
        if self.training:
            xmean = x.mean(0, keepdim=True) # batch mean
            xvar = x.var(0, keepdim=True) # batch variance
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        # update the buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

# -----------------------------------------------------------------------------------------------
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    def parameters(self):
        return []

With the above code, we can now build the actual network structure. We'll do so just like before, with

  • an embedding layer of size (27, 10),
  • a hidden layer of size (30, 200),
  • a Batch Normalization layer across the hidden layer output's second dimension,
  • a Tanh activation function,
  • a linear layer of size (200, 27)
In [8]:
torch.manual_seed(42); # seed rng for reproducibility
In [9]:
# MLP 
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

C  = torch.randn((vocab_size, n_embd))   
layers = [
  Linear(n_embd * block_size, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  Linear(n_hidden, vocab_size),
]

# parameter init
with torch.no_grad():
    # last layer: make less confident
    layers[-1].weight *= 0.1

parameters = [C] + [p for layer in layers for p in layer.parameters()]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
    p.requires_grad = True
12097
In [10]:
# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
    
    # forward pass
    emb = C[Xb] # embed the characters into vectors
    x = emb.view(emb.shape[0], -1) # concatenate the vectors
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, Yb) # loss function
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update simple SGD
    lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    
#     if i >= 1000:
#         break # AFTER_DEBUG: would take out obviously to run full optimization
      0/ 200000: 3.2966
  10000/ 200000: 2.2322
  20000/ 200000: 2.4111
  30000/ 200000: 2.1004
  40000/ 200000: 2.3157
  50000/ 200000: 2.2104
  60000/ 200000: 1.9653
  70000/ 200000: 1.9767
  80000/ 200000: 2.6738
  90000/ 200000: 2.0837
 100000/ 200000: 2.2730
 110000/ 200000: 1.7491
 120000/ 200000: 2.2891
 130000/ 200000: 2.3443
 140000/ 200000: 2.1731
 150000/ 200000: 1.8246
 160000/ 200000: 1.7614
 170000/ 200000: 2.2419
 180000/ 200000: 2.0803
 190000/ 200000: 2.1326

1.2. Fixing Learning Rate Plot¶


We need to fix the loss vs. iterations plot: it is currently very noisy (high variance) over the full run of iterations. This wide band of loss values comes from training on minibatches with a small batch size. We need to average uniform subsections of the losses to get a more representative view of the loss.

The .view() tensor operation allows us to divide the list of losses into uniform chunks of consecutive elements. We then take the mean of each chunk, via .mean(1), to get one representative value per chunk. This produces a visually coherent, smoother plot.

In [11]:
plt.plot(lossi);
[image: raw per-step training loss plot (noisy)]
In [12]:
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1));
[image: training loss averaged over chunks of 1000 steps (smoothed)]

From the new, smoother plot above, we observe an abrupt drop in the loss at roughly $x \approx 150$ on the x-axis (i.e. around iteration $150{,}000$), which is where the step learning rate decay kicks in. The new plot is more informative and reveals this effect, which was completely hidden in the old noisy plot. In this case, the learning rate decay took a lot of energy out of the system and allowed us to settle into a local minimum of the optimization.

In [13]:
# put layers into eval mode (needed for batchnorm especially)
for layer in layers:
    layer.training = False
In [14]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
    x,y = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }[split]
    emb = C[x] # (N, block_size, n_embd)
    x = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')
train 2.0583250522613525
val 2.1065289974212646
In [15]:
# sample from the model

for _ in range(20):

    out = []
    context = [0] * block_size # initialize with all ...
    while True:
        # forward pass the neural net
        emb = C[torch.tensor([context])] # (1,block_size,n_embd)
        x = emb.view(emb.shape[0], -1) # concatenate the vectors
        for layer in layers:
            x = layer(x)
        logits = x
        probs = F.softmax(logits, dim=1)
        # sample from the distribution
        ix = torch.multinomial(probs, num_samples=1).item()
        # shift the context window and track the samples
        context = context[1:] + [ix]
        out.append(ix)
        # if we sample the special '.' token, break
        if ix == 0:
            break

    print(''.join(itos[i] for i in out)) # decode and print the generated word
ivon.
fanili.
thoommara.
kelo.
matyn.
leandr.
aleigh.
koldeniah.
prus.
carleen.
jah.
jorra.
alaya.
shonan.
vishylaharia.
juna.
vio.
orven.
mina.
laylee.

1.3. PyTorch-ifying our Code¶


Next, we need to clean up our forward pass. Let's create class modules for Embedding and Flatten: Embedding performs the indexing operation in the forward pass, and Flatten performs the flattening/concatenation. The names of these modules are inspired by PyTorch. Now we can re-initialize and optimize the neural network using these modules, treating them as layers.

Right now we are maintaining all our modules in a naked list of layers; however, an additional way of PyTorch-ifying our code is to introduce the concept of PyTorch containers. In torch.nn, which we are basically rebuilding from scratch here, containers are a way of organizing layers into lists or dictionaries. torch.nn.Sequential() is a module class in PyTorch that maintains a list of layers and passes a given input through all the layers sequentially, exactly as we're doing with our layers variable.

Let's implement a Sequential class module, which contains a sequence of layers and passes the given input through each layer in turn. Instead of a raw standalone layers list, we now have a model object. Also, our forward pass simplifies to just:

# forward pass
logits = model(Xb)
loss = F.cross_entropy(logits, Yb) # loss function
In [16]:
# -----------------------------------------------------------------------------------------------
class Embedding:
  
    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn((num_embeddings, embedding_dim))

    def __call__(self, IX):
        self.out = self.weight[IX]
        return self.out

    def parameters(self):
        return [self.weight]
    
# -----------------------------------------------------------------------------------------------
class Flatten:
    def __call__(self,x):
        self.out = x.view(x.shape[0],-1)
        return self.out

    def parameters(self):
        return []
    
# -----------------------------------------------------------------------------------------------
class Sequential:

    def __init__(self,layers):
        self.layers = layers

    def __call__(self,x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out

    def parameters(self):
        # get parameters of all the layers & stretch them out into one list
        return [p for layer in self.layers for p in layer.parameters()]
In [17]:
torch.manual_seed(42); # seed rng for reproducibility
In [18]:
# MLP
n_embed = 10 # dimension of the character embedding
n_hidden = 200 # number of hidden neurons in a layer

model = Sequential([
    Embedding(vocab_size, n_embed),
    Flatten(),
    Linear(n_embed*block_size, n_hidden, bias = False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size),
])

# parameter init
with torch.no_grad():
    model.layers[-1].weight *= 0.1 # last layer make less confident

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
      p.requires_grad = True
12097
In [19]:
# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
    
    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb) # loss function
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update simple SGD
    lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    
#     break
      0/ 200000: 3.2966
  10000/ 200000: 2.2322
  20000/ 200000: 2.4111
  30000/ 200000: 2.1004
  40000/ 200000: 2.3157
  50000/ 200000: 2.2104
  60000/ 200000: 1.9653
  70000/ 200000: 1.9767
  80000/ 200000: 2.6738
  90000/ 200000: 2.0837
 100000/ 200000: 2.2730
 110000/ 200000: 1.7491
 120000/ 200000: 2.2891
 130000/ 200000: 2.3443
 140000/ 200000: 2.1731
 150000/ 200000: 1.8246
 160000/ 200000: 1.7614
 170000/ 200000: 2.2419
 180000/ 200000: 2.0803
 190000/ 200000: 2.1326
In [20]:
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1));
[image: smoothed training loss plot]
In [21]:
# put layers into eval mode (needed for batchnorm especially)
for layer in model.layers:
    layer.training = False
In [22]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
    x,y = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }[split]
    logits = model(x)
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')
train 2.0583250522613525
val 2.1065289974212646

Text Generation¶

The script below implements a simple text-generation loop using PyTorch, designed to generate sequences of characters that form names (based on the training dataset), starting from a given initial context. Below are the steps explained:

  • Setting the Manual Seed: torch.manual_seed(42)
    This line sets the seed for PyTorch operations to ensure reproducibility. It means that every time you run this code, you'll get the same sequence of random numbers, making your results predictable.

  • Looping Through Generations: for _ in range(20)
    The outer loop runs 20 times, indicating that the model will attempt to generate 20 different sequences of characters.

  • Initialization of Context: context = [0] * block_size
    This initializes a list named context with block_size zeros. This list represents the starting point for generating a new sequence; index 0 is the special '.' token that marks the start (and end) of a name.

  • Generating Sequences: while True:
    Inside the loop, there's a nested while loop that continues until a specific condition is met. This loop performs the actual generation of words.

  • Forward Pass and Sampling:
logits = model(torch.tensor([context]))
probs = F.softmax(logits, dim=1)
ix = torch.multinomial(probs, num_samples=1).item()
1. The model takes the current context as input and generates `logits` (raw scores before normalization).
2. These logits are then passed through a softmax function to convert them into probabilities (`probs`), representing the likelihood of each possible next character.
3. A character is sampled from these probabilities using `torch.multinomial`, which selects one index (`ix`) according to the probability distribution defined by `probs`.
  • Updating Context and Storing Samples:
context = context[1:] + [ix]
out.append(ix)
1. The context is updated by removing the first element (the oldest character) and appending the newly sampled character (`ix`). This effectively shifts the context window one position to the right.
2. The sampled index (`ix`) is also stored in the `out` list, which keeps track of the entire generated sequence.
  • Breaking Condition:
if ix == 0:
    break
1. The loop breaks if the sampled index is `0`, which is the special '.' token marking the end of a name.
  • Printing Generated Sequence: print(''.join(itos[i] for i in out))
    After the generation process completes for a single sequence, the script prints the generated sequence by mapping indices back to characters (itos is the dictionary mapping indices to their corresponding characters).

In summary, this script uses the trained language model to generate names as sequences of characters, starting from an empty context and ending when the special '.' token is generated. Each generation attempt is independent, and the process is repeated 20 times. The better our model, the more the generated names sound like actual names.


In [23]:
# sample from the model

for _ in range(20):

    out = []
    context = [0] * block_size # initialize with all ...
    while True:
        # forward pass the neural net
        logits = model(torch.tensor([context]))
        probs = F.softmax(logits, dim=1)
        # sample from the distribution
        ix = torch.multinomial(probs, num_samples=1).item()
        # shift the context window and track the samples
        context = context[1:] + [ix]
        out.append(ix)
        # if we sample the special '.' token, break
        if ix == 0:
            break

    print(''.join(itos[i] for i in out)) # decode and print the generated word
ivon.
fanili.
thoommara.
kelo.
matyn.
leandr.
aleigh.
koldeniah.
prus.
carleen.
jah.
jorra.
alaya.
shonan.
vishylaharia.
juna.
vio.
orven.
mina.
laylee.

At inference time, we must set the batchnorm layers' training flag to False. Otherwise, passing a single example through the network causes bugs: BatchNorm1d would try to estimate batch statistics from a batch of one, and the variance of a single example is nan (not a number).
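A quick standalone check (not a cell from the original notebook) of why a batch of one breaks BatchNorm1d in training mode:

import torch

x = torch.randn(1, 200)               # a "batch" containing a single example
print(x.var(0, keepdim=True)[0, :3])  # tensor([nan, nan, nan]): unbiased variance of one sample is undefined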

Since our train and validation losses are very similar, we can tell that the model is not overfitting, so there is room for further performance improvement, for example by scaling up the neural network and making everything bigger and deeper. Currently, the first layer squeezes all of the context information at once, which creates a bottleneck in the model; we need a slower, more gradual way to fuse the inputs.

Hence, we will be using the WaveNet architecture.




2. WaveNet¶


Recall our 2-layer MLP architecture from the Bengio 2003 neural probabilistic LM paper. [Figure: Bengio 2003 language model architecture]

Also recall how we tried to make the MLP deeper by stacking more hidden layers. [Figure: MLP with stacked hidden layers (Source)]

Despite adding more hidden layers, we are still "crushing" all of the input context in the very first layer, and this limits our ability to increase the context length. Instead, we want to fuse the inputs together progressively across the earlier layers, like in the WaveNet paper, to avoid this issue.


2.1. Overview¶


"WaveNet: A Generative Model for Raw Audio", 12 Sep, 2016.
Abstract:
This paper introduces WaveNet, a deep neural network for generating raw audio waveforms. The model is fully probabilistic and autoregressive, with the predictive distribution for each audio sample conditioned on all previous ones; nonetheless we show that it can be efficiently trained on data with tens of thousands of samples per second of audio. When applied to text-to-speech, it yields state-of-the-art performance, with human listeners rating it as significantly more natural sounding than the best parametric and concatenative systems for both English and Mandarin. A single WaveNet can capture the characteristics of many different speakers with equal fidelity, and can switch between them by conditioning on the speaker identity. When trained to model music, we find that it generates novel and often highly realistic musical fragments. We also show that it can be employed as a discriminative model, returning promising results for phoneme recognition.

Keywords: probabilistic autoregressive DNN, discriminative model, dilated causal convolutions



[Animation: WaveNet GIF illustrating the structure of the network. (Source)]



The above animation shows how a WaveNet is structured. It is a fully convolutional neural network, where the convolutional layers have various dilation factors that allow its receptive field to grow exponentially with depth and cover thousands of timesteps. At the input layer, the kernel size is $2$ and the stride is $2$. (See Convolution Section in Appendix for more on kernels and strides).

According to the Google Deepmind WaveNet blog: "At training time, the input sequences are real waveforms recorded from human speakers. After training, we can sample the network to generate synthetic utterances. At each step during sampling a value is drawn from the probability distribution computed by the network. This value is then fed back into the input and a new prediction for the next step is made. Building up samples one step at a time like this is computationally expensive, but we have found it essential for generating complex, realistic-sounding audio."

At the heart of WaveNet’s magic is the dilated causal convolution layer, which allows it to properly treat temporal order and handle long-term dependencies without an explosion in model complexity. Below is a nice visualization of its structure from the WaveNet paper.



Figure 1: WaveNet Architecture: Visualization of a Stack of Dilated Causal Convolutional Layers. (Source)

The dilated causal convolution layer is the default building block. Causal convolutions provide the proper tool for handling temporal flow, but we need an additional modification to properly handle long-term dependencies. One of the problems of causal convolutions is that they require many layers, or large filters to increase the receptive field. With a time series that has a large number of steps, using simple causal convolutions to learn from the entire history would quickly make a model significantly more computationally and statistically complex. Instead of making that mistake, WaveNet uses dilated convolutions, which allow the receptive field to increase exponentially as a function of the convolution layer depth. In the "WaveNet" paper, dilated convolutions were used to increase the receptive field by orders of magnitude, without greatly increasing computational cost. A dilated convolution (also called "a trous", or convolution with holes) is a convolution where the filter is applied over an area larger than its length by skipping input values with a certain step. It is equivalent to a convolution with a larger filter derived from the original filter by dilating it with zeros, but is significantly more efficient.
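For reference, here is a minimal, hedged sketch (using torch.nn directly; it is not part of the makemore model built in this notebook) of a stack of dilated causal 1-D convolutions with kernel size 2 and dilations 1, 2, 4, ...; left-padding each layer by its dilation keeps it causal, and the receptive field grows to $2^L$ timesteps for $L$ layers.

import torch
import torch.nn as nn
import torch.nn.functional as F

class DilatedCausalConvStack(nn.Module):
    # kernel size 2 with dilations 1, 2, 4, ...: the receptive field doubles with each layer
    def __init__(self, channels, n_layers):
        super().__init__()
        self.convs = nn.ModuleList(
            [nn.Conv1d(channels, channels, kernel_size=2, dilation=2**i) for i in range(n_layers)]
        )

    def forward(self, x):                   # x: (batch, channels, time)
        for i, conv in enumerate(self.convs):
            x = F.pad(x, (2**i, 0))         # pad on the left only => causal (no peeking at the future)
            x = torch.tanh(conv(x))
        return x

stack = DilatedCausalConvStack(channels=16, n_layers=3)  # receptive field = 2**3 = 8 timesteps
print(stack(torch.randn(4, 16, 8)).shape)                # torch.Size([4, 16, 8])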


2.2. Case 1: Set Context Size to 8¶


First, let’s change the block size from $3$ to $8$.

In [24]:
# build the dataset
block_size = 8 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    X, Y = [], []

    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix] # crop and append

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y


n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%
torch.Size([182625, 8]) torch.Size([182625])
torch.Size([22655, 8]) torch.Size([22655])
torch.Size([22866, 8]) torch.Size([22866])
In [25]:
for x,y in zip(Xtr[:20], Ytr[:20]):
    print(''.join(itos[ix.item()] for ix in x), '-->', itos[y.item()])
........ --> y
.......y --> u
......yu --> h
.....yuh --> e
....yuhe --> n
...yuhen --> g
..yuheng --> .
........ --> d
.......d --> i
......di --> o
.....dio --> n
....dion --> d
...diond --> r
..diondr --> e
.diondre --> .
........ --> x
.......x --> a
......xa --> v
.....xav --> i
....xavi --> e
In [26]:
torch.manual_seed(42); # seed rng for reproducibility
In [27]:
# MLP
n_embed = 10 # dimension of the character embedding
n_hidden = 200 # number of hidden neurons in a layer

model = Sequential([
    Embedding(vocab_size, n_embed),
    Flatten(),
    Linear(n_embed*block_size, n_hidden, bias = False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size),
])

# parameter init
with torch.no_grad():
    model.layers[-1].weight*= 0.1  # make the last layer less confident

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
      p.requires_grad = True
22097
In [28]:
# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
    
    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb) # loss function
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update simple SGD
    lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    
#     break
      0/ 200000: 3.2847
  10000/ 200000: 2.0647
  20000/ 200000: 1.9722
  30000/ 200000: 2.0948
  40000/ 200000: 1.9738
  50000/ 200000: 2.1287
  60000/ 200000: 2.3574
  70000/ 200000: 1.9131
  80000/ 200000: 2.0735
  90000/ 200000: 2.0968
 100000/ 200000: 1.4963
 110000/ 200000: 2.1294
 120000/ 200000: 2.2324
 130000/ 200000: 2.2071
 140000/ 200000: 2.2326
 150000/ 200000: 1.8908
 160000/ 200000: 1.6867
 170000/ 200000: 2.0968
 180000/ 200000: 1.7824
 190000/ 200000: 1.9151
In [29]:
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1));
[image: smoothed training loss plot]
In [30]:
# put layers into eval mode (needed for batchnorm especially)
for layer in model.layers:
    layer.training = False
In [31]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
    x,y = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }[split]
    logits = model(x)
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')
train 1.9163435697555542
val 2.034247636795044

Simply increasing the context size already improves model performance over the baseline case.


2.3. WaveNet Implementation: Hierarchical Tree¶


PyTorch matrix multiplication is powerful and can multiply tensors with more than two dimensions. Let's run the following code.

(torch.randn((4,5,80))@ torch.randn((80,200))).shape

Matrix multiplication contracts only the last dimension of the first tensor (here, 80) against the weight matrix; all remaining leading dimensions of the first tensor are treated as batch dimensions. This is convenient since we do not want to fully flatten the embedding vectors: consecutive elements should be grouped and processed in parallel, so we must prevent the flatten layer from flattening everything.

Let's see how we can process bigram groups (i.e. groupings of consecutive elements) in parallel, both within each individual example (as an extra batch-like dimension) and across the examples of the actual minibatch.

In the WaveNet paper, the inputs are crushed slowly via the idea of progressive fusion. We want to make the network deeper, and at each level we fuse only 2 consecutive elements to create a new neuron in the next layer (8 characters -> 4 bigrams -> 2 four-grams -> 1 eight-gram). We implement this in a tree-like, hierarchical manner.

Let’s say we have a hypothetical batch of 4 examples.

  • Xb is 4 examples x 8 characters.
  • output of embedding is 4 examples x 8 characters x 10 embedding dimensions where each character got transformed into a 10 dimension vector.
  • output of flatten is 4 examples x 80 character embeddings where the embeddings of all 8 characters got flattened or concatenated into a single array.
  • output of linear layer is 4 examples x 200 neurons otherwise known as 200 “channels.”
In [32]:
ix = torch.randint(0, Xtr.shape[0], (4,)) # let's look at a batch of just 4 examples
Xb, Yb = Xtr[ix], Ytr[ix] 
logits = model(Xb)
print(Xb.shape)
Xb
torch.Size([4, 8])
Out[32]:
tensor([[ 0,  0, 16, 18,  9, 19,  9, 12],
        [ 0,  0,  0, 12,  5, 12,  1, 14],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0, 11,  1, 20,  9]])
In [33]:
model.layers[0].out.shape # output of Embedding layer (batch size, block size, embedding size)
Out[33]:
torch.Size([4, 8, 10])
In [34]:
model.layers[1].out.shape # output of Flatten layer (batch size, input dimension=block size * embedding size)
Out[34]:
torch.Size([4, 80])
In [35]:
model.layers[2].out.shape # output of 1st Linear layer (batch size, hidden layer neurons)
Out[35]:
torch.Size([4, 200])
In [36]:
(torch.randn(4, 80) @ torch.randn(80, 200) + torch.randn(200)).shape
Out[36]:
torch.Size([4, 200])
In [37]:
(torch.randn(4, 4, 20) @ torch.randn(20, 200) + torch.randn(200)).shape # linear layer for wavenet 3D
Out[37]:
torch.Size([4, 4, 200])
In [38]:
list(range(10))[1::2]
Out[38]:
[1, 3, 5, 7, 9]
In [39]:
list(range(10))[::2]
Out[39]:
[0, 2, 4, 6, 8]
In [40]:
# mini-batch size of 4: want e to be (4, 4, 20) where consecutive 10-d vectors get concatenated
e = torch.randn(4, 8, 10)

#explicitly concatenate
method1 = torch.cat( [e[:,::2,:], e[:,1::2,:]], dim = 2)

# use .view operation
method2 = e.view(4, 4, 20)

#Check the similarity 
(method1 == method2).all()
Out[40]:
tensor(True)
In [41]:
torch.cat([e[:,::2,:], e[:,1::2,:]], dim = 2).shape
Out[41]:
torch.Size([4, 4, 20])
In [42]:
 e.view(4, 4, 20).shape, #e.view(4, -1).shape
Out[42]:
(torch.Size([4, 4, 20]),)

We can update our Flatten module in the cells above to implement parallel processing and call it FlattenConsecutive.

In [43]:
# -----------------------------------------------------------------------------------------------
class FlattenConsecutive:

    def __init__(self,n):
        self.n = n

    def __call__(self,x):
        B , T , C = x.shape
        x = x.view(B, T//self.n, C*self.n)

        # previous case, full flattening
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out

    def parameters(self):
        return []

Reducing n_hidden

Before we move on, we will just reduce our n_hidden (or channel dimension) from $200$ to $68$ so that our total number of parameters stays approximately the same.

This keeps the performance numbers comparable and lets us isolate the impact of progressive fusion, rather than conflating it with simply increasing the number of parameters.

The performance ends up being about the same as before.
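As a quick sanity check (a back-of-the-envelope computation, not a cell from the original notebook), the parameter count of the hierarchical model with n_hidden = 68 works out to roughly the same ~22k as the flat block-size-8 MLP:

n_embd, n_hidden, vocab_size = 10, 68, 27
emb  = vocab_size * n_embd                  # 270   embedding table
lin1 = (n_embd * 2) * n_hidden              # 1360  first Linear (no bias)
lin2 = (n_hidden * 2) * n_hidden            # 9248  second Linear (no bias)
lin3 = (n_hidden * 2) * n_hidden            # 9248  third Linear (no bias)
bns  = 3 * (2 * n_hidden)                   # 408   gamma + beta for each BatchNorm1d
out  = n_hidden * vocab_size + vocab_size   # 1863  output Linear (with bias)
print(emb + lin1 + lin2 + lin3 + bns + out) # 22397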

In [44]:
# original network
# n_embd = 10 # the dimensionality of the character embedding vectors
# n_hidden = 300 # the number of neurons in the hidden layer of the MLP
# model = Sequential([
#   Embedding(vocab_size, n_embd),
#   FlattenConsecutive(8), Linear(n_embd * 8, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
#   Linear(n_hidden, vocab_size),
# ])

# hierarchical network
# n_embd = 24 # the dimensionality of the character embedding vectors
# n_hidden = 128 # the number of neurons in the hidden layer of the MLP
# model = Sequential([
#   Embedding(vocab_size, n_embd),
#   FlattenConsecutive(2), Linear(n_embd * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
#   FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
#   FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
#   Linear(n_hidden, vocab_size),
# ])
In [45]:
torch.manual_seed(42); # seed rng for reproducibility
In [46]:
# MLP
n_embed = 10 # dimension of the character embedding
n_hidden = 68 # number of hidden neurons in a layer


model = Sequential([    
  Embedding(vocab_size, n_embed),
  FlattenConsecutive(2), Linear(n_embed * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  Linear(n_hidden, vocab_size),
])

# parameter init
with torch.no_grad():
    model.layers[-1].weight *= 0.1  # make the last layer less confident

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
      p.requires_grad = True
22397
In [47]:
ix = torch.randint(0, Xtr.shape[0], (4,)) # let's look at a batch of just 4 examples
Xb, Yb = Xtr[ix], Ytr[ix] 
logits = model(Xb)
print(Xb.shape)
Xb
torch.Size([4, 8])
Out[47]:
tensor([[ 0,  0,  0, 13,  9, 12,  9,  1],
        [ 0,  0,  0,  1, 22,  1, 12, 25],
        [ 0,  0,  0,  0,  0,  0,  0, 10],
        [ 0,  0,  0,  0, 11,  8, 25, 14]])

If we inspect the shapes of the output of each layer…

  • Embedding: 4x8x10 (batch of 4, 8 characters each, 10 embedding dimensions)

  • FlattenConsecutive: 4x4x20 (batch of 4, 4 groups, 2x10=20 flattened embeddings dimensions)

    • Linear: 4x4x68 (batch of 4, 4 groups, 68 channels)
    • BatchNorm1d: 4x4x68
    • Tanh: 4x4x68
  • FlattenConsecutive: 4x2x136 (batch of 4, 2 groups, 2x68=136 flattened channels)

    • Linear: 4x2x68 (batch of 4, 2 groups, 68 channels)
    • BatchNorm1d: 4x2x68
    • Tanh: 4x2x68
  • FlattenConsecutive: 4x136 (batch of 4, 1 group squeezed, 2x68=136 flattened channels)

    • Linear: 4x68 (batch of 4, 68 channels)
    • BatchNorm1d: 4x68
    • Tanh: 4x68
  • Linear 4x27 (batch of 4, fan-out 27)

In [48]:
for layer in model.layers:
    print(layer.__class__.__name__, ':', tuple(layer.out.shape))
Embedding : (4, 8, 10)
FlattenConsecutive : (4, 4, 20)
Linear : (4, 4, 68)
BatchNorm1d : (4, 4, 68)
Tanh : (4, 4, 68)
FlattenConsecutive : (4, 2, 136)
Linear : (4, 2, 68)
BatchNorm1d : (4, 2, 68)
Tanh : (4, 2, 68)
FlattenConsecutive : (4, 136)
Linear : (4, 68)
BatchNorm1d : (4, 68)
Tanh : (4, 68)
Linear : (4, 27)
In [49]:
logits.shape
Out[49]:
torch.Size([4, 27])

Basically, every time we go through a FlattenConsecutive(2), we halve the number of groups. In the last FlattenConsecutive, we end up with a single group, so that dimension gets squeezed out. Applying FlattenConsecutive(2) three times is equivalent (in shape, and in fact in values) to calling FlattenConsecutive(8) a single time, as the quick check below shows.
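A quick check (reusing the FlattenConsecutive class defined above; not a cell from the original notebook):

e = torch.randn(4, 8, 10)              # (batch, characters, embedding dims)

f2 = FlattenConsecutive(2)
stepwise  = f2(f2(f2(e)))              # (4, 4, 20) -> (4, 2, 40) -> (4, 80)
allatonce = FlattenConsecutive(8)(e)   # (4, 80)

print(stepwise.shape, allatonce.shape) # torch.Size([4, 80]) torch.Size([4, 80])
print((stepwise == allatonce).all())   # tensor(True)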

Also note that every time we go through a Linear layer, we fan-out back to 68 channels. This does not change. It is only the group dimension that is being crushed at each FlattenConsecutive layer.


2.4. WaveNet: Training --> 1st Pass¶


In [50]:
# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
    
    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb) # loss function
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update simple SGD
    lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    
#     break
      0/ 200000: 3.3068
  10000/ 200000: 2.2027
  20000/ 200000: 2.4151
  30000/ 200000: 2.1973
  40000/ 200000: 1.9878
  50000/ 200000: 2.0935
  60000/ 200000: 2.0564
  70000/ 200000: 1.8841
  80000/ 200000: 1.9829
  90000/ 200000: 1.9339
 100000/ 200000: 1.6718
 110000/ 200000: 2.4960
 120000/ 200000: 2.2628
 130000/ 200000: 2.0323
 140000/ 200000: 2.2900
 150000/ 200000: 1.8916
 160000/ 200000: 2.0737
 170000/ 200000: 2.0248
 180000/ 200000: 2.0343
 190000/ 200000: 1.9166
In [51]:
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1));
[image: smoothed training loss plot]
In [52]:
# put layers into eval mode (needed for batchnorm especially)
for layer in model.layers:
    layer.training = False
In [53]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
    x,y = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }[split]
    logits = model(x)
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')
train 1.9413423538208008
val 2.028500556945801

2.5. Fixing BatchNorm1d Bug¶


We introduced a few bugs while improving the network with a hierarchical architecture which we need to fix.

Previously, in the batchnorm layer, we averaged over the batch dimension (dimension 0) only, producing one mean and variance per column. Our current BatchNorm1d therefore only handles $2D$ inputs. This is not correct for the hierarchical WaveNet-style architecture, because its intermediate activations are $3D$ (batch, group, channels). The issue was not apparent because broadcasting silently made the shapes work.

xmean = x.mean(0, keepdim = True)  # batch mean
xvar = x.var(0, keepdim = True, unbiased = True)  # batch variance

The code above only takes the mean and variance across the first (batch) dimension. The dim argument of torch.mean() (and torch.var()) can be a tuple, so we can compute the statistics across multiple dimensions (batch and group) at once. This also gives the running buffers the correct shape.

In [54]:
e = torch.randn(32, 4, 68)
emean = e.mean((0, 1), keepdim=True)  # 1, 4, 68
evar = e.var((0, 1), keepdim=True)  # 1, 4, 68
ehat = (e - emean) / torch.sqrt(evar + 1e-5) # 32, 4, 68
e.shape
Out[54]:
torch.Size([32, 4, 68])
In [55]:
emean.shape
Out[55]:
torch.Size([1, 1, 68])
In [56]:
model.layers[3].running_mean.shape # should be (1, 1, 68), but the bug gives (1, 4, 68)
Out[56]:
torch.Size([1, 4, 68])

2.6. WaveNet: Training --> 2nd Pass¶


Let's update our batchnorm1d module and then we can train our network.

In [57]:
# -----------------------------------------------------------------------------------------------
class Linear:
  
    def __init__(self, fan_in, fan_out, init_type = "Kaiming", bias=True):
        if init_type == 'Kaiming':
            self.weight = torch.randn((fan_in, fan_out)) / fan_in**0.5 # note: kaiming init
        elif init_type == 'Xavier':
            self.weight = torch.nn.init.xavier_uniform_(torch.empty(fan_in, fan_out))  # Xavier init (torch.nn is reachable via the torch import)
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])

# -----------------------------------------------------------------------------------------------
class BatchNorm1d: # (N, C), (N, L, C)

    def __init__(self, dim, eps=1e-8, momentum=0.1): # eps=1e-8, 1e-5
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (trained with a running 'momentum update')
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        # calculate the forward pass
        if self.training:
            if x.ndim == 2:
                dim = 0
            elif x.ndim == 3:
                dim = (0,1)
            xmean = x.mean(dim, keepdim=True) # batch mean
            xvar = x.var(dim, keepdim=True) # batch variance
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        # update the buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

# -----------------------------------------------------------------------------------------------
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    
    def parameters(self):
        return []

# -----------------------------------------------------------------------------------------------
class Embedding:
  
    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn((num_embeddings, embedding_dim))

    def __call__(self, IX):
        self.out = self.weight[IX]
        return self.out

    def parameters(self):
        return [self.weight]
    
# -----------------------------------------------------------------------------------------------
class FlattenConsecutive:

    def __init__(self,n):
        self.n = n

    def __call__(self,x):
#         print(f"Input shape before flattening: {x.shape}")
        B , T , C = x.shape
        x = x.view(B, T//self.n, C*self.n)    # Adjusted to correctly flatten the feature dimension
#         print(f"Shape after flattening: {x.shape}")

        # previous case, full flattening
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out

    def parameters(self):
        return []
    
# -----------------------------------------------------------------------------------------------
class Sequential:

    def __init__(self,layers):
        self.layers = layers

    def __call__(self,x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out

    def parameters(self):
        # get parameters of all the layers & stretch them out into one list
        return [p for layer in self.layers for p in layer.parameters()]
    
# --------------------------------------------------------------------------------------------------------
class Dropout:

    def __init__(self, p=0.5):
        self.p = p
        self.training = True

    def __call__(self, x):
        if self.training:
            # randomly zero activations with probability p; scale the rest to keep the expected value
            mask = torch.bernoulli(torch.ones_like(x) * (1 - self.p))
            self.out = x * mask / (1 - self.p)
        else:
            self.out = x # dropout is the identity at inference time
        return self.out

    def parameters(self):
        return []
In [58]:
# MLP
torch.manual_seed(42) # seed rng for reproducibility
n_embed = 10 # dimension of the character embedding
n_hidden = 68 # number of hidden neurons in a layer

model = Sequential([    
  Embedding(vocab_size, n_embed),
  FlattenConsecutive(2), Linear(n_embed * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  Linear(n_hidden, vocab_size),
])

# parameter init
with torch.no_grad():
    model.layers[-1].weight*= 0.1  # make the last layer less confident

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
      p.requires_grad = True
22397
In [59]:
# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
    
    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb) # loss function
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update simple SGD
    lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    
#     break
      0/ 200000: 3.3142
  10000/ 200000: 2.2095
  20000/ 200000: 2.1495
  30000/ 200000: 2.1232
  40000/ 200000: 1.9628
  50000/ 200000: 2.4183
  60000/ 200000: 1.9633
  70000/ 200000: 1.8959
  80000/ 200000: 2.1884
  90000/ 200000: 1.8160
 100000/ 200000: 1.5932
 110000/ 200000: 2.3607
 120000/ 200000: 2.3665
 130000/ 200000: 1.9098
 140000/ 200000: 2.2288
 150000/ 200000: 1.7120
 160000/ 200000: 1.8514
 170000/ 200000: 1.9996
 180000/ 200000: 2.1041
 190000/ 200000: 1.6968
In [60]:
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1));
[image: smoothed training loss plot]
In [61]:
# put layers into eval mode (needed for batchnorm especially)
for layer in model.layers:
    layer.training = False
In [62]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
    x,y = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }[split]
    logits = model(x)
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')
train 1.9110225439071655
val 2.0201878547668457

2.7. WaveNet: Scaling Up¶


Now we can scale our network. We will be using $24$-dimensional embeddings and $128$ hidden layer neurons. The number of parameters is $76579$.

In [63]:
# MLP
torch.manual_seed(42) # seed rng for reproducibility
n_embed = 24 # dimension of the character embedding
n_hidden = 128 # number of hidden neurons in a layer


model = Sequential([    
  Embedding(vocab_size, n_embed),
  FlattenConsecutive(2), Linear(n_embed * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  Linear(n_hidden, vocab_size),
])

# parameter init
with torch.no_grad():
    model.layers[-1].weight*= 0.1  # make the last layer less confident

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
      p.requires_grad = True
76579
In [64]:
# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
    
    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb) # loss function
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update simple SGD
    lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    
#     break
      0/ 200000: 3.3167
  10000/ 200000: 2.0576
  20000/ 200000: 2.0724
  30000/ 200000: 2.5134
  40000/ 200000: 2.1475
  50000/ 200000: 1.7836
  60000/ 200000: 2.2594
  70000/ 200000: 1.9330
  80000/ 200000: 1.6876
  90000/ 200000: 2.0394
 100000/ 200000: 1.7733
 110000/ 200000: 1.9578
 120000/ 200000: 1.7463
 130000/ 200000: 1.8117
 140000/ 200000: 1.7417
 150000/ 200000: 1.7470
 160000/ 200000: 1.8812
 170000/ 200000: 1.6257
 180000/ 200000: 1.6465
 190000/ 200000: 1.8548
In [65]:
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1));
[image: smoothed training loss plot]
In [66]:
# put layers into eval mode (needed for batchnorm especially)
for layer in model.layers:
    layer.training = False
In [67]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
    x,y = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }[split]
    logits = model(x)
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')
train 1.7690415382385254
val 1.9936139583587646
In [68]:
# sample from the model

for _ in range(20):

    out = []
    context = [0] * block_size # initialize with all ...
    while True:
        # forward pass the neural net
        logits = model(torch.tensor([context]))
        probs = F.softmax(logits, dim=1)
        # sample from the distribution
        ix = torch.multinomial(probs, num_samples=1).item()
        # shift the context window and track the samples
        context = context[1:] + [ix]
        out.append(ix)
        # if we sample the special '.' token, break
        if ix == 0:
            break

    print(''.join(itos[i] for i in out)) # decode and print the generated word
arlij.
chetta.
heago.
rocklei.
hendrix.
jamylie.
broxin.
denish.
anslibt.
marianah.
astavia.
annayve.
aniah.
jayce.
nodiel.
remita.
niyelle.
jaylene.
aiyan.
aubreana.

Let's visualize the model layers and parameters' shapes¶


In [69]:
for layer in model.layers:
    print(layer.__class__.__name__, ':', tuple(layer.out.shape))
Embedding : (1, 8, 24)
FlattenConsecutive : (1, 4, 48)
Linear : (1, 4, 128)
BatchNorm1d : (1, 4, 128)
Tanh : (1, 4, 128)
FlattenConsecutive : (1, 2, 256)
Linear : (1, 2, 128)
BatchNorm1d : (1, 2, 128)
Tanh : (1, 2, 128)
FlattenConsecutive : (1, 256)
Linear : (1, 128)
BatchNorm1d : (1, 128)
Tanh : (1, 128)
Linear : (1, 27)
In [70]:
_layers = [layer.__class__.__name__ for layer in model.layers]
# _layers

_layer = [f'FlattenConsecutive+Linear' if i == 'FlattenConsecutive' and j == 'Linear' else 
 i if i != 'FlattenConsecutive' else j for i, j in zip(_layers, _layers[1:] + [''])]
#_layer

i = 0
while i < len(_layer):
    if _layer[i] == 'FlattenConsecutive+Linear' and i < len(_layer) - 1 and _layer[i+1] == 'Linear':
        _layer.pop(i+1)
    else:
        i += 1
        
_layer
Out[70]:
['Embedding',
 'FlattenConsecutive+Linear',
 'BatchNorm1d',
 'Tanh',
 'FlattenConsecutive+Linear',
 'BatchNorm1d',
 'Tanh',
 'FlattenConsecutive+Linear',
 'BatchNorm1d',
 'Tanh',
 'Linear']
In [71]:
for name, param in zip(_layer, model.parameters()):
    print(f"Parameter Layer: {str(name):25s}   |   shape {param.shape}")
Parameter Layer: Embedding                   |   shape torch.Size([27, 24])
Parameter Layer: FlattenConsecutive+Linear   |   shape torch.Size([48, 128])
Parameter Layer: BatchNorm1d                 |   shape torch.Size([128])
Parameter Layer: Tanh                        |   shape torch.Size([128])
Parameter Layer: FlattenConsecutive+Linear   |   shape torch.Size([256, 128])
Parameter Layer: BatchNorm1d                 |   shape torch.Size([128])
Parameter Layer: Tanh                        |   shape torch.Size([128])
Parameter Layer: FlattenConsecutive+Linear   |   shape torch.Size([256, 128])
Parameter Layer: BatchNorm1d                 |   shape torch.Size([128])
Parameter Layer: Tanh                        |   shape torch.Size([128])
Parameter Layer: Linear                      |   shape torch.Size([128, 27])

Model Performance Log for Different Scenarios:¶

| Scenario | Context Size | Kernel Size | Embedding Dimension | Hidden Neurons | Parameters | Train loss | Val. loss |
|---|---|---|---|---|---|---|---|
| Original | 3 | - | 10 | 200 | 12k | 2.058 | 2.106 |
| Case $1$ (Context size: $3 \rightarrow 8$) | 8 | - | 10 | 200 | 22k | 1.916 | 2.034 |
| Case $2a$ (Flat $\rightarrow$ Hierarchical) | 8 | 2 | 10 | 68 | 22k | 1.941 | 2.028 |
| Case $2b$ (Fix bug in BatchNorm1d) | 8 | 2 | 10 | 68 | 22k | 1.911 | 2.020 |
| Case $2c$ (Scaling up the network) | 8 | 2 | 24 | 128 | 76k | 1.769 | 1.993 |



3. Summary¶



3.1. Experimental Harness¶


Training now takes noticeably longer, even though we have only just crossed the $2.0$ validation-loss threshold to reach $1.99$. We are also somewhat in the dark with respect to the correct settings of the hyperparameters and the learning rate, because each experiment takes a while to run. In short, we are missing an experimental harness: a setup for running many experiments systematically so that we can tune this architecture properly. A minimal sketch of such a harness follows.
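As a rough, hypothetical sketch of such a harness (not code from this notebook): build_model() and get_batch() are assumed helpers, and the configurations, seed, and training budget are placeholders. The only point is that every run uses the same seed and budget, so the logged validation losses are directly comparable.

```python
import torch
import torch.nn.functional as F

# hypothetical helpers: build_model(n_embd, n_hidden) returns a fresh model,
# get_batch(split, batch_size) returns an (X, Y) minibatch from that split
configs = [
    dict(n_embd=24, n_hidden=128, lr=0.10),
    dict(n_embd=24, n_hidden=256, lr=0.10),
    dict(n_embd=48, n_hidden=128, lr=0.05),
]

for cfg in configs:
    torch.manual_seed(2147483647)                 # identical init across runs
    model = build_model(n_embd=cfg['n_embd'], n_hidden=cfg['n_hidden'])
    for step in range(20_000):                    # small, fixed budget per experiment
        Xb, Yb = get_batch('train', batch_size=32)
        loss = F.cross_entropy(model(Xb), Yb)
        for p in model.parameters():
            p.grad = None
        loss.backward()
        for p in model.parameters():
            p.data += -cfg['lr'] * p.grad
    # remember to put BatchNorm layers into eval mode before measuring val loss
    with torch.no_grad():
        Xv, Yv = get_batch('val', batch_size=10_000)
        print(cfg, f"val loss: {F.cross_entropy(model(Xv), Yv).item():.3f}")
```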


3.2. WaveNet but with "Dilated Causal Convolutions"¶


Although we implemented the hierarchical, tree-like structure that WaveNet's stack of dilated causal convolutional layers describes, we did not use actual convolution operations, and we also skipped the paper's gated activation units, residual connections, and skip connections. The use of convolutions is strictly an efficiency measure; it does not change the model we have implemented.

Convolution is like a for-loop
Let's consider just this one name, "diondre", which gives us $8$ independent training examples.

In [72]:
for x,y in zip(Xtr[7:15], Ytr[7:15]):
    print(''.join(itos[ix.item()] for ix in x), '-->', itos[y.item()])
........ --> d
.......d --> i
......di --> o
.....dio --> n
....dion --> d
...diond --> r
..diondr --> e
.diondre --> .
In [73]:
Xtr[[7]].shape # index with a list to keep the batch dimension -> shape (1, 8)
Out[73]:
torch.Size([1, 8])

If we want to forward this single example, we keep a batch dimension of $1$ by indexing with a list, which gives a tensor of shape $(1, 8)$.

In [74]:
# forward a single example:
logits = model(Xtr[[7]])
logits.shape
Out[74]:
torch.Size([1, 27])

If we wanted to forward all $8$ examples, we could use a for loop, but we would then have to call the model $8$ times (once per example).

In [75]:
# forward all of them
logits = torch.zeros(8, 27)
for i in range(8):
    logits[i] = model(Xtr[[7+i]])
logits.shape
Out[75]:
torch.Size([8, 27])
In [76]:
# calculate all of them as a batch
#logits = model(Xtr[7:15])
#logits.shape

If we consider the original diagram from the WaveNet paper (reproduced in Figure A1), what we implemented is essentially just the black tree-like structure that computes a single output. Even if we batched all $8$ of our examples, we would still have to recreate that tree $8$ times, each shifted over by one position. With convolutions (which we did not implement), the intermediate nodes get reused, because all of the outputs are computed in parallel (the dotted lines in the figure).

In [77]:
# convolution is a "for loop"
# allows us to forward Linear layers efficiently over space

A convolution is a linear filter applied over a spatial (or temporal) dimension: it slides the same Linear layer efficiently over the input sequence. In effect it runs our for loop in parallel inside CUDA kernels, and it lets intermediate results be reused. Convolutions are efficient for two reasons (see the sketch below):

  • the for loop over positions runs inside the CUDA kernels rather than in Python;
  • variables are reused: each child node in the tree feeds into two parents, so it is computed once instead of twice.
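To make the "convolution is a for-loop" point concrete, here is a small sketch (not part of the original notebook) checking that an nn.Conv1d with kernel_size=2 and stride=2 computes, in one parallel call, the same outputs as sliding a Linear layer over consecutive pairs of positions, which mirrors what FlattenConsecutive followed by Linear does. The shapes and seed are illustrative assumptions.

```python
import torch
import torch.nn as nn

torch.manual_seed(42)
B, L, C, H = 1, 8, 24, 128            # batch, sequence length, channels, hidden units
x = torch.randn(B, L, C)              # like the embedded characters, (B, L, C)

linear = nn.Linear(2 * C, H, bias=False)

# "for loop" version: apply the same Linear to each consecutive pair of positions
out_loop = torch.stack(
    [linear(torch.cat([x[:, t], x[:, t + 1]], dim=1)) for t in range(0, L, 2)],
    dim=1)                            # (B, L/2, H)

# convolution version: identical weights, one parallel call over the whole sequence
conv = nn.Conv1d(C, H, kernel_size=2, stride=2, bias=False)
with torch.no_grad():
    conv.weight.copy_(linear.weight.view(H, 2, C).transpose(1, 2))   # (H, C, 2)
out_conv = conv(x.transpose(1, 2)).transpose(1, 2)                   # (B, L/2, H)

print(torch.allclose(out_loop, out_conv, atol=1e-5))                 # True
```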

3.3. torch.nn¶


We've implemented all of the layer "LEGO" building blocks (modules) and the containers, PyTorch-ifying our code quite a bit more. In effect, we have been re-implementing torch.nn, the neural networks library built on top of torch.tensor. With this experience we have "unlocked" torch.nn: we understand its components, how the modules work, how they nest, and what they do on top of torch.tensor. This will allow us to start using torch.nn directly, as in the sketch below.
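As a taste of torch.nn, here is a hypothetical sketch (not from the notebook) of the flat, Case-1-style network written with stock torch.nn modules. The hierarchical version would still need something like our custom FlattenConsecutive and the BatchNorm1d shape handling, so this is only the simple variant; the hyperparameter values are assumed to mirror the ones used earlier.

```python
import torch
import torch.nn as nn

# assumed hyperparameters, mirroring the ones used earlier in the notebook
vocab_size, block_size, n_embd, n_hidden = 27, 8, 24, 128

model_nn = nn.Sequential(
    nn.Embedding(vocab_size, n_embd),              # (B, 8) ints -> (B, 8, 24)
    nn.Flatten(start_dim=1),                       # (B, 8, 24) -> (B, 192)
    nn.Linear(block_size * n_embd, n_hidden, bias=False),
    nn.BatchNorm1d(n_hidden),
    nn.Tanh(),
    nn.Linear(n_hidden, vocab_size),               # logits over 27 characters
)

logits = model_nn(torch.randint(0, vocab_size, (32, block_size)))
print(logits.shape)                                # torch.Size([32, 27])
```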


3.4. Development Process of Building Deep Neural Nets (DNNs)¶


We also got a reasonably representative sense of what the development process of building deep neural networks (DNNs) looks like.

  1. A lot of time is spent in the PyTorch documentation pages: reading through the available layers, figuring out what each one does and what input shapes it expects.
  2. Much of the work is making the shapes line up. There is real gymnastics around multi-dimensional arrays (2D, 3D, 4D, etc.): which layers take which shapes, whether a layer expects NCL or NLC (e.g. BatchNorm1d), and when to use .permute and/or .view. It can get pretty messy; see the short example after this list.
  3. A good practice is to prototype the layers and their implementations in Jupyter notebooks until all the shapes work out, and only then copy the code into a repository of actual training code. Working with VS Code and a Jupyter notebook side by side also helps with prototyping.
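As a small illustration of the shape gymnastics in point 2 (a sketch with assumed shapes, not code from the notebook): torch.nn.BatchNorm1d normalizes over dimension 1, so it expects (N, C) or (N, C, L), while our activations flow through the network as (N, L, C). A pair of .permute calls bridges the two conventions.

```python
import torch
import torch.nn as nn

# activations in our model flow as (N, L, C): batch, positions, features
x = torch.randn(32, 4, 128)

# torch.nn.BatchNorm1d expects channels in dimension 1, i.e. (N, C) or (N, C, L)
bn = nn.BatchNorm1d(128)

y = bn(x.permute(0, 2, 1))   # (N, L, C) -> (N, C, L), normalize per feature channel
y = y.permute(0, 2, 1)       # back to (N, L, C)
print(y.shape)               # torch.Size([32, 4, 128])
```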

3.5. Going Forward: Improvements !!!¶


This lecture sets the stage for several follow-up topics, such as:

  1. Converting our neural network to actually use dilated causal convolutional layers, i.e. implementing ConvNets.
  2. Getting into the meaning of residual connections and skip connections, and why they are useful.
  3. Setting up a typical deep-learning workflow: a proper evaluation harness, standard experiments, diagnostic analysis of train and validation loss plots, population-level learning, and hyperparameter searches and optimization.
  4. Learning about recurrent neural networks (RNNs), LSTMs, GRUs, and Transformers.



4. Conclusion¶


This notebook was aimed at building a character-level language model using a WaveNet-inspired architecture. The goal is to predict the next character in a sequence. We:

  • Introduced the architecture and starter code, similar to Part 3 of the makemore series.
  • Simplified the generator function, added layers, and initialized parameters.
  • Implemented embedding and flattening module operations à la PyTorch.
  • Introduced PyTorch containers to organize layers and wrote a sequential layer.
  • Implemented a WaveNet architecture with a progressive fusion of context.
  • Discussed the importance of hierarchical schemes and batch dimensions.
  • Introduced a new way of concatenating input vectors using PyTorch's .view function.
  • Inspected layer shapes, restructured the WaveNet, and fixed a bug in the batch normalization layer.
  • Improved validation performance to $1.99$ and discussed hyperparameter tuning.
  • Related the model to convolutional neural networks and discussed efficient computing.
  • Discussed the development process of DNNs and potential further experimentation to achieve better results.

A good challenge would be to try and beat the current result and explore more effective methods.




Exercises¶


  1. Beat validation loss of $1.993$ with any improvements.



Appendix - Convolutions: Kernel, Stride, Padding, Bias¶


[Figure: illustration of a convolution's kernel, stride, padding, and bias (Source)]
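As a small illustrative sketch to go with the figure (assumed values, not tied to the original image), the snippet below shows how kernel size, stride, and padding determine a Conv1d's output length, $L_{out} = \lfloor (L_{in} + 2 \cdot \text{padding} - \text{kernel\_size}) / \text{stride} \rfloor + 1$ when dilation is $1$:

```python
import torch
import torch.nn as nn

x = torch.randn(1, 3, 8)    # (N, C, L): one example, 3 input channels, length 8

for k, s, p in [(2, 1, 0), (2, 2, 0), (3, 1, 1), (5, 1, 2)]:
    conv = nn.Conv1d(in_channels=3, out_channels=4, kernel_size=k, stride=s,
                     padding=p, bias=True)   # bias adds one learnable offset per output channel
    print(f"kernel={k} stride={s} padding={p} -> output length {conv(x).shape[-1]}")
```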




References¶


  1. "Building makemore Part 5: Building a WaveNet" youtube video, Nov 2022.
  2. Andrej Karpathy Makemore github repo.
  3. Andrej Karpathy Neural Networks: Zero to Hero github repo (notebook to follow video tutorial with).
  4. Article: "Wavenet with Andrej Karpathy" - Kavishka Abeywardana, Pt 1, 2, March 2024.
  5. "Time Series Forecasting with Convolutional Neural Networks - a Look at WaveNet" - Joseph Eddy, blog, Jun 2018.
  6. "Makemore Part 5 - Building a WaveNet" - Anson Chu, blog, Dec 2023.
  7. "Understanding Convolutions" - Christopher Olah, blog, Jul 2014.
  8. "Convolutional Neural Networks for Text" - Elena Voita, article, Nov 2023.
  9. "WaveNet: A Generative Model for Raw Audio" - Aäron van den Oord, Sander Dieleman, blog, Sep 2016.
  10. "WaveNet: A Generative Model for Raw Audio" - Aäron van den Oord et al., Academic Paper, Google DeepMind, Sep 2016.
  11. "A Neural Probabilistic Language Model" - Bengio et al., Academic Paper, Journal of Machine Learning Research, Vol. 3, pp. 1137–1155, Mar 2003.
  12. PyTorch Resources: Linear, BatchNorm1d, Tanh, torch.squeeze, Dimension ordering: NLC vs NCL vs LNC