micrograd¶


  • Inspired by Andrej Karpathy's "The spelled-out intro to neural networks & backpropagation: building micrograd"

  • Follow-up micrograd exercises to practice can be found in this notebook by Andrej Karpathy.

Table of Contents¶


  • 0. Introduction
    • 0.1. Derivatives: An Overview
  • 1. Micrograd: An Autograd Engine
  • 2. Micrograd: DAG Visualization
  • 3. Micrograd: Backpropagation
    • 3.1. Manual Backpropagation Example 001: Simple Expression
    • 3.2. Preview of a Single Optimization Step
  • 4. Micrograd: Neurons
    • 4.1. Biological Neuron vs. Artificial Neuron
    • 4.2. Perceptron
    • 4.3. Activation Functions
    • 4.4. Manual Backpropagation Example 002: A Neuron
  • 5. Micrograd: Implementing Backward Function for Each Operation
  • 6. Micrograd: Fixing a Backprop Bug when One Node is Used Multiple Times
    • 6.1. Multivariate Chain Rule for Derivatives
  • 7. Micrograd: Adding More Mathematical Operations
    • 7.1. Breaking up tanh Function into its Parts
  • 8. Micrograd: A PyTorch Comparison
  • 9. Micrograd: Building a NN Library (Multi-Layer Perceptron, MLP)
    • 9.1. Neural Network architectures
  • 10. Micrograd: Creating a Tiny Dataset & Writing the Loss Function
  • 11. Micrograd: Neural Net Parameters
  • 12. Micrograd: Manual Gradient Descent Optimization & Training the MLP
  • 13. Micrograd: Summary

Appendix¶


Figures¶

  • A1. L-R: A Cartoon Diagram of a Biological Neuron & its Mathematical Model.
  • A2. A Single-Layer Perceptron structure.
  • A3. Topological Sort.
  • A4. L-R: A Diagram of Two Multi-Layer Perceptrons (MLP): a 2-layer NN, a 3-layer NN.

Tables¶

  • B1. Activation Functions

Equations¶

  • C1. Mathematical definition of f(x) being differentiable at x
  • C2. Mathematical representation of a NN weighted sum function
  • C3. Chain rule for multivariable functions: example
  • C4. Mean-Squared Error (MSE)

Definitions¶

  • 1. backpropagation
  • 2. micrograd
  • 3. gradient descent

References¶




0. Introduction¶


This is the most step-by-step spelled-out explanation of backpropagation and training of neural networks. It only assumes basic knowledge of Python and a vague recollection of calculus from high school.

This tutorial is an introduction to the basic mechanisms of neural networks (NNs). NNs are widely used in image classification, text sequence recognition and other applications. We're going to cover some fundamental questions about NNs:

  1. What are neural networks?
    • NNs are mathematical functions with inputs and an output.
  2. What does it mean to "train" neural networks?
    • It means tuning the parameters of the NN model to minimize the loss function so as to get the predicted values as close as possible to the target values (ground truth).
  3. How do we tune/manipulate neural networks?
    • Here comes the backpropagation algorithm, which is useful for training neural networks.

We implement a tiny autograd engine called Micrograd to learn and understand neural networks and backpropagation.

In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

0.1. Derivatives: An Overview¶


Let’s define a simple function and try to plot it.

In [2]:
def f(x):
    return 3*x**2 - 4*x + 5
In [3]:
f(3.0)
Out[3]:
20.0
In [4]:
xs = np.arange(-5, 5, 0.25)
xs
Out[4]:
array([-5.  , -4.75, -4.5 , -4.25, -4.  , -3.75, -3.5 , -3.25, -3.  ,
       -2.75, -2.5 , -2.25, -2.  , -1.75, -1.5 , -1.25, -1.  , -0.75,
       -0.5 , -0.25,  0.  ,  0.25,  0.5 ,  0.75,  1.  ,  1.25,  1.5 ,
        1.75,  2.  ,  2.25,  2.5 ,  2.75,  3.  ,  3.25,  3.5 ,  3.75,
        4.  ,  4.25,  4.5 ,  4.75])
In [5]:
ys = f(xs)
ys
Out[5]:
array([100.    ,  91.6875,  83.75  ,  76.1875,  69.    ,  62.1875,
        55.75  ,  49.6875,  44.    ,  38.6875,  33.75  ,  29.1875,
        25.    ,  21.1875,  17.75  ,  14.6875,  12.    ,   9.6875,
         7.75  ,   6.1875,   5.    ,   4.1875,   3.75  ,   3.6875,
         4.    ,   4.6875,   5.75  ,   7.1875,   9.    ,  11.1875,
        13.75  ,  16.6875,  20.    ,  23.6875,  27.75  ,  32.1875,
        37.    ,  42.1875,  47.75  ,  53.6875])
In [6]:
plt.plot(xs, ys)
plt.show()
[plot of f(x) = 3x**2 - 4x + 5 over xs]

We want to calculate the derivative of the function at each point. In this example, we can explicitly calculate the derivative of the function. However, this approach is not effective when it comes to neural networks. What is a derivative?

On a base level, this is a derivative:

“Let's say I have a function y which depends on the value of x. If I nudge x just a tiny bit at a certain point, what is the rate of change of y in response?”

Interpretation: the derivative indicates the rate of change of a function with respect to a variable within an infinitesimally small region around a particular point. Mathematically, the derivative is defined as:

$$ f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h} $$

Eq. 1: Mathematical definition of f(x) being differentiable at x. (Source)

We give a slight bump to the variable x and check how the function responds. In eq. 1, we calculate the rate of change by dividing the difference between f(x+h) and f(x) (obtained by nudging the input by a tiny amount, h) by h, the size of the nudge. The derivative shows the sensitivity of a function to a change in its variables. We will compute a small numerical approximation of the derivative below.

In [7]:
h = 0.00000001
x = 3.0
(f(x + h) - f(x))/h
Out[7]:
14.00000009255109
In [8]:
h = 0.00000001
x = -3.0
(f(x + h) - f(x))/h
Out[8]:
-22.00000039920269
In [9]:
h = 0.000001
x = 2/3
(f(x + h) - f(x))/h
Out[9]:
2.999378523327323e-06

As seen above, as we take smaller and smaller h values, the estimates converge to the true derivatives f'(x) = 6x - 4, i.e. 14, -22 and 0 at x = 3, -3 and 2/3 respectively. (Beware: if h is made so small that it is lost to floating-point precision, f(x+h) - f(x) evaluates to exactly 0 and the estimated gradient collapses to zero.)
At the minimum (x = 2/3), the gradient is zero! The function won't change in any direction as a result of changing the input at that point.
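As a sanity check, we can compare the analytic derivative, f'(x) = 6x - 4, against the finite-difference estimate (a minimal sketch, reusing f from above):

def fprime(x):
    return 6*x - 4  # analytic derivative of f(x) = 3*x**2 - 4*x + 5

h = 1e-6
for x in [3.0, -3.0, 2/3]:
    numeric = (f(x + h) - f(x)) / h  # finite-difference estimate
    print(f"x = {x:+.4f}: analytic = {fprime(x):+.4f}, numeric = {numeric:+.4f}")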

Suppose we have multiple input variables. Let's calculate the derivative of d WRT c.

In [10]:
# lets get more complex
a = 2.0
b = -3.0
c = 10.0
d = a*b + c
print(d)
4.0
In [11]:
h = 0.0001

# inputs
a = 2.0
b = -3.0
c = 10.0

d1 = a*b + c
c += h
d2 = a*b + c

print('d1', d1)
print('d2', d2)
print('slope', (d2 - d1)/h)
      
d1 4.0
d2 4.0001
slope 0.9999999999976694

Since c enters d additively (d = a*b + c), a small change h in c produces exactly the same change in d, so the gradient/derivative dd/dc is one. We can do the same for a and b to understand how d changes, as sketched below.
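The same finite-difference estimate works for a and b; the slopes should equal dd/da = b = -3 and dd/db = a = 2 (a minimal sketch):

h = 0.0001
a, b, c = 2.0, -3.0, 10.0
d1 = a*b + c

d2 = (a + h)*b + c           # nudge a: slope should be b = -3
print('dd/da ~', (d2 - d1)/h)

d2 = a*(b + h) + c           # nudge b: slope should be a = 2
print('dd/db ~', (d2 - d1)/h)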

Next, let's discuss Micrograd: its creation, implementation and management. To build neural networks on top of the Micrograd engine, we first need a suitable data structure.




1. Micrograd: An Autograd Engine¶


Micrograd: a tiny autograd (automatic gradient) engine that implements backpropagation (reverse-mode autodiff) over a dynamically built DAG (Directed Acyclic Graph), plus a small NN library on top of it with a PyTorch-like API. It is a scalar-valued autograd engine. Both pieces, the engine and the NN library, are tiny, with roughly 100 and 50 lines of code respectively. The DAG only operates over scalar values, so e.g. we chop up each neuron into all of its individual tiny adds and multiplies. However, this is enough to build up entire deep neural nets doing binary classification, as the demo notebook shows.

Objectives for building micrograd:

  1. Pedagogical/educational purposes
  2. Understand neural networks (NNs)
  3. Understand backpropagation (chain rule & differentiation)
  4. Understand that the math is the same for scalars and tensors
    a. parallel hardware operates on tensors (arrays of scalars)
    b. tensors are used for efficiency
  5. Provide the machinery needed to train NNs

In this notebook, we are going to build Micrograd - a minimalistic auto-differentiation (autodiff) engine in Python. An autodiff/autograd algorithm forms the basis of modern neural network libraries like PyTorch and TensorFlow and works by automatically calculating gradients in computational graphs (DAGs such as NNs). This tutorial is inspired by this lecture - "The spelled-out intro to neural networks and backpropagation: building micrograd" - in Andrej Karpathy's Neural Networks: Zero to Hero lecture series. The full version of Micrograd (Andrej Karpathy) can be found on github.

The first thing we need to do is figure out a way to represent numbers and mathematical operations in Python so that different calculations can be performed. In essence, we need a simple, robust data structure to serve as the building block and connective tissue of our Micrograd engine.

Let's develop a simple data structure (a core Value object) to power micrograd.

In [12]:
class Value:
    """ stores a single scalar value and its gradient """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        # internal variables used for autograd graph construction
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc
        self.label = label
        
    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"
    
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
        
        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward

        return out
    
    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        
        return out
    
    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out
    
    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out
    
    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')
        
        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        
        return out
    
    def exp(self):
        x = self.data
        t = math.exp(x)
        out = Value(t, (self,), 'exp')
        
        def _backward():
            self.grad += t * out.grad
        out._backward = _backward
        
        return out
    
    def backward(self):
        
        # build the topological ordered list of all the children in the DAG
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
 
        # We must go through the topologically ordered list in reverse order, as per backpropagation:
        # visit one variable at a time and apply the chain rule to get its gradient
        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
    
    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1
In [13]:
a = Value(2.0,label='a')
b = Value(-3.0,label='b')
c = Value(10.0,label='c')

e = a*b; e.label ='e'
d = e+c; d.label = 'd'
f = Value(-2.0, label='f')
L = d * f
L.label = 'L'
L
Out[13]:
Value(data=-8.0, grad=0.0)
In [14]:
print(f"The input nodes of L are {L._prev}")
print(f"The label of L is {L.label}")
The input nodes of L are {Value(data=4.0, grad=0.0), Value(data=-2.0, grad=0.0)}
The label of L is L
In [15]:
L._op
Out[15]:
'*'

This class simply wraps the value and creates an object. Python uses the __repr__ method to produce a printable representation of the object. Without __repr__, print would show only the object's class and memory address.

We define addition using __add__. Whenever Python sees a+b, it internally executes a.__add__(b). Other mathematical operators can also be defined in the same way.

When objects are initialized directly, the _children tuple is empty. However, when adding or multiplying, the operands are recorded in the _prev set. Moreover, we need to know how a value was created (i.e. keep track of the mathematical operator involved), so we introduce the _op attribute.

We add the label attribute to store the variable name/letter.

Now we need to visualize the graph DAG structure. Thus, we introduce the following code.




2. Micrograd: DAG Visualization¶


We need a function to help us visualize the computational graph built by the Value object.

In [16]:
import sys
print(sys.executable)
/Users/CEO/anaconda3/envs/py36/bin/python
In [17]:
import graphviz
graphviz.__version__, graphviz.version()
Out[17]:
('0.19.1', (10, 0, 1))
In [18]:
from graphviz import Digraph

def trace(root):
    """
    builds a set of all nodes and edges in a graph
    """
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right
    nodes, edges = trace(root)    
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph,create a rectangular ('record') node for it
        dot.node(name = uid, label = "{ %s | data %.4f | grad %.4f }" % (n.label, n.data, n.grad), shape='record')
        if n._op:
            # if this value is a result of some operation, create an op node for it
            dot.node(name = uid + n._op, label=n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)
    
    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)
    
    return dot

We created two functions:

  • trace: This function builds and stores a set of all the nodes and edges in a graph via recursion.
  • draw_dot: This function takes a Value object and builds the computational graph associated with it. First, it recursively finds all the nodes and edges using the trace function and then continuously connect and draw these elements using the graphviz library.

So far, we've created a Value class that allows us to create a simple computational graph built of mathematical operators. We are also able to visualize this graph using the graphviz library. However to learn neural network parameters, we need to be able to calculate the gradients for each parameter in the computational graph with respect to the output node. Therefore, we need to understand backpropagation.




3. Micrograd: Backpropagation¶


Backpropagation is far more general than neural networks (NNs): it is the reverse mode of automatic differentiation, an algorithm for efficiently evaluating the gradient of a loss function with respect to the weights of a NN. The optimization algorithm then uses these gradients to compute the network parameter updates.

It works by iteratively tuning the NN weights (an implementation of the chain rule and differentiation) to minimize the loss function and thereby improve the accuracy of the NN. At a base level, it computes the gradient one layer at a time, iterating backward from the last layer to avoid redundant calculations of intermediate terms in the chain rule. Backprop is the mathematical core of any modern deep NN library like PyTorch or JAX.

As a first step, let's calculate the gradients for each parameter of a simple expression manually below:




3.1. Manual Backpropagation Example 001: Simple Expression¶


In backpropagation, we recursively apply the chain rule backward through the computational graph.

Our expression involves seven values: a, b, c, d, e, f, and the output L. Let us work out the gradient of L with respect to each of them.

Functions:

\begin{align*} a & = 2.0, \quad b = -3.0, \quad c = 10.0, \quad f = -2.0 \\ e & = a \cdot b \\ d & = e + c \\ L & = d \cdot f = -8.0 \\ \end{align*}

\begin{align*} (a, b) & \rightarrow e \\ (e, c) & \rightarrow d \\ (d, f) & \rightarrow L \\ \end{align*} Given: \begin{align*} \frac{dL}{dd} & = f = -2.0 \\ \frac{dL}{df} & = d = e + c = (a)(b) + c = (2)(-3) + 10 = 4.0 \\ \frac{dd}{dc} & = 1.0 \\ \frac{dd}{de} & = 1.0 \\ \frac{de}{da} & = b = -3.0 \\ \frac{de}{db} & = a = 2.0 \\ \end{align*}

We want: \begin{align*} \frac{dL}{dc} & = \frac{dL}{dd} \cdot \frac{dd}{dc} = -2.0 \cdot 1.0 = -2.0 \\ \frac{dL}{de} & = \frac{dL}{dd} \cdot \frac{dd}{de} = -2.0 \cdot 1.0 = -2.0 \\ \frac{dL}{da} & = \frac{dL}{de} \cdot \frac{de}{da} = \frac{dL}{de} \cdot b = -2.0 \cdot -3.0 = 6.0 \\ \frac{dL}{db} & = \frac{dL}{de} \cdot \frac{de}{db} = \frac{dL}{de} \cdot a = -2.0 \cdot 2.0 = -4.0 \\ \end{align*}

In [19]:
a = Value(2.0,label='a')
b = Value(-3.0,label='b')
c = Value(10.0,label='c')

e = a*b; e.label ='e'
d = e+c; d.label = 'd'
f = Value(-2.0, label='f')
L = d * f
L.label = 'L'
L
Out[19]:
Value(data=-8.0, grad=0.0)
In [20]:
L.grad = 1.0                  # dL/dL
f.grad = d.data               # dL/df
d.grad = f.data               # dL/dd
c.grad = e.grad = d.grad      # dL/dc, dL/de
a.grad = e.grad * b.data      # dL/da
b.grad = e.grad * a.data      # dL/db
In [21]:
draw_dot(L)
Out[21]:
[computational graph of L, showing data and grad for each node]

The grad_check function below shows how a change in one variable (input: var) creates a response in the output: it numerically estimates the slope of L with respect to that variable. We can change the step size h if we want; the default is h=0.001.

In [22]:
def grad_check(var, h=0.001): #inline gradient check
    """
    parameters:
    
    var {string} : a,b,c,d,e,f,L
    h {float}: step change
    
    adjust one of a,b,c,d,e,f,L by h and see how it affects L
    estimate dL/di, where i = a,b,c,d,e,f
    """
    def forward(bump=None):
        # rebuild the expression, nudging the chosen variable by h right after it is created
        a = Value(2.0,label='a')
        if bump == 'a': a.data += h
        b = Value(-3.0,label='b')
        if bump == 'b': b.data += h
        c = Value(10.0,label='c')
        if bump == 'c': c.data += h
        e = a*b; e.label = 'e'
        if bump == 'e': e.data += h
        d = e+c; d.label = 'd'
        if bump == 'd': d.data += h
        f = Value(-2.0, label='f')
        if bump == 'f': f.data += h
        L = d * f
        if bump == 'L': L.data += h
        L.label = 'L'
        return L.data

    if var not in ('a', 'b', 'c', 'd', 'e', 'f', 'L'):
        print("The variable is NOT recognized !!! Ensure its one of these (case-sensitive): a, b, c, d, e, f, L")
        return None

    L1 = forward()           # unperturbed forward pass
    L2 = forward(bump=var)   # forward pass with var nudged by h
    return (L2 - L1)/h
In [23]:
print(f"dL/dL = {grad_check('L'):.3f}")
print(f"dL/dd = {grad_check('d'):.3f}")
print(f"dL/df = {grad_check('f'):.3f}")
print(f"dL/dc = {grad_check('c'):.3f}")
print(f"dL/de = {grad_check('e'):.3f}")
print(f"dL/da = {grad_check('a'):.3f}")
print(f"dL/db = {grad_check('b'):.3f}")
dL/dL = 1.000
dL/dd = -2.000
dL/df = 4.000
dL/dc = -2.000
dL/de = -2.000
dL/da = 6.000
dL/db = -4.000
In [24]:
grad_check('df')
The variable is NOT recognized !!! Ensure its one of these (case-sensitive): a, b, c, d, e, f, L



3.2. Preview of a Single Optimization Step¶


If we nudge each variable in the direction of its gradient, L will increase. This is the basic idea of optimization. Let's try it.

Currently L = -8.0. Let's change the input variables in the direction of their gradients.

In [25]:
a.data += 0.01 * a.grad
b.data += 0.01 * b.grad
c.data += 0.01 * c.grad
f.data += 0.01 * f.grad

e = a * b 
d = e + c
L = d * f

print(L.data)
-7.286496

After running the forward pass, L=-7.286. Here, 0.01 is the step size. Indeed we see that after we optimized our parameters using the gradients we manually calculated, and ran the forward pass of our network again, we managed to increase the value of L.




4. Micrograd: Neurons¶




4.1. Biological Neuron vs. Artificial Neuron¶



Fig 1. L-R: A Cartoon Diagram of a Biological Neuron & its Mathematical Model. (Source)

The figure above provides a side-by-side drawing of a biological and artificial neuron. The biological neuron is analogous to artificial neurons in the following terms:

| Biological Neuron | Artificial Neuron |
|---|---|
| Cell Nucleus (Soma) | Node |
| Dendrites | Input |
| Synapse | Weights or interconnections |
| Axon | Output |

Artificial Neuron at a Glance¶


The artificial neuron has the following characteristics:

  • A neuron is a mathematical function modeled on the working of biological neurons
  • It is an elementary unit in an artificial neural network
  • One or more inputs are separately weighted
  • Inputs are summed and passed through a nonlinear function to produce output
  • Every neuron holds an internal state called activation signal
  • Each connection link carries information about the input signal
  • Every neuron is connected to another neuron via connection link

In the next section, let us talk about perceptrons.




4.2. Perceptron¶


The Perceptron was introduced by Frank Rosenblatt in 1957. He proposed a Perceptron learning rule based on the original MCP neuron. A Perceptron is an algorithm for supervised learning of binary classifiers. It enables a neuron to learn by processing the elements in the training set one at a time.

Basic Components of Perceptron¶


Perceptron is a type of artificial neural network, which is a fundamental concept in machine learning. The basic components of a perceptron are:

  1. Input Layer: The input layer consists of one or more input neurons, which receive input signals from the external world or from other layers of the neural network.
  2. Weights: Each input neuron is associated with a weight, which represents the strength of the connection between the input neuron and the output neuron.
  3. Bias: A bias term is added to the input layer to provide the perceptron with additional flexibility in modeling complex patterns in the input data.
  4. Activation Function: The activation function determines the output of the perceptron based on the weighted sum of the inputs and the bias term. Common activation functions used in perceptrons include the step function, sigmoid function, and ReLU function.
  5. Output: The output of the perceptron is a single binary value, either 0 or 1, which indicates the class or category to which the input data belongs.
  6. Training Algorithm: The perceptron is typically trained using a supervised learning algorithm such as the perceptron learning algorithm or backpropagation. During training, the weights and biases of the perceptron are adjusted to minimize the error between the predicted output and the true output for a given set of training examples.

Overall, the perceptron is a simple yet powerful algorithm that can perform binary classification tasks and has paved the way for the more complex neural networks used in deep learning today.

Types of Perceptron models¶


There are two main types of Perceptron models; let us give each a closer look:

  1. Single-Layer Perceptron model: one of the simplest ANN (Artificial Neural Network) types; a feed-forward network with a threshold transfer function inside the model. Its main objective is to classify linearly separable objects with binary outcomes; a single-layer perceptron can learn only linearly separable patterns.
  2. Multi-Layer Perceptron (MLP) model: similar to a single-layer perceptron model but with one or more hidden layers. It operates in two stages:

Forward Stage: activations propagate from the input layer through the hidden layers and terminate at the output layer.

Backward Stage: weight and bias values are modified according to the model's error, i.e. the difference between the actual output and the target, propagated backward from the output layer. A multilayer perceptron has greater processing power and can process both linear and non-linear patterns. Further, it can implement logic gates such as AND, OR, XOR, XNOR, and NOR.

What is the Perceptron Model in Machine Learning?¶


The Perceptron is a machine learning algorithm for supervised learning of binary classification tasks. A perceptron also plays an essential role as an artificial neuron or neural link in detecting certain input data computations in business intelligence. A perceptron model is one of the simplest and most specific types of artificial neural networks. Being a supervised learning algorithm for binary classifiers, we can also consider it a single-layer neural network with four main parameters: input values, weights and bias, net sum, and an activation function.

How Does Perceptron Work?¶


As discussed earlier, the perceptron is considered a single-layer neural link with four main parameters. The perceptron model begins by multiplying all input values by their weights and adding them up to form the weighted sum. This weighted sum is then passed through the activation function to obtain the desired output. In the original perceptron this activation function is the step function, represented by f.

Fig 2. A Single-Layer Perceptron structure. (Source)

Mathematical representation of a single-layer neural network (perceptron), as shown in the figure above:

  1. All the inputs x are multiplied by their weights w.
  2. All the multiplied values are added together with the bias (b, or w_0) to form the weighted sum (W_s): $$ W_s = \sum_{i=1}^{n} w_i x_i + b, \quad \text{where } b = w_0 $$
  3. Apply the chosen activation function f to that weighted sum (a minimal sketch follows below).
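A minimal sketch of these three steps in plain Python (the step activation thresholding at 0 is an assumption for illustration):

def perceptron(x, w, b):
    ws = sum(wi * xi for wi, xi in zip(w, x)) + b   # steps 1-2: weighted sum W_s
    return 1.0 if ws > 0 else 0.0                   # step 3: step activation f

print(perceptron([2.0, 0.0], [-3.0, 1.0], 6.5))     # weighted sum = 0.5 > 0, so output 1.0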



4.3. Activation Functions¶


The activation function of a node in an artificial neural network (ANN) is a function that calculates the output of the node based on its individual inputs and their weights. Nontrivial problems can be solved using only a few nodes if the activation function is nonlinear.

Below is a table of different activation functions.

Table 1: Activation Functions. (Source)
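For reference, a few of these activations written out with NumPy (a sketch; np is already imported above, and the formulas are the standard definitions):

def sigmoid(z):
    return 1 / (1 + np.exp(-z))           # squashes input into (0, 1)

def relu(z):
    return np.maximum(0, z)               # zero for negative inputs, identity otherwise

def leaky_relu(z, alpha=0.01):
    return np.where(z > 0, z, alpha * z)  # small slope alpha for negative inputs

z = np.array([-2.0, 0.0, 2.0])
print(sigmoid(z), relu(z), np.tanh(z))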

Now let's look into an example of manual backpropagation using a neuron.




4.4. Manual Backpropagation Example 002: A Neuron¶


Let us discuss the basic structure of an artificial neuron. Inputs x are multiplied by the corresponding weights w and fed to the neuron. Internally these products are summed, and an extra term called the bias, b, is added to yield the weighted sum, Ws. The result is passed through an activation function f (sigmoid, ReLU, tanh, Leaky ReLU, etc.), which yields the output.

$$ W_s = \sum_{i=1}^{n} w_i x_i + b $$ Eq. 2: Mathematical representation of a NN weighted sum function.

Now, let us implement an artificial neuron and understand how to use backpropagation to tune its weights in order to train the neuron optimally. In the example below we use the tanh activation function, which squashes its input into the range -1 to 1. We can treat arbitrarily complex functions as single operations as long as we know their local derivative; there is no need to break them down into atomic pieces. We implement tanh as a method on the Value class.

In [26]:
# inputs x1,x2
x1 = Value(2.0,label='x1')
x2 = Value(0.0,label='x2')
# weights w1, w2
w1 = Value(-3.0,label='w1')
w2 = Value(1.0,label='w2')
# bias of the neuron
b = Value(6.8813735870195432,label='b')
# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label ='x1w1'
x2w2 = x2*w2; x2w2.label ='x2w2'
x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label='x1w1 + x2w2'
n = x1w1x2w2+b; n.label='n'
o = n.tanh() 
o.label = 'o'

Now we have implemented a complete neuron with two inputs: x1 and x2.

Now when backpropagating, we need to find the local derivative of the tanh function.


$$ o = \tanh(n) \\ \frac{do}{dn} = 1 - \tanh^2(n) \\ $$
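A quick numerical check of this local derivative at the value of n from the example above (a sketch, using the math module imported earlier):

n_val = 0.88137                                    # n = x1*w1 + x2*w2 + b from above
h = 1e-6
numeric = (math.tanh(n_val + h) - math.tanh(n_val)) / h
analytic = 1 - math.tanh(n_val)**2
print(numeric, analytic)                           # both ~0.5, matching do/dn below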

Next, we can go through the exercise of manually calculating the grad values for each of the parameters: x1, x2, w1, w2, x1w1, x2w2, x1w1 + x2w2, b and n

In [27]:
# tanh activation function
plt.plot(np.arange(-5,5,0.2), np.tanh(np.arange(-5,5,0.2)))
plt.grid()
[plot of the tanh activation function]
In [28]:
o.grad = 1.0
n.grad = 1 - o.data**2 # o = tanh(n); do/dn = 1 - tanh(n)**2 = 1 - o**2

#addition simply distributes the derivative
b.grad = n.grad
x1w1x2w2.grad = n.grad
x1w1.grad = x1w1x2w2.grad
x2w2.grad = x1w1x2w2.grad

#we only need the derivatives of the weights
x2.grad = x2w2.grad * w2.data
w2.grad = x2w2.grad * x2.data

x1.grad = x1w1.grad * w1.data
w1.grad = x1w1.grad * x1.data
In [29]:
print(f"do/do = {o.grad:.3f}")
print(f"do/dn = {n.grad:.3f}")
print(f"do/db = {b.grad:.3f}")
print(f"do/d(x1w1 + x2w2) = {x1w1x2w2.grad:.3f}")
print(f"do/d(x1w1) = {x1w1.grad:.3f}")
print(f"do/d(x2w2) = {x2w2.grad:.3f}")
print(f"do/x2 = {x2.grad:.3f}")
print(f"do/w2 = {w2.grad:.3f}")
print(f"do/x1 = {x1.grad:.3f}")
print(f"do/w1 = {w1.grad:.3f}")
do/do = 1.000
do/dn = 0.500
do/db = 0.500
do/d(x1w1 + x2w2) = 0.500
do/d(x1w1) = 0.500
do/d(x2w2) = 0.500
do/dx2 = 0.500
do/dw2 = 0.000
do/dx1 = -1.500
do/dw1 = 1.000
In [30]:
draw_dot(o)
Out[30]:
[computational graph of o, with the manually computed gradients filled in]

Now we understand how to do backpropagation. However, we cannot do this manual backpropagation to set the gradient of each parameter in highly complex neural networks like transformers with billions of parameters (GPT-4 reportedly has on the order of 1.8 trillion parameters). This leads us to implementing backpropagation automatically.




5. Micrograd: Implementing Backward Function for Each Operation¶


We add a _backward function to each of the mathematical operation methods (__add__, __mul__, __pow__, tanh and exp) in the Value class, as shown above. Now there is no need to calculate the individual backpropagation gradient values manually: we can call the _backward methods in the right order, and the gradient of each computational node is calculated automatically.

In [31]:
# inputs x1,x2
x1 = Value(2.0,label='x1')
x2 = Value(0.0,label='x2')
# weights w1, w2
w1 = Value(-3.0,label='w1')
w2 = Value(1.0,label='w2')
# bias of the neuron
b = Value(6.8813735870195432,label='b')
# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label ='x1w1'
x2w2 = x2*w2; x2w2.label ='x2w2'
x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label='x1w1 + x2w2'
n = x1w1x2w2+b; n.label='n'
o = n.tanh() 
o.label = 'o'
In [32]:
o.grad = 1.0           # do/do (initialize to 1.0, which agrees with the derivative of o WRT itself)

o._backward()          # do/dn
n._backward()          # do/db = do/d(x1w1+x2w2)
b._backward()          # b is a leaf node, so its _backward is a no-op
x1w1x2w2._backward()   # do/d(x1w1) = do/d(x2w2)
x2w2._backward()       # do/d(x2), do/d(w2)
x1w1._backward()       # do/d(x1), do/d(w1)
# x1, w1, x2, w2 are leaf nodes, so their _backward functions are no-ops
In [33]:
draw_dot(o)
Out[33]:
[computational graph of o, with gradients from the _backward calls]

Note that we had to set o.grad = 1.0 manually. Why? Because the placeholder gradient value for each node is 0. Each _backward multiplies by the gradient of its output node; if that stayed 0, every gradient in the graph would come out 0 (think about it).

But do we have to call the _backward functions manually? Of course not! We can use a loop, but notice that we need to iterate over the nodes in a specific order: we can't call _backward on a node until every node that consumes its output has already propagated its gradient. For this, we need something called topological sorting.

To calculate the gradient at a node in the neural network, everything that depends on it must be processed before it. This can be achieved using a topological sort, which gives a layout of the graph in which all edges point in one direction (left to right, L-R). In computer science, a topological sort or topological ordering of a directed acyclic graph (DAG) is a linear ordering of its vertices such that for every directed edge UV from vertex U to vertex V, U comes before V in the ordering. Cyclic graphs have no valid topological ordering.


Fig 3. Topological Sort. (Source)

For the directed graph represented in the figure above, this is one of the valid topological orderings of the DAG:

  • A → B → C → D → G → F → E.

To put it simply, in a topological ordering of the vertices, no edge points backward in the linear arrangement. So far in the Value class, we have devised mathematical operation functions with their individual _backward methods; each operation can backpropagate the flowing gradient only to its children. Now we can create a global backward method in the Value class that leverages topological sorting, via an internal build_topo function, to iterate over all the operations in the DAG (the built-up expression) and backpropagate the gradient over the entire computational graph. This method works by:

  • sorting the DAG in a topological order using build_topo
  • storing the topologically sorted list
  • setting the grad of the last node to 1.0
  • iterating over the topologically sorted list in reverse order (backpropagation: reverse differentiation) and applying the _backward method of each mathematical operation (a small demonstration follows below).
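To make the ordering concrete, here is a minimal sketch (reusing the Value class above) that rebuilds the topological list for a tiny expression and prints the labels in the order backward would visit them:

a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = a * b; c.label = 'c'
d = c + a; d.label = 'd'    # note that a is used twice

topo, visited = [], set()
def build_topo(v):
    if v not in visited:
        visited.add(v)
        for child in v._prev:
            build_topo(child)
        topo.append(v)
build_topo(d)

# reversed order = the order in which backward() calls _backward on each node;
# since _prev is a set, the relative order of siblings may vary between runs
print([v.label for v in reversed(topo)])    # e.g. ['d', 'c', 'b', 'a']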
In [34]:
# inputs x1,x2
x1 = Value(2.0,label='x1')
x2 = Value(0.0,label='x2')
# weights w1, w2
w1 = Value(-3.0,label='w1')
w2 = Value(1.0,label='w2')
# bias of the neuron
b = Value(6.8813735870195432,label='b')
# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label ='x1w1'
x2w2 = x2*w2; x2w2.label ='x2w2'
x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label='x1w1 + x2w2'
n = x1w1x2w2+b; n.label='n'
o = n.tanh() 
o.label = 'o'
In [35]:
o.backward()
In [36]:
draw_dot(o)
Out[36]:
[computational graph of o after o.backward()]



6. Micrograd: Fixing a Backprop Bug when One Node is Used Multiple Times¶


A problem arises with our current implementation of Value when a variable/node in the DAG is used more than once. Let's see this in the example below.

Note: the Value class defined above already accumulates gradients with +=, so re-running the notebook from the top will actually produce the correct answer in the cell below. The discussion that follows recreates the original bug, which existed before the Value class was updated to account for accumulation of gradients.

In [37]:
a = Value(3.0, label = 'a')
b = a + a ; b.label ='b'
b.backward()
draw_dot(b)
Out[37]:
[computational graph of b = a + a]

The gradient of b WRT a (a.grad, i.e. db/da) in the cell above should be 2, but the pre-fix answer is 1, which is wrong. The gradient of the output was simply propagated to the input once: because the same object appears as both operands, the second write overwrites the first. We will get a wrong answer whenever we use a variable more than once unless we fix the bug, as the sketch below shows.
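To see the mechanics of the bug in isolation, here is a minimal sketch with a stand-in class (Tiny is hypothetical, for illustration only): with plain assignment, the second write to a.grad overwrites the first when the same node is used twice, while += accumulates correctly.

class Tiny:
    def __init__(self, data):
        self.data, self.grad = data, 0.0

a = Tiny(3.0)
out_grad = 1.0              # gradient flowing in from b = a + a

a.grad = 1.0 * out_grad     # contribution from the first operand...
a.grad = 1.0 * out_grad     # ...overwritten by the second operand
print(a.grad)               # 1.0 (wrong)

a.grad = 0.0
a.grad += 1.0 * out_grad    # accumulate first contribution
a.grad += 1.0 * out_grad    # accumulate second contribution
print(a.grad)               # 2.0 (correct)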

In [38]:
a = Value(-2.0,label='a')
b = Value(3.0,label='b')


d = a * b   ; d.label ='d'
e = a + b   ; e.label = 'e'
f = d * e   ; f.label='f'

f.backward()

draw_dot(f)
Out[38]:
[computational graph of f, where a feeds both d = a*b and e = a+b]



6.1. Multivariate Chain Rule for Derivatives¶


The solution is that gradients must be accumulated as we propagate from the output to each parameter node. This follows from the multivariate chain rule. Let's work through an example.

$$ h=g(r,s,t)=r^2-rs+t^3 \\ r = x \cos y \\ s = xe^y \\ t = x+y \\ $$

Implementing the chain rule here requires the computation of partial derivatives, since we are working with multiple independent variables. Furthermore, $r$, $s$ and $t$ act as our intermediate variables. The formulae we will be working with, defined with respect to each input, are:

$$ \frac{\partial h}{\partial x} = \frac{\partial h}{\partial r}\frac{\partial r}{\partial x} + \frac{\partial h}{\partial s}\frac{\partial s}{\partial x} + \frac{\partial h}{\partial t}\frac{\partial t}{\partial x} \\ \frac{\partial h}{\partial y} = \frac{\partial h}{\partial r}\frac{\partial r}{\partial y} + \frac{\partial h}{\partial s}\frac{\partial s}{\partial y} + \frac{\partial h}{\partial t}\frac{\partial t}{\partial y} $$

From these formulae, we can see that we now need to find nine different partial derivatives:

$$ \frac{\partial h}{\partial r} = 2r - s, \quad \frac{\partial h}{\partial s} = -r, \quad \frac{\partial h}{\partial t} = 3t^2 \\ \frac{\partial r}{\partial x} = \cos y, \quad \frac{\partial s}{\partial x} = e^y, \quad \frac{\partial t}{\partial x} = 1 \\ \frac{\partial r}{\partial y} = -x \sin y, \quad \frac{\partial s}{\partial y} = xe^y, \quad \frac{\partial t}{\partial y} = 1 $$

Again, we proceed to substitute these terms in the formulae for $\partial h/\partial x$ and $\partial h/\partial y$:

$$ \frac{\partial h}{\partial x} = (2r - s)\cos y - re^y + 3t^2 \\ \frac{\partial h}{\partial y} = -(2r - s)\,x\sin y - rxe^y + 3t^2 $$

And subsequently substitute for $r$, $s$ and $t$ to find the derivatives:

$$ \frac{\partial h}{\partial x} = 2x\cos^2 y - 2xe^y\cos y + 3(x+y)^2 \\ \frac{\partial h}{\partial y} = -2x^2\sin y\cos y + x^2 e^y(\sin y - \cos y) + 3(x+y)^2 $$

Which may be simplified a little further (applying the trigonometric identity $2\sin y\cos y=\sin 2y$ to $\partial h/\partial y$):

$$ \frac{\partial h}{\partial y} = -x^2\sin 2y + x^2 e^y(\sin y - \cos y) + 3(x+y)^2 $$
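As a quick numerical sanity check of the $\partial h/\partial x$ formula (a sketch using finite differences; the test point is arbitrary):

def h_fn(x, y):
    r = x * math.cos(y)
    s = x * math.exp(y)
    t = x + y
    return r**2 - r*s + t**3

def dh_dx(x, y):   # analytic: 2x cos^2(y) - 2x e^y cos(y) + 3(x+y)^2
    return 2*x*math.cos(y)**2 - 2*x*math.exp(y)*math.cos(y) + 3*(x + y)**2

x, y, eps = 1.5, 0.3, 1e-6
numeric = (h_fn(x + eps, y) - h_fn(x, y)) / eps
print(numeric, dh_dx(x, y))   # the two values should agree closely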

In backpropagation, the gradient at a node is computed additively: the contributions from all paths through the graph add up. In the Value class, we therefore replace = with += in the self.grad and other.grad updates inside the local _backward methods of __add__, __mul__, and tanh, reflecting the accumulation required by the multivariate chain rule. Now we get the correct gradients in the examples below.

In [39]:
a = Value(3.0, label = 'a')
b = a + a ; b.label ='b'
b.backward()
draw_dot(b)
Out[39]:
[computational graph of b = a + a, now with the correct a.grad = 2]
In [40]:
a = Value(-2.0,label='a')
b = Value(3.0,label='b')


d = a * b   ; d.label ='d'
e = a + b   ; e.label = 'e'
f = d * e   ; f.label='f'

f.backward()

draw_dot(f)
Out[40]:
[computational graph of f, now with correctly accumulated gradients]



7. Micrograd: Adding More Mathematical Operations¶


Now let’s further develop our Value class object by adding more functionality and operations. Operations that are added include __rmul__, __pow__, __truediv__, __neg__, __sub__, exp and other operators needed.

Also, we can handle constants like the 1 in a + 1 by adding the following line to the __add__ method, which wraps the constant in a Value object: other = other if isinstance(other, Value) else Value(other).

We can handle 1 + a via the __radd__ operator. By default Python evaluates 1 + a as (1).__add__(a), and the built-in int does not know how to add a user-defined Value object. When __add__ fails and the right operand provides __radd__, Python swaps the operands and calls a.__radd__(1), so 1 + a is effectively evaluated as a + 1. The same logic applies to __mul__ and __rmul__.

In [41]:
a = Value(2.0)
b = Value(4.0)
a+1
Out[41]:
Value(data=3.0, grad=0.0)
In [42]:
a*1
Out[42]:
Value(data=2.0, grad=0.0)
In [43]:
2*a
Out[43]:
Value(data=4.0, grad=0.0)
In [44]:
a.exp()
Out[44]:
Value(data=7.38905609893065, grad=0.0)
In [45]:
a/b
Out[45]:
Value(data=0.5, grad=0.0)
In [46]:
a * 1/b
Out[46]:
Value(data=0.5, grad=0.0)
In [47]:
a * (b**-1)
Out[47]:
Value(data=0.5, grad=0.0)
In [48]:
a-b
Out[48]:
Value(data=-2.0, grad=0.0)
In [49]:
# inputs x1,x2
x1 = Value(2.0,label='x1')
x2 = Value(0.0,label='x2')
# weights w1, w2
w1 = Value(-3.0,label='w1')
w2 = Value(1.0,label='w2')
# bias of the neuron
b = Value(6.8813735870195432,label='b')
# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label ='x1w1'
x2w2 = x2*w2; x2w2.label ='x2w2'
x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label='x1w1 + x2w2'
n = x1w1x2w2+b; n.label='n'
o = n.tanh() 
o.label = 'o'
o.backward()
In [50]:
draw_dot(o)
Out[50]:
[computational graph of o after o.backward()]



7.1. Breaking up tanh Function into its Parts¶


Currently, our Value class supports the tanh operation as a complete block. What if we wanted to break it into its constituting parts by explicitly including the tanh calculation in a computational graph?

To achieve tanh implementation, we need the following operators:

  1. exponentiation (exp)
  2. power (__pow__)
  3. division (__truediv__, __rtruediv__)
  4. multiplication (__mul__, __rmul__)
  5. subtraction (__neg__, __sub__)
  6. addition (__add__, __radd__)

Most of the operators listed above have already been added to our Value class; the rest are implemented there as well. Now we can break the tanh function into its parts, so that the complete calculation is visualized in the computational DAG below.

In [51]:
# inputs x1,x2
x1 = Value(2.0,label='x1')
x2 = Value(0.0,label='x2')
# weights w1, w2
w1 = Value(-3.0,label='w1')
w2 = Value(1.0,label='w2')
# bias of the neuron
b = Value(6.8813735870195432,label='b')
# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label ='x1w1'
x2w2 = x2*w2; x2w2.label ='x2w2'
x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label='x1w1 + x2w2'
n = x1w1x2w2+b; n.label='n'
# -----------
e = (2*n).exp()
o = (e - 1)/(e + 1)
# ------------
o.label = 'o'
o.backward()
In [52]:
draw_dot(o)
Out[52]:
[computational graph of o, with tanh expanded into its constituent exp, add, subtract and divide operations]



8. Micrograd: A PyTorch Comparison¶


We can use PyTorch to run the same architecture we implemented. We developed our own autograd computational graph object (Value) and validated it as a data structure and backpropagation engine for arbitrarily complex DAGs.

  • PyTorch uses n-dimensional tensors as its basic data structure. To define a scalar, we use a tensor with a single element.

  • PyTorch tensors default to single precision (torch.float32), whereas Python floats are double precision (64-bit). Thus, for consistency with our Value objects, we use the .double() method to cast PyTorch tensors to torch.float64.

  • PyTorch does not calculate gradients for tensors by default. Thus, we must explicitly state which variables require gradients:

    1. var.requires_grad = True: indicates that the gradients with respect to var will be computed during backpropagation.
    2. var.grad.item(): returns the value of the var.grad tensor as a standard Python number. This only works for tensors with one element.

To validate our implementation of Micrograd, we will implement the exact same computational graph from before in Pytorch and confirm that the results are identical. Run the following Python code cells below to see PyTorch in action.

In [53]:
import sys
print(sys.executable)
/Users/CEO/anaconda3/envs/py36/bin/python
In [54]:
import torch
In [55]:
torch.Tensor([[1, 2, 3], [4, 5, 6]]),\
torch.Tensor([[1, 2, 3], [4, 5, 6]]).shape
Out[55]:
(tensor([[1., 2., 3.],
         [4., 5., 6.]]),
 torch.Size([2, 3]))
In [56]:
torch.tensor([2.0]).dtype,\
torch.tensor([2.0]).double().dtype 
Out[56]:
(torch.float32, torch.float64)
In [57]:
x1 = torch.tensor([2.0]).double()     ;x1.requires_grad=True
w1 = torch.tensor([-3.0]).double()    ;w1.requires_grad=True
x2 = torch.tensor([0.0]).double()     ;x2.requires_grad=True
w2 = torch.tensor([1.0]).double()     ;w2.requires_grad=True
b  = torch.tensor([6.8813735870195432]).double()     ;b.requires_grad=True
n = w1*x1 + w2*x2 + b
o = torch.tanh(n)

print(o.data.item())
o.backward()

print('-----')
print('x2', x2.grad.item())
print('w2', w2.grad.item())
print('x1', x1.grad.item())
print('w1', w1.grad.item())
0.7071066904050358
-----
x2 0.5000001283844369
w2 0.0
x1 -1.5000003851533106
w1 1.0000002567688737
In [58]:
o, o.item(), o.data.item()
Out[58]:
(tensor([0.7071], dtype=torch.float64, grad_fn=<TanhBackward0>),
 0.7071066904050358,
 0.7071066904050358)



9. Micrograd: Building a NN Library (Multi-Layer Perceptron, MLP)¶


9.1. Neural Network architectures (cs231n)¶


Layer-wise organization¶


Neural Networks as neurons in graphs. Neural Networks are modeled as collections of neurons that are connected in an acyclic graph. In other words, the outputs of some neurons can become inputs to other neurons. Cycles are not allowed since that would imply an infinite loop in the forward pass of a network. Instead of amorphous blobs of connected neurons, Neural Network models are often organized into distinct layers of neurons. For regular neural networks, the most common layer type is the fully-connected layer, in which neurons between two adjacent layers are fully pairwise connected but neurons within a single layer share no connections. Below are two example Neural Network topologies that use a stack of fully-connected layers:


Fig 4. L-R: A Diagram of Two Multi-Layer Perceptrons (MLP): a 2-layer NN, a 3-layer NN. (Source)

Left: A 2-layer NN (one hidden layer of 4 neurons (or units) and one output layer with 2 neurons), and three inputs.
Right: A 3-layer NN with three inputs, two hidden layers of 4 neurons each and one output layer.
Notice that in both cases there are connections (synapses) between neurons across layers, but not within a layer.

Naming conventions. Notice that when we say N-layer neural network, we do not count the input layer. Therefore, a single-layer neural network describes a network with no hidden layers (input directly mapped to output). In that sense, you can sometimes hear people say that logistic regression or SVMs are simply a special case of single-layer Neural Networks. You may also hear these networks interchangeably referred to as “Artificial Neural Networks” (ANN) or “Multi-Layer Perceptrons” (MLP). Many people do not like the analogies between Neural Networks and real brains and prefer to refer to neurons as units.

Output layer. Unlike all layers in a Neural Network, the output layer neurons most commonly do not have an activation function (or you can think of them as having a linear identity activation function). This is because the last output layer is usually taken to represent the class scores (e.g. in classification), which are arbitrary real-valued numbers, or some kind of real-valued target (e.g. in regression).

Sizing neural networks. The two metrics that people commonly use to measure the size of neural networks are the number of neurons, or more commonly the number of parameters. Working with the two example networks in the above picture (a small counting sketch follows the list):

  • The first network (left) has 4 + 2 = 6 neurons (not counting the inputs), [3 x 4] + [4 x 2] = 20 weights and 4 + 2 = 6 biases, for a total of 26 learnable parameters.
  • The second network (right) has 4 + 4 + 1 = 9 neurons, [3 x 4] + [4 x 4] + [4 x 1] = 12 + 16 + 4 = 32 weights and 4 + 4 + 1 = 9 biases, for a total of 41 learnable parameters.
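These counts can be checked with a small helper (a sketch; mlp_size is our own name, not part of micrograd):

def mlp_size(nin, nouts):
    """count neurons, weights and total learnable parameters of a fully-connected MLP"""
    sizes = [nin] + nouts
    neurons = sum(nouts)     # one bias per neuron
    weights = sum(sizes[i] * sizes[i+1] for i in range(len(nouts)))
    return neurons, weights, neurons + weights

print(mlp_size(3, [4, 2]))      # (6, 20, 26): the 2-layer net on the left
print(mlp_size(3, [4, 4, 1]))   # (9, 32, 41): the 3-layer net on the right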

To give you some context, modern Convolutional Networks contain on the order of 100 million parameters and are usually made up of approximately 10-20 layers (hence deep learning). However, as we will see, the number of effective connections is significantly greater due to parameter sharing. More on this in the Convolutional Neural Networks module.

Now let’s develop the machinery to create a small neural network. First, we define a class to implement a single neuron. Then we build the layer. And finally, we create an MLP. The breakdown of each class object is stated below:

  • Class Neuron contains code for a computational unit that receives nin inputs and outputs the result of a tanh operation on a weighted sum of the inputs plus a bias term.

  • Class Layer aggregates several neurons such that it takes certain data as input (of size nin) and passes it through nout neurons. We do a small check in the end where if the output contains only one neuron we return it as-is and not in a list.

  • Class MLP is the final abstraction, in which we aggregate the Layer instances defined by nouts (one layer per entry) and pass the input data x through each of them in succession.

For each class, we added a parameters function that collects all trainable parameters. This will come in handy later when we train the network.

In [59]:
import random

class Neuron:
    """ Defines a single neuron"""

    def __init__(self, nin):
        """ Assign random weights and biases to the inputs 
        Parameters
        ----------
        nin: int
            inputs to the neuron
        """
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1,1))

    def __call__(self, x): # w * x + b
        """ Calculate the output of the neuron
        Parameters
        ----------
        x: list
            input values to the neuron

        Returns
        ----------
        Value
            output of the neuron
        """
        act = sum((wi*xi for wi, xi in zip(self.w, x)),  self.b)
        out = act.tanh()
        return out
    
    def parameters(self):
        return self.w + [self.b]
    
    
class Layer:
    """ Defines a layer of neurons """

    def __init__(self, nin, nout):
        """ Initialize the neurons of the layer
        Parameters
        ----------
        nin: int
            number of inputs to the layer
        nout: int
            number of neurons in the layer
        """
        self.neurons = [Neuron(nin) for _ in range(nout)]
  
    def __call__(self, x):
        """ Calculate the outputs of a layer
        Parameters
        ----------
        x: list
            input values to the neuron

        Returns
        ----------
        Value or list of Value
            outputs of the layer
        """
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs
    
    def parameters(self):
        return [param for neuron in self.neurons for param in neuron.parameters()]
    
    
class MLP:
    """ Defines a multilayer perceptron"""
    
    def __init__(self, nin, nouts): # nouts defines the sizes of all the layers
        """ Initialize the layers of MLP
        Parameters
        ----------
        nin: int
            number of inputs to the MLP
        nouts: list
            number of neurons in each layer
        """
        sz = [nin] + nouts
        self.layers= [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]

    def __call__(self, x):
        """ Calculate the outputs of a layer sequentially
        Parameters
        ----------
        x: list
            input values to the MLP

        Returns
        ----------
        list
            outputs of the MLP 
        """
        for layer in self.layers:
            x = layer(x)
        return x
    
    def parameters(self):
        return [param for layer in self.layers for param in layer.parameters()]
In [60]:
x = [2.0, 3.0]
a = Neuron(2)
a(x)
Out[60]:
Value(data=0.9105848567129079, grad=0.0)
In [61]:
x = [2.0, 3.0]
b =  Layer(2, 3)
b(x)
Out[61]:
[Value(data=0.9835475868478987, grad=0.0),
 Value(data=-0.18080027900666776, grad=0.0),
 Value(data=-0.36304820207442917, grad=0.0)]
In [62]:
# inputs to the nn
x = [2.0, 3.0, -1.0]

# initialize the MLP
n = MLP(3, [4, 4, 1])

# run the NN
n(x)
Out[62]:
Value(data=-0.03132176728538012, grad=0.0)
In [63]:
len(n.parameters())
Out[63]:
41



10. Micrograd: Creating a Tiny Dataset & Writing the Loss Function¶


Now let’s create a tiny data set. We have inputs and desired targets. We will run the neural network in the forward direction. This is a small, simple binary classification problem.

In [64]:
xs =[
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]

#target values (simple binary classification)
ys =[1.0, -1.0, -1.0, 1.0]

#run the neural network on all the inputs
# ypred = [n(x) for x in xs]
# ypred

We want the neural network to generate predicted values that are as close to the target values as possible. We should tune the weights. But how???

  • Backpropagation !!!

Before that, we must calculate a single number (the loss) to measure the network's performance. There are various loss functions (mean squared error, binary cross-entropy loss, etc.) that can be used. We use the mean squared error (MSE) as the loss function (in the code below we keep the sum of squared errors, which differs from the mean only by a constant factor). Our goal is to minimize the loss so as to improve model performance.

$$ \text{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2 $$ Eq. 4: Mean Squared Error (MSE). (Source)

In [65]:
ypred = [n(x) for x in xs]
loss = 0
for ygt, yout in zip(ys, ypred):
    loss += (yout - ygt)**2 
print(loss)
Value(data=5.625114897319604, grad=0.0)
In [66]:
'''loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred)) # ygt = y_ground_truth, yout = y_output
loss''' # kept as a string for reference: sum() starts its accumulator at the int 0, which raised an 'int + Value' error before __radd__ was added to the Value class
Out[66]:
'loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred)) # ygt = y_ground_truth, yout = y_output\nloss'



11. Micrograd: Neural Net Parameters¶


Now we've defined the loss, and backpropagation gives us the gradient of the loss with respect to every node. However, the gradients WRT the inputs are of limited use, since the inputs are fixed; we need to collect the parameters we can actually change. This is why we added a parameters method to the Neuron, Layer, and MLP classes above, which collects all trainable parameters. It comes in handy now that we train the network.

In [67]:
n.layers[0].neurons[0].w[0].grad
Out[67]:
0.0
In [68]:
n.layers[0].neurons[0].w[0].data
Out[68]:
-0.3188397214186136

We need to update the parameters using their gradient information. The gradient is a vector that points in the direction of increasing loss; thus, we must move in the opposite direction of the gradient.

Finally! We are going to train our neural network to fit the dummy data points we showed before. To train our net we need to repeat the forward pass -> loss calculation -> backward pass -> parameter update procedure. We do it manually in the cells below.

In [69]:
ypred = [n(x) for x in xs]
loss = 0
for ygt, yout in zip(ys, ypred):
    loss += (yout - ygt)**2 
print(loss)
Value(data=5.625114897319604, grad=0.0)
In [70]:
loss.backward()
In [71]:
for p in n.parameters():
    p.data += -0.05 * p.grad
In [72]:
ypred = [n(x) for x in xs]
ypred
Out[72]:
[Value(data=0.5438762632690177, grad=0.0),
 Value(data=0.5956186061958746, grad=0.0),
 Value(data=-0.4498238385421293, grad=0.0),
 Value(data=0.5037095702051143, grad=0.0)]



12. Micrograd: Manual Gradient Descent Optimization & Training the MLP¶


Gradient Descent:
Now we encounter gradient descent (G.D.) for optimization. Gradient descent is a method for unconstrained mathematical optimization: a first-order iterative algorithm for finding a local minimum of a differentiable multivariate function.

The idea is to take repeated steps in the opposite direction of the gradient (or approximate gradient) of the function at the current point, because this is the direction of steepest descent. It is particularly useful in machine learning for minimizing the cost or loss function: to reach the lowest point on the loss surface, we repeatedly step along the direction of the steepest downward slope. This is what the gradient descent algorithm does during each training epoch or iteration; a one-dimensional sketch follows.
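A one-dimensional sketch of the idea, minimizing f(x) = 3*x**2 - 4*x + 5 from section 0.1 (its minimum is at x = 2/3) by repeatedly stepping against the analytic gradient f'(x) = 6x - 4:

x = 5.0                 # arbitrary starting point
lr = 0.05               # learning rate (step size)
for _ in range(100):
    grad = 6*x - 4      # f'(x)
    x -= lr * grad      # step in the opposite direction of the gradient
print(x)                # converges toward 2/3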

Steps for implementing gradient descent:

Iterate a certain number of times (n_iter):

  1. Forward Pass: Compute the predictions from the inputs using the MLP, then compute the loss between the target and predicted values using the mean squared error (MSE) loss function.
  2. Backward Pass: Reset the gradients of the network to zero, then perform backpropagation on our MLP model.
  3. Parameter Update: Update the parameters (weights and biases) of the MLP using their gradient information.

Neural networks can be trained by:

  1. Constructing an instance with the desired architecture
  2. Calling it on inputs to make predictions
  3. Calculating loss between predictions and targets
  4. Calling a method to calculate gradients
  5. Calling a method to update parameters
  6. Repeating steps 2-5 over many batches of training data (Gradient Descent Optimization)

Instead of manually training the MLP like we did in the previous section, we can write a simple train_MLP function that implements the training loop for us.

In [73]:
x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
n(x)
Out[73]:
Value(data=-0.6041503111538816, grad=0.0)
In [74]:
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]

# target values (simple binary classification)
ys = [1.0, -1.0, -1.0, 1.0]
In [75]:
def train_MLP(n_iter=20, learning_rate=0.01):
    losses, steps = [], []
    for k in range(n_iter):
        steps.append(k + 1)

        # forward pass: predictions + sum-of-squared-errors loss
        ypred = [n(x) for x in xs]
        total_loss = 0
        for ygt, yout in zip(ys, ypred):
            total_loss += (yout - ygt)**2
        losses.append(total_loss.data)

        # backward pass: zero the gradients first, because backward()
        # accumulates (+=) into .grad and stale values would otherwise leak in
        for p in n.parameters():
            p.grad = 0.0
        total_loss.backward()

        # parameter update: step each parameter against its gradient
        for p in n.parameters():
            p.data += -learning_rate * p.grad

        print(k, total_loss.data)
    print(ypred)
    return [steps, losses]
In [76]:
_train_loss = train_MLP(40, 0.05)
0 5.485189247043687
1 3.3421234365691754
2 2.584049020397665
3 1.9610887856041734
4 1.4208451979993688
5 0.9652687705981344
6 0.6646346234265398
7 0.47428422790045377
8 0.3615123589435135
9 0.2870300719366895
10 0.23489049651030347
11 0.19691204612563087
12 0.16831557271479816
13 0.14617933935615032
14 0.12864178050918224
15 0.1144722088955907
16 0.10283027925617594
17 0.09312577909436154
18 0.08493384323191937
19 0.07794195214308396
20 0.071915798959111
21 0.066676715938482
22 0.06208638656923887
23 0.05803626867652136
24 0.0544401349246291
25 0.051228719740150565
26 0.048345816790658386
27 0.04574539276626418
28 0.043389424538740215
29 0.041246258705737185
30 0.03928935342896755
31 0.037496303502279935
32 0.03584807765559297
33 0.03432841658324537
34 0.032923353888887105
35 0.03162083189798568
36 0.03041039131962961
37 0.029282918859381307
38 0.028230440651596557
39 0.027245952177094315
[Value(data=0.9311099191634069, grad=-0.13778016167318619), Value(data=-0.9800438376407024, grad=0.03991232471859529), Value(data=-0.8834365990471585, grad=0.2331268019056829), Value(data=0.90772414139322, grad=-0.18455171721356)]
In [77]:
ypred = [n(x) for x in xs]
ypred
Out[77]:
[Value(data=0.9322752068220904, grad=0.0),
 Value(data=-0.9804343270784147, grad=0.0),
 Value(data=-0.885414663220842, grad=0.0),
 Value(data=0.9093136457027569, grad=0.0)]
In [78]:
# Retrieve the recorded loss values (train_MLP returns [steps, losses])
train_values = _train_loss[1]

# The sequence of iteration numbers
n_iter = _train_loss[0]

# Plot and label the training loss values
plt.plot(n_iter, train_values, label='Training Loss')

# Add a title and axis labels
plt.title('Training Loss')
plt.xlabel('Iterations')
plt.ylabel('Loss')

# Set the tick locations
# plt.xticks(np.arange(0, 21, 2))

# Display the plot
plt.legend(loc='best')
plt.show()
[Plot: training loss vs. iteration, decreasing steadily over the 40 steps]

A common bug when training a neural network is forgetting to clear the parameters' gradient information between iterations. Recall that backward() accumulates gradients with += rather than =, because a node may be used more than once in the computational graph, and its correct gradient is the sum of all of the contributions.
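
As a quick reminder of why the accumulation matters, a sketch using the Value class from the earlier sections (assuming its Value(data) constructor):

a = Value(3.0)
b = a + a        # `a` feeds the graph in two places
b.backward()
print(a.grad)    # 2.0: each use contributes db/da = 1.0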

In the manual steps in the prior section, we forgot to clear the gradient values, so the gradients accumulated across subsequent backward passes, passes that were performed with different network weights! This makes no sense, of course, so as a simple fix we set every gradient to zero before calling backward() (in PyTorch, use .zero_grad()).
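
For comparison, the same zero-then-backward pattern in PyTorch (a minimal sketch with a toy tensor, not our MLP):

import torch

w = torch.tensor([0.5], requires_grad=True)  # toy parameter
opt = torch.optim.SGD([w], lr=0.05)

for _ in range(10):
    loss = ((w * 2.0 - 1.0) ** 2).sum()  # toy loss
    opt.zero_grad()    # clear stale gradients BEFORE backward
    loss.backward()    # fresh gradients accumulate into w.grad
    opt.step()         # gradient-descent update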




13. Micrograd: Summary¶


The summary below encapsulates the concepts discussed, covering the basics of neural networks and their training process. Building and implementing Micrograd, a tiny autograd engine, reinforces the fundamental principles behind neural networks and backpropagation. Summary points:

  • Neural Networks (NNs):
      - Neural networks are mathematical expressions that process data.
      - They are composed of simple mathematical operations.
      - In the case of a multi-layer perceptron, they take input data along with weights and biases as parameters.

  • Forward Pass and Loss Function:
      - The forward pass applies the network's mathematical expression to the input data.
      - It is followed by evaluating a loss function, which measures the accuracy of the predictions.
      - Lower loss indicates better performance: predictions align closely with targets.

  • Backpropagation and Gradient Descent:
      - Backpropagation computes gradients, which indicate how each parameter affects the loss.
      - The gradients guide the adjustment of parameters to reduce the loss.
      - Gradient descent iteratively updates the parameters to minimize the loss function.

  • Power and Complexity:
      - Neural networks are powerful enough to address very complex problems.
      - They can have billions or even trillions of parameters.
      - Despite this complexity, they operate on the same fundamental principles of gradient descent and loss minimization.

  • Emergent Properties of NNs:
      - Neurons within neural networks exhibit emergent properties when solving challenging problems.
      - These properties arise from interactions within the network during training.

  • Application in Language Modeling:
      - Neural networks are used extensively in language modeling, e.g. predicting the next word in a sequence.
      - Even large-scale models like GPT rely on the same fundamental training principles.
      - Although far more complex, they still come down to gradient descent and loss minimization.

  • Fundamental Principles:
      - While specific implementations vary, the fundamental principles of neural network training remain consistent.
      - Variations include different gradient descent techniques (stochastic G.D., etc.) and loss functions (cross_entropy_loss, etc.) tailored to specific tasks.
      - Despite these variations, the core concepts remain universal.

  • Understanding and Implementation:
      - Understanding the foundational principles of neural networks makes their workings intuitive.
      - Knowing the inner workings allows for better comprehension and implementation of related concepts.




References¶


  1. "The spelled-out intro to neural networks and backpropagation: building micrograd" youtube video, Aug 2022.
  2. Andrej Karpathy Micrograd github repo.
  3. Andrej Karpathy Neural Networks: Zero to Hero github repo (notebooks to follow video tutorial with).
  4. Andrej Karpathy Micrograd follow-up excercises for practice.
  5. Article: "Backpropagation with Andrej Karpathy" - Kavishka Abeywardana, Pt 1, 2, 3, Feb 2024.
  6. "Back(prop) to Basics — Fundamentals of Neural Networks" - Elad Rapaport, article, May 2023.
  7. "What is Perceptron: A Beginners Guide" - Mayank Banoula, article, May 2023.
  8. "What the Hell is Perceptron? The Fundamentals of Neural Networks" - Sagar Sharma, article, Sept 2017.
  9. "Topological Sort: In Typescript and C#" - Harish Reddy Konduru, article, June 2020.
  10. "Automatic Differentiation Part 2: Implementation Using Micrograd" - Aritra Roy Gosthipaty & Ritwik Raha, article, Dec 2022.
  11. "The Chain Rule of Calculus – Even More Functions" - Stefania Cristina, article, March 2022.
  12. CS231n Convolutional Neural Networks for Visual Recognition course website
  13. CS231n "Neural Networks Part 1: Setting up the Architecture" lecture notes
  14. Wikipedia: derivative, backpropagation, activation function, topological sorting, multivariate chain rule, gradient descent
  15. PyTorch Documentation link.
  16. OpenAI chatGPT: Helped in preparing Section 13. Micrograd: Summary notes