Full-matrix approach to backpropagation in an artificial neural network

I have recently been learning about artificial neural networks (ANNs) and I have a working Python implementation based on mini-batch training. I followed Michael Nielsen's book Neural Networks and Deep Learning, which gives a step-by-step explanation of every algorithm for beginners. It also contains fully working code for handwritten digit recognition, which runs fine for me.
However, I am trying to modify the code slightly so that the entire mini-batch is passed through at once and the network is trained with backpropagation in matrix form. I have written working code for this as well, but it runs very slowly. Is there a way to implement a full-matrix approach to mini-batch learning for the network based on the backpropagation algorithm?
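To make concrete what I mean by "matrix form", here is a minimal sketch of a feed-forward pass where the mini-batch examples are stacked as columns of a single matrix (this is only an illustration of the idea; feed_forward_batch, X_batch, W and b are hypothetical names, not variables from my code below):

import numpy as np

# Sketch only: one feed-forward pass over a whole mini-batch at once.
# Each training example is a column of X_batch, so a layer with weight
# matrix W (shape: n_out x n_in) and bias column b (shape: n_out x 1)
# processes all samples in a single dot product.
def feed_forward_batch(weights, biases, X_batch):
    A = X_batch                       # shape: (n_in, batch_size)
    for W, b in zip(weights, biases):
        Z = np.dot(W, A) + b          # b broadcasts across the batch columns
        A = 1.0 / (1.0 + np.exp(-Z))  # sigmoid applied element-wise
    return A                          # shape: (n_out, batch_size)

My current implementation, which gives correct results but runs very slowly, is below: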
import numpy as np
import pandas as pd
class Network:
    def __init__(self, sizes):
        self.layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for y, x in zip(sizes[1:], sizes[:-1])]

    def feed_forward(self, a):
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w, a) + b)
        return a

    # Calculate the cost derivative (gradient of C w.r.t. 'a' - nabla C(a))
    def cost_derivative(self, output_activation, y):
        return (output_activation - y)
    def update_mini_batch(self, mini_batch, eta):
        from scipy.linalg import block_diag
        n = len(mini_batch)
        # Stack the mini-batch inputs and targets into block-diagonal matrices,
        # one block per training example.
        xs = [x for x, y in mini_batch]
        features = block_diag(*xs)
        ys = [y for x, y in mini_batch]
        responses = block_diag(*ys)
        # Build block-diagonal weight matrices: n identical copies of each
        # layer's weight matrix along the diagonal.
        ws = [a for a in self.weights for i in xrange(n)]
        new_list = []
        k = 0
        while k < len(ws):
            new_list.append(ws[k: k + n])
            k += n
        weights = [block_diag(*elems) for elems in new_list]
        # Same construction for the biases.
        bs = [b for b in self.biases for i in xrange(n)]
        new_list2 = []
        j = 0
        while j < len(bs):
            new_list2.append(bs[j: j + n])
            j += n
        biases = [block_diag(*elems) for elems in new_list2]
        # Tiled copies, used only to get the shapes of the gradient accumulators.
        biases_dim_1 = [np.dot(np.ones((n*b.shape[0], b.shape[0])), b) for b in self.biases]
        biases_dim_2 = [np.dot(b, np.ones((b.shape[1], n*b.shape[1]))) for b in biases_dim_1]
        weights_dim_1 = [np.dot(np.ones((n*w.shape[0], w.shape[0])), w) for w in self.weights]
        weights_dim_2 = [np.dot(w, np.ones((w.shape[1], n*w.shape[1]))) for w in weights_dim_1]
        nabla_b = [np.zeros(b.shape) for b in biases_dim_2]
        nabla_w = [np.zeros(w.shape) for w in weights_dim_2]
        delta_b = [np.zeros(b.shape) for b in self.biases]
        delta_w = [np.zeros(w.shape) for w in self.weights]
        # Feed-forward pass over the whole (block-diagonal) mini-batch.
        zs = []
        activation = features
        activations = [features]
        for w, b in zip(weights, biases):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # Backward pass.
        delta = self.cost_derivative(activations[-1], responses) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        for l in xrange(2, self.layers):
            z = zs[-l]                           # the weighted input for that layer
            activation_prime = sigmoid_prime(z)  # the derivative of the activation for that layer
            delta = np.dot(weights[-l + 1].transpose(), delta) * activation_prime  # the adjustment term (delta) for that layer
            nabla_b[-l] = delta                  # bias adjustments - eq. BP3
            nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())  # weight adjustments - eq. BP4
        # Sum the per-sample blocks back into gradients with the original shapes.
        delta_b = [self.split_cases(b, n) for b in nabla_b]
        delta_w = [self.split_cases(w, n) for w in nabla_w]
        self.weights = [w - (eta / n) * nw for w, nw in zip(self.weights, delta_w)]
        self.biases = [b - (eta / n) * nb for b, nb in zip(self.biases, delta_b)]
    def split_cases(self, mat, mini_batch_size):
        """Sum the per-sample diagonal blocks of 'mat' into a single matrix
        with the original layer dimensions."""
        i = 0
        j = 0
        dim1 = mat.shape[0] / mini_batch_size
        dim2 = mat.shape[1] / mini_batch_size
        sum_samples = np.zeros((dim1, dim2))
        while i < len(mat):
            sum_samples = sum_samples + mat[i: i + dim1, j: j + dim2]
            i += dim1
            j += dim2
        return sum_samples
"""Stochastic Gradient Descent for training in epochs"""
def SGD(self, training_data, epochs, mini_batch_size, eta, test_data = None):
n = len(training_data)
if test_data:
n_test = len(test_data)
for j in xrange(epochs):
np.random.shuffle(training_data) # for each epochs the mini-batches are selected randomly
mini_batches = [training_data[k: k+mini_batch_size] for k in xrange(0, n, mini_batch_size)] # select equal sizes of mini-batches for the epochs (last mini_batch size might differ however)
c = 1
for mini_batch in mini_batches:
print "Updating mini-batch {0}".format(c)
self.update_mini_batch(mini_batch, eta)
c += 1
if test_data:
print "Epoch {0}: {1}/{2}".format(j, self.evaluate(test_data), n_test)
else:
print "Epoch {0} completed.".format(j)
    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data]
        return sum(int(x == y) for x, y in test_results)

    def export_results(self, test_data):
        results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data]
        k = pd.DataFrame(results)
        k.to_csv('net_results.csv')
# Global functions
## Activation function (sigmoid)
@np.vectorize
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

## Activation derivative (sigmoid_prime)
@np.vectorize
def sigmoid_prime(z):
    return sigmoid(z) * (1 - sigmoid(z))
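For reference, this is roughly how I drive the class. The data preparation follows Nielsen's book: training_data is a list of (x, y) tuples with x a (784, 1) column vector and y a (10, 1) one-hot column, while test_data uses integer labels. mnist_loader and its load_data_wrapper helper come from the book's code and are only an assumption here, not part of my listing above.

import mnist_loader  # helper module from Nielsen's book (assumed available)

training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
net = Network([784, 30, 10])
net.SGD(training_data, epochs=30, mini_batch_size=10, eta=3.0, test_data=test_data)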