Full-matrix approach to backpropagation in an artificial neural network

I have recently been learning about artificial neural networks (ANNs) and have working, runnable Python code for mini-batch training. I have been following Michael Nielsen's book Neural Networks and Deep Learning, which gives a step-by-step explanation of every algorithm for beginners. It also includes fully working code for handwritten digit recognition, which runs fine for me.

However, I am trying to modify the code slightly so that the whole mini-batch is passed through backpropagation in matrix form. I have written working code for this as well, but it runs very slowly. Is there a way to implement a full-matrix approach to mini-batch learning for the network based on the backpropagation algorithm?

import numpy as np 
import pandas as pd 

class Network: 

    def __init__(self, sizes): 
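     """sizes lists the number of neurons per layer; weights and biases
     are initialised from a standard normal distribution."""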
     self.layers = len(sizes) 
     self.sizes = sizes 

     self.biases = [np.random.randn(y, 1) for y in sizes[1:]] 
     self.weights = [np.random.randn(y, x) for y, x in zip(sizes[1:], sizes[:-1])] 

    def feed_forward(self, a): 
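     """Return the network's output activation for an input column vector a."""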
     for w, b in zip(self.weights, self.biases): 
      a = sigmoid(np.dot(w,a) + b) 
     return a 

    # Calculate the cost derivative (Gradient of C w.r.t. 'a' - Nabla C(a)) 
    def cost_derivative(self, output_activation, y): 
     return (output_activation - y) 


    def update_mini_batch(self, mini_batch, eta): 
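     """Full-matrix update: stack the whole mini-batch (inputs, weights and biases)
     into block-diagonal matrices, do one forward/backward pass on them, then sum
     the per-example gradient blocks and update the parameters."""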

     from scipy.linalg import block_diag 

     n = len(mini_batch) 

     xs = [x for x, y in mini_batch] 
     features = block_diag(*xs) 

     ys = [y for x, y in mini_batch] 
     responses = block_diag(*ys) 

     ws = [a for a in self.weights for i in xrange(n)] 

     new_list = [] 
     k = 0 
     while (k < len(ws)): 
      new_list.append(ws[k: k + n]) 
      k += n 

     weights = [block_diag(*elems) for elems in new_list] 

     bs = [b for b in self.biases for i in xrange(n)] 

     new_list2 = [] 
     j = 0 
     while (j < len(bs)): 
      new_list2.append(bs[j : j + n]) 
      j += n 

     biases = [block_diag(*elems) for elems in new_list2] 

     # Tiled copies of the parameters; only their shapes are used below, to size the nabla_b / nabla_w accumulators
     biases_dim_1 = [np.dot(np.ones((n*b.shape[0], b.shape[0])), b) for b in self.biases] 
     biases_dim_2 = [np.dot(b, np.ones((b.shape[1], n*b.shape[1]))) for b in biases_dim_1] 
     weights_dim_1 = [np.dot(np.ones((n*w.shape[0], w.shape[0])), w) for w in self.weights] 
     weights_dim_2 = [np.dot(w, np.ones((w.shape[1], n*w.shape[1]))) for w in weights_dim_1] 

     nabla_b = [np.zeros(b.shape) for b in biases_dim_2] 
     nabla_w = [np.zeros(w.shape) for w in weights_dim_2] 

     delta_b = [np.zeros(b.shape) for b in self.biases] 
     delta_w = [np.zeros(w.shape) for w in self.weights] 

     zs = [] 
     activation = features 
     activations = [features] 

     for w, b in zip(weights, biases): 

      z = np.dot(w, activation) + b 
      zs.append(z) 
      activation = sigmoid(z) 
      activations.append(activation) 

     delta = self.cost_derivative(activations[-1], responses) * sigmoid_prime(zs[-1]) 
     nabla_b[-1] = delta 
     nabla_w[-1] = np.dot(delta, activations[-2].transpose()) 

     for l in xrange(2, self.layers): 
      z = zs[-l]                  # the weighted input for that layer 
      activation_prime = sigmoid_prime(z)            # the derivative of activation for the layer 
      delta = np.dot(weights[-l + 1].transpose(), delta) * activation_prime   # calculate the adjustment term (delta) for that layer 
      nabla_b[-l] = delta                # calculate the bias adjustments - by means of using eq-BP3. 
      nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())     # calculate the weight adjustments - by means of using eq-BP4. 

     delta_b = [self.split_cases(b, n) for b in nabla_b] 
     delta_w = [self.split_cases(w, n) for w in nabla_w] 

     self.weights = [w - (eta/n) * nw for w, nw in zip(self.weights, delta_w)] 
     self.biases = [b - (eta/ n) * nb for b, nb in zip(self.biases, delta_b)] 



    def split_cases(self, mat, mini_batch_size): 
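     """Sum the per-example blocks along the diagonal of a block matrix,
     collapsing it back to the shape of a single weight/bias gradient."""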
     i = 0 
     j = 0 
     dim1 = mat.shape[0]/mini_batch_size 
     dim2 = mat.shape[1]/mini_batch_size 
     sum_samples = np.zeros((dim1, dim2)) 
     while i < len(mat): 

      sum_samples = sum_samples + mat[i: i + dim1, j : j + dim2] 
      i += dim1 
      j += dim2 

     return sum_samples 

    """Stochastic Gradient Descent for training in epochs""" 
    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data = None): 

     n = len(training_data) 

     if test_data: 
      n_test = len(test_data) 

     for j in xrange(epochs): 
      np.random.shuffle(training_data)                 # for each epochs the mini-batches are selected randomly 
      mini_batches = [training_data[k: k+mini_batch_size] for k in xrange(0, n, mini_batch_size)]  # select equal sizes of mini-batches for the epochs (last mini_batch size might differ however) 

      c = 1 

      for mini_batch in mini_batches: 
       print "Updating mini-batch {0}".format(c) 
       self.update_mini_batch(mini_batch, eta) 
       c += 1 
      if test_data: 
       print "Epoch {0}: {1}/{2}".format(j, self.evaluate(test_data), n_test) 

      else: 
       print "Epoch {0} completed.".format(j) 

    def evaluate(self, test_data): 
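     """Return the number of test inputs whose predicted digit (argmax of the
     network output) matches the label."""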
     test_results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data] 
     return (sum(int(x == y) for x, y in test_results)) 

    def export_results(self, test_data): 
     results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data] 
     k = pd.DataFrame(results) 
     k.to_csv('net_results.csv') 


# Global functions 

## Activation function (sigmoid) 
@np.vectorize 
def sigmoid(z): 
    return 1.0/(1.0 + np.exp(-z)) 

## Activation derivative (sigmoid_prime) 
@np.vectorize 
def sigmoid_prime(z): 
    return sigmoid(z)*(1 - sigmoid(z)) 
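
For reference, this is how the block-diagonal layout compares with a plain column-stacked layout of the same mini-batch (n and d below are just example values). The block-diagonal matrices are mostly zeros, and every weight matrix is replicated once per example, so each np.dot in the forward and backward pass spends most of its work multiplying zero blocks:

import numpy as np
from scipy.linalg import block_diag

n, d = 10, 784      # mini-batch size and input dimension (example values)
xs = [np.random.randn(d, 1) for _ in range(n)]

# Block-diagonal layout, as in update_mini_batch above:
# an (n*d, n) matrix whose off-diagonal blocks are all zeros.
features_block = block_diag(*xs)    # shape (7840, 10)

# Column-stacked layout: the same numbers as a dense (d, n) matrix,
# so one ordinary np.dot(w, features_cols) covers the whole mini-batch.
features_cols = np.hstack(xs)       # shape (784, 10)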

Answer

Here is my code. The time taken to run 30 epochs drops from 800+ seconds to 200+ seconds on my machine.

Since I am new to Python, I use what is readily available. This snippet only needs numpy to run.

Give it a try.

def feedforward2(self, a): 
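    """Forward pass for a whole mini-batch: a holds one column per example;
    returns the weighted inputs (zs) and activations of every layer."""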
    zs = [] 
    activations = [a] 

    activation = a 
    for b, w in zip(self.biases, self.weights): 
     z = np.dot(w, activation) + b 
     zs.append(z) 
     activation = sigmoid(z) 
     activations.append(activation) 

    return (zs, activations) 

def update_mini_batch2(self, mini_batch, eta): 
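    """Update weights and biases from one mini-batch using a single
    matrix-form backprop pass over all examples at once."""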
    batch_size = len(mini_batch) 

    # transform to (input x batch_size) matrix 
    x = np.asarray([_x.ravel() for _x, _y in mini_batch]).transpose() 
    # transform to (output x batch_size) matrix 
    y = np.asarray([_y.ravel() for _x, _y in mini_batch]).transpose() 

    nabla_b, nabla_w = self.backprop2(x, y) 
    self.weights = [w - (eta/batch_size) * nw for w, nw in zip(self.weights, nabla_w)] 
    self.biases = [b - (eta/batch_size) * nb for b, nb in zip(self.biases, nabla_b)] 

    return 

def backprop2(self, x, y): 
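    """Matrix-form backpropagation: x and y hold one column per training example,
    and the returned gradients are already summed over the mini-batch."""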

    nabla_b = [0 for i in self.biases] 
    nabla_w = [0 for i in self.weights] 

    # feedforward 
    zs, activations = self.feedforward2(x) 

    # backward pass 
    delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1]) 
    nabla_b[-1] = delta.sum(1).reshape([len(delta), 1]) # reshape to (n x 1) matrix 
    nabla_w[-1] = np.dot(delta, activations[-2].transpose()) 

    for l in xrange(2, self.layers):   # self.layers is the layer count stored by the Network class above
     z = zs[-l] 
     sp = sigmoid_prime(z) 
     delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp 
     nabla_b[-l] = delta.sum(1).reshape([len(delta), 1]) # reshape to (n x 1) matrix 
     nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose()) 

    return (nabla_b, nabla_w)
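
A minimal usage sketch, assuming the three methods above have been pasted into the Network class from the question and that SGD calls update_mini_batch2 in place of update_mini_batch; the dummy mini_batch below only mimics the (784, 1) inputs and (10, 1) one-hot labels that the book's mnist_loader produces:

import numpy as np

# Dummy mini-batch in the same shape as the book's mnist_loader output:
# each pair is a (784, 1) input column and a (10, 1) one-hot label.
mini_batch = [(np.random.randn(784, 1), np.eye(10)[:, [np.random.randint(10)]])
              for _ in range(10)]

net = Network([784, 30, 10])
net.update_mini_batch2(mini_batch, 3.0)    # one matrix-form gradient step

# For full training, the inner loop of SGD just calls
#     self.update_mini_batch2(mini_batch, eta)
# instead of the original update_mini_batch.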