Source code for epynn.lstm.backward

# EpyNN/epynn/lstm/backward.py
# Related third party imports
import numpy as np


def initialize_backward(layer, dX):
    """Backward cache initialization.

    :param layer: An instance of LSTM layer.
    :type layer: :class:`epynn.lstm.models.LSTM`

    :param dX: Output of backward propagation from next layer.
    :type dX: :class:`numpy.ndarray`

    :return: Input of backward propagation for current layer.
    :rtype: :class:`numpy.ndarray`

    :return: Next hidden state initialized with zeros.
    :rtype: :class:`numpy.ndarray`

    :return: Next memory state initialized with zeros.
    :rtype: :class:`numpy.ndarray`
    """
    if layer.sequences:
        dA = dX                          # Full length sequence
    elif not layer.sequences:
        dA = np.zeros(layer.fs['h'])     # Empty full length sequence
        dA[:, -1] = dX                   # Assign to last index

    cache_keys = [
        'dh_', 'dh', 'dhn',
        'dC_', 'dC', 'dCn',
        'do_', 'dg_', 'di_', 'df_', 'dz_',
    ]

    layer.bc.update({k: np.zeros(layer.fs['h']) for k in cache_keys})

    layer.bc['dA'] = dA
    layer.bc['dX'] = np.zeros(layer.fs['X'])    # To previous layer

    dh = layer.bc['dh'][:, 0]                   # To previous step
    dC = layer.bc['dC'][:, 0]                   # To previous step

    return dA, dh, dC
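
# --- Illustrative sketch, not part of the EpyNN source -----------------------
# How the dA seed built by initialize_backward() differs with layer.sequences,
# assuming the (m, s, u) cache shape noted in lstm_backward() below. The helper
# name and arguments are hypothetical.
def _example_seed_dA(dX, sequences, shape_h):
    """Toy restatement of the branching above; shape_h stands in for layer.fs['h']."""
    if sequences:
        dA = dX                        # one gradient slice per step was received
    else:
        dA = np.zeros(shape_h)         # empty full length sequence
        dA[:, -1] = dX                 # gradient assigned to the last step only
    return dA
# -----------------------------------------------------------------------------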
def lstm_backward(layer, dX):
    """Backward propagate error gradients to previous layer.
    """
    # (1) Initialize cache, hidden and memory state gradients
    dA, dh, dC = initialize_backward(layer, dX)

    # Reverse iteration over sequence steps
    for s in reversed(range(layer.d['s'])):

        # (2s) Slice sequence (m, s, u) w.r.t step
        dA = layer.bc['dA'][:, s]            # dL/dA

        # (3s) Gradient of the loss w.r.t next states
        dhn = layer.bc['dhn'][:, s] = dh     # (3.1) dL/dhn
        dCn = layer.bc['dCn'][:, s] = dC     # (3.2) dL/dCn

        # (4s) Gradient of the loss w.r.t hidden state h_
        dh_ = layer.bc['dh_'][:, s] = (
            (dA + dhn)
        )   # dL/dh_

        # (5s) Gradient of the loss w.r.t memory state C_
        dC_ = layer.bc['dC_'][:, s] = (
            dh_
            * layer.fc['o'][:, s]
            * layer.activate(layer.fc['C_'][:, s], deriv=True)
            + dCn
        )   # dL/dC_

        # (6s) Gradient of the loss w.r.t output gate o_
        do_ = layer.bc['do_'][:, s] = (
            dh_
            * layer.fc['C'][:, s]
            * layer.activate_output(layer.fc['o_'][:, s], deriv=True)
        )   # dL/do_

        # (7s) Gradient of the loss w.r.t candidate g_
        dg_ = layer.bc['dg_'][:, s] = (
            dC_
            * layer.fc['i'][:, s]
            * layer.activate_candidate(layer.fc['g_'][:, s], deriv=True)
        )   # dL/dg_

        # (8s) Gradient of the loss w.r.t input gate i_
        di_ = layer.bc['di_'][:, s] = (
            dC_
            * layer.fc['g'][:, s]
            * layer.activate_input(layer.fc['i_'][:, s], deriv=True)
        )   # dL/di_

        # (9s) Gradient of the loss w.r.t forget gate f_
        df_ = layer.bc['df_'][:, s] = (
            dC_
            * layer.fc['Cp_'][:, s]
            * layer.activate_forget(layer.fc['f_'][:, s], deriv=True)
        )   # dL/df_

        # (10s) Gradient of the loss w.r.t memory state C
        dC = layer.bc['dC'][:, s] = (
            dC_
            * layer.fc['f'][:, s]
        )   # dL/dC

        # (11s) Gradient of the loss w.r.t hidden state h
        dh = layer.bc['dh'][:, s] = (
            np.dot(do_, layer.p['Vo'].T)
            + np.dot(dg_, layer.p['Vg'].T)
            + np.dot(di_, layer.p['Vi'].T)
            + np.dot(df_, layer.p['Vf'].T)
        )   # dL/dh

        # (12s) Gradient of the loss w.r.t input X
        dX = layer.bc['dX'][:, s] = (
            np.dot(dg_, layer.p['Ug'].T)
            + np.dot(do_, layer.p['Uo'].T)
            + np.dot(di_, layer.p['Ui'].T)
            + np.dot(df_, layer.p['Uf'].T)
        )   # dL/dX

    dX = layer.bc['dX']

    return dX    # To previous layer
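
The per-step arithmetic above can also be read without the layer, cache and parameter plumbing. The sketch below is a standalone restatement (not part of the EpyNN source) of equations (4s) to (12s) for a single step, assuming the default tanh activation for the memory state and candidate and sigmoid for the three gates; the function and argument names are hypothetical.

import numpy as np

def dsigmoid(x):
    s = 1.0 / (1.0 + np.exp(-x))
    return s * (1.0 - s)

def dtanh(x):
    return 1.0 - np.tanh(x) ** 2

def lstm_step_backward(dA, dhn, dCn, fc_s, p):
    """One backward step; fc_s is the forward cache sliced at step s, p the parameters."""
    dh_ = dA + dhn                                        # (4s)  dL/dh_
    dC_ = dh_ * fc_s['o'] * dtanh(fc_s['C_']) + dCn       # (5s)  dL/dC_
    do_ = dh_ * fc_s['C'] * dsigmoid(fc_s['o_'])          # (6s)  dL/do_
    dg_ = dC_ * fc_s['i'] * dtanh(fc_s['g_'])             # (7s)  dL/dg_
    di_ = dC_ * fc_s['g'] * dsigmoid(fc_s['i_'])          # (8s)  dL/di_
    df_ = dC_ * fc_s['Cp_'] * dsigmoid(fc_s['f_'])        # (9s)  dL/df_
    dC = dC_ * fc_s['f']                                  # (10s) dL/dC, to step s-1
    dh = (np.dot(do_, p['Vo'].T) + np.dot(dg_, p['Vg'].T)
          + np.dot(di_, p['Vi'].T) + np.dot(df_, p['Vf'].T))    # (11s) dL/dh, to step s-1
    dX = (np.dot(dg_, p['Ug'].T) + np.dot(do_, p['Uo'].T)
          + np.dot(di_, p['Ui'].T) + np.dot(df_, p['Uf'].T))    # (12s) dL/dX, to previous layer
    return dX, dh, dC

Calling this from the last step down to the first, feeding the returned dh and dC back in as dhn and dCn, reproduces the loop carried out by lstm_backward.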