Source code for epynn.gru.backward

# EpyNN/epynn/gru/backward.py
# Related third party imports
import numpy as np


def initialize_backward(layer, dX):
    """Backward cache initialization.

    :param layer: An instance of GRU layer.
    :type layer: :class:`epynn.gru.models.GRU`

    :param dX: Output of backward propagation from next layer.
    :type dX: :class:`numpy.ndarray`

    :return: Input of backward propagation for current layer.
    :rtype: :class:`numpy.ndarray`

    :return: Next hidden state initialized with zeros.
    :rtype: :class:`numpy.ndarray`
    """
    if layer.sequences:
        dA = dX                          # Full length sequence
    elif not layer.sequences:
        dA = np.zeros(layer.fs['h'])     # Empty full length sequence
        dA[:, -1] = dX                   # Assign to last index

    cache_keys = ['dh_', 'dh', 'dhn', 'dhh_', 'dz_', 'dr_']
    layer.bc.update({k: np.zeros(layer.fs['h']) for k in cache_keys})

    layer.bc['dA'] = dA
    layer.bc['dX'] = np.zeros(layer.fs['X'])    # To previous layer

    dh = layer.bc['dh'][:, 0]                   # To previous step

    return dA, dh
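
A small, standalone sketch (not part of EpyNN) of the shape convention assumed by initialize_backward: with sequences=True the incoming dX already spans the full sequence and is used as dA directly, while with sequences=False it covers only the last step and is placed at the last index of an otherwise zero gradient sequence. The dimensions m (samples), s (steps) and u (units) below are illustrative only.

    import numpy as np

    m, s, u = 4, 5, 3
    dX_last = np.random.randn(m, u)     # gradient for the last step only

    dA = np.zeros((m, s, u))            # plays the role of layer.fs['h']
    dA[:, -1] = dX_last                 # assign to last index

    print(dA.shape)                     # (4, 5, 3)
    print(np.abs(dA[:, :-1]).sum())     # 0.0 -- earlier steps start at zero
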
def gru_backward(layer, dX):
    """Backward propagate error gradients to previous layer.

    :param layer: An instance of GRU layer.
    :type layer: :class:`epynn.gru.models.GRU`

    :param dX: Output of backward propagation from next layer.
    :type dX: :class:`numpy.ndarray`

    :return: Output of backward propagation for current layer.
    :rtype: :class:`numpy.ndarray`
    """
    # (1) Initialize cache and hidden state gradients
    dA, dh = initialize_backward(layer, dX)

    # Reverse iteration over sequence steps
    for s in reversed(range(layer.d['s'])):

        # (2s) Slice sequence (m, s, u) w.r.t step
        dA = layer.bc['dA'][:, s]           # dL/dA

        # (3s) Gradient of the loss w.r.t next hidden state
        dhn = layer.bc['dhn'][:, s] = dh    # dL/dhn

        # (4s) Gradient of the loss w.r.t hidden state h_
        dh_ = layer.bc['dh_'][:, s] = (
            (dA + dhn)
        )   # dL/dh_

        # (5s) Gradient of the loss w.r.t hidden hat hh_
        dhh_ = layer.bc['dhh_'][:, s] = (
            dh_
            * (1-layer.fc['z'][:, s])
            * layer.activate(layer.fc['hh_'][:, s], deriv=True)
        )   # dL/dhh_

        # (6s) Gradient of the loss w.r.t update gate z_
        dz_ = layer.bc['dz_'][:, s] = (
            dh_
            * (layer.fc['hp'][:, s]-layer.fc['hh'][:, s])
            * layer.activate_update(layer.fc['z_'][:, s], deriv=True)
        )   # dL/dz_

        # (7s) Gradient of the loss w.r.t reset gate r_
        dr_ = layer.bc['dr_'][:, s] = (
            np.dot(dhh_, layer.p['Vhh'].T)
            * layer.fc['hp'][:, s]
            * layer.activate_reset(layer.fc['r_'][:, s], deriv=True)
        )   # dL/dr_

        # (8s) Gradient of the loss w.r.t previous hidden state
        dh = layer.bc['dh'][:, s] = (
            np.dot(dhh_, layer.p['Vhh'].T)
            * layer.fc['r'][:, s]
            + np.dot(dz_, layer.p['Vz'].T)
            + dh_ * layer.fc['z'][:, s]
            + np.dot(dr_, layer.p['Vr'].T)
        )   # dL/dh

        # (9s) Gradient of the loss w.r.t X
        dX = layer.bc['dX'][:, s] = (
            np.dot(dhh_, layer.p['Uhh'].T)
            + np.dot(dz_, layer.p['Uz'].T)
            + np.dot(dr_, layer.p['Ur'].T)
        )   # dL/dX

    dX = layer.bc['dX']

    return dX    # To previous layer
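
Below is a minimal, standalone sketch (not part of EpyNN) of one reverse step of the loop above, with toy NumPy arrays standing in for the layer caches (layer.fc) and recurrent weights (layer.p), and assuming the default tanh activation for the candidate state and sigmoid activations for both gates. All names (m, u, Vz, Vr, Vhh, ...) are illustrative only; it mirrors steps (4s) to (8s) so the gradient equations can be read without the layer object.

    import numpy as np

    np.random.seed(1)
    m, u = 4, 3                              # batch size, hidden units

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Toy forward caches for a single step s (stand-ins for layer.fc[...])
    z_, r_, hh_ = np.random.randn(3, m, u)   # gate and candidate pre-activations
    z, r = sigmoid(z_), sigmoid(r_)          # update and reset gates
    hh = np.tanh(hh_)                        # candidate hidden state
    hp = np.random.randn(m, u)               # previous hidden state h(s-1)

    # Toy hidden-to-hidden weights (stand-ins for layer.p[...])
    Vz, Vr, Vhh = np.random.randn(3, u, u)

    # Incoming gradients: from the output at step s (dA) and from step s+1 (dhn)
    dA, dhn = np.random.randn(2, m, u)

    dh_ = dA + dhn                                                    # dL/dh_
    dhh_ = dh_ * (1 - z) * (1 - np.tanh(hh_) ** 2)                    # dL/dhh_
    dz_ = dh_ * (hp - hh) * sigmoid(z_) * (1 - sigmoid(z_))           # dL/dz_
    dr_ = np.dot(dhh_, Vhh.T) * hp * sigmoid(r_) * (1 - sigmoid(r_))  # dL/dr_

    dh = (np.dot(dhh_, Vhh.T) * r      # candidate path, gated by reset
          + np.dot(dz_, Vz.T)          # update gate path
          + dh_ * z                    # direct path of h = z*hp + (1-z)*hh
          + np.dot(dr_, Vr.T))         # reset gate path

    print(dh.shape)                    # (4, 3) -- becomes dhn for step s-1
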