ops.py

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import numpy as np

BaseOp represents an operation that performs computation on tensors. Every operation consists of the following:

  • A list of inputs, each converted to ensure they're all tensors.
  • An output tensor representing the result of the operation (which may be None).
  • A reference to the graph so that each operation can generate new operations when constructing gradients.
class BaseOp(object):

    def __init__(self, inputs, graph):
        self.inputs = [graph.convert(input_) for input_ in inputs]
        self.output = graph.tensor(op=self)
        self.graph = graph

The compute method receives the evaluated input tensors as arguments and returns the result of performing its operation on them.

    def compute(self, sess, *args):
        raise NotImplementedError()

The gradient method computes the partial derivative w.r.t. each input to the operation. (Most of the derivatives come from Wikipedia.)

    def gradient(self, grad):
        raise NotImplementedError()

AddOp adds a tensor to another tensor. Uses the sum rule to compute the partial derivatives.

class AddOp(BaseOp):

    def compute(self, sess, a, b):
        return a + b

    def gradient(self, grad):
        return [grad, grad]
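
As a quick standalone sanity check (plain Python, not part of the library), a central finite difference confirms that the partial derivative of a + b with respect to either input is 1, which is why the upstream gradient passes through unchanged:

    # Finite-difference check: d(a + b)/da = d(a + b)/db = 1.
    a, b, eps = 2.0, 7.0, 1e-6
    dfda = ((a + eps + b) - (a - eps + b)) / (2 * eps)
    dfdb = ((a + b + eps) - (a + b - eps)) / (2 * eps)
    assert abs(dfda - 1.0) < 1e-6 and abs(dfdb - 1.0) < 1e-6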

SubOp subtracts a tensor from another tensor. Also uses the sum rule to compute the partial derivatives.

class SubOp(BaseOp):

    def compute(self, sess, a, b):
        return a - b

    def gradient(self, grad):
        return [grad, -grad]

MulOp multiplies a tensor by another tensor. Uses the product rule to compute the partial derivatives.

class MulOp(BaseOp):

    def compute(self, sess, a, b):
        return a * b

    def gradient(self, grad):
        a, b = self.inputs
        return [grad * b, grad * a]
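
The product rule can be checked the same way with a standalone snippet (plain Python, not part of the library): for f(a, b) = a * b, the partial derivatives are b and a, matching the [grad * b, grad * a] pair returned above.

    # Finite-difference check of the product rule: df/da = b, df/db = a.
    a, b, eps = 3.0, 5.0, 1e-6
    dfda = ((a + eps) * b - (a - eps) * b) / (2 * eps)
    dfdb = (a * (b + eps) - a * (b - eps)) / (2 * eps)
    assert abs(dfda - b) < 1e-6 and abs(dfdb - a) < 1e-6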

DivOp divides a tensor by another tensor. Uses the quotient rule to compute the partial derivatives.

class DivOp(BaseOp):

    def compute(self, sess, a, b):
        return a / b

    def gradient(self, grad):
        a, b = self.inputs
        return [grad / b, grad * (-a / self.graph.square(b))]
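
Again as a standalone check (plain Python, not part of the library), the quotient rule gives df/da = 1 / b and df/db = -a / b**2 for f(a, b) = a / b, which is exactly what the gradient above returns:

    # Finite-difference check of the quotient rule:
    # d(a / b)/da = 1 / b and d(a / b)/db = -a / b**2.
    a, b, eps = 3.0, 5.0, 1e-6
    dfda = ((a + eps) / b - (a - eps) / b) / (2 * eps)
    dfdb = (a / (b + eps) - a / (b - eps)) / (2 * eps)
    assert abs(dfda - 1.0 / b) < 1e-6
    assert abs(dfdb - (-a / b ** 2)) < 1e-6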

NegOp negates a tensor.

class NegOp(BaseOp):

    def compute(self, sess, x):
        return -x

    def gradient(self, grad):
        return [-grad]

DotOp computes the dot product between two tensors. Uses the product rule to compute the partial derivatives. Note that here we need to transpose the terms and perform a dot product, assuming matrices rather than scalars.

class DotOp(BaseOp):

    def compute(self, sess, a, b):
        return np.dot(a, b)

    def gradient(self, grad):
        a, b = self.inputs
        return [
            self.graph.dot(grad, self.graph.transpose(b)),
            self.graph.dot(self.graph.transpose(a), grad),
        ]
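
To see why the transposes appear, here is a standalone sketch using only the NumPy import above (it does not touch the Graph machinery): for the scalar loss L = sum(np.dot(A, B)), the upstream gradient is a matrix of ones with the shape of np.dot(A, B), and a finite difference on one entry of A matches np.dot(grad, B.T):

    # Finite-difference check of the matrix product gradient on one entry of A.
    rng = np.random.RandomState(0)
    A, B, eps = rng.randn(2, 3), rng.randn(3, 4), 1e-6
    grad = np.ones((2, 4))  # gradient of sum() w.r.t. np.dot(A, B)
    A_plus, A_minus = A.copy(), A.copy()
    A_plus[0, 1] += eps
    A_minus[0, 1] -= eps
    numeric = (np.sum(np.dot(A_plus, B)) - np.sum(np.dot(A_minus, B))) / (2 * eps)
    assert abs(numeric - np.dot(grad, B.T)[0, 1]) < 1e-5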

SquareOp squares a tensor.

class SquareOp(BaseOp):

    def compute(self, sess, x):
        return np.square(x)

    def gradient(self, grad):
        x = self.inputs[0]
        return [grad * (2 * x)]

TransposeOp transposes a tensor.

class TransposeOp(BaseOp):

    def compute(self, sess, x):
        return np.transpose(x)

    def gradient(self, grad):
        return [self.graph.transpose(grad)]

SigmoidOp implements the sigmoid function and its derivative. Notice that the derivative uses the output of the operation which saves recomputation.

class SigmoidOp(BaseOp):

    def compute(self, sess, x):
        return 1 / (1 + np.exp(-x))

    def gradient(self, grad):
        y = self.output
        return [grad * (y * (1 - y))]
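
The identity used here, sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)), can be verified with a standalone snippet (using only the NumPy import above, not part of the library):

    # Compare the analytic sigmoid derivative against a central finite difference.
    def sigmoid(v):
        return 1 / (1 + np.exp(-v))
    x, eps = np.linspace(-3.0, 3.0, 7), 1e-6
    numeric = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
    analytic = sigmoid(x) * (1 - sigmoid(x))
    assert np.allclose(numeric, analytic, atol=1e-6)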

MeanOp computes the mean of a tensor. Note that the gradient here is intentionally incorrect: computing the exact gradient requires knowing the shapes of the input and output tensors. Fortunately, gradients are fairly malleable in optimization.

class MeanOp(BaseOp):

    def compute(self, sess, x):
        return np.mean(x)

    def gradient(self, grad):
        return [grad]
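
For reference, here is a standalone sketch of what the exact gradient would look like if the input's shape were known (using only the NumPy import above, not part of the library): each element contributes 1 / N to the mean, so the upstream gradient would be scaled by 1 / N and broadcast to the input's shape.

    # The exact mean gradient scales the upstream gradient by 1 / x.size and
    # broadcasts it to x's shape; a finite difference on one entry agrees.
    x, eps = np.arange(6.0).reshape(2, 3), 1e-6
    grad = 1.0  # upstream gradient of a scalar loss
    exact = np.full(x.shape, grad / x.size)
    x_plus, x_minus = x.copy(), x.copy()
    x_plus[0, 1] += eps
    x_minus[0, 1] -= eps
    numeric = (np.mean(x_plus) - np.mean(x_minus)) / (2 * eps)
    assert abs(numeric - exact[0, 1]) < 1e-6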

GroupOp exploits the fact that each input to the operation is automatically evaluated before computing the operation's output, allowing us to group together the evaluation of multiple operations. Its input gradients come from simply broadcasting the output gradient.

class GroupOp(BaseOp):

    def compute(self, sess, *args):
        return None

    def gradient(self, grad):
        return [grad] * len(self.inputs)

AssignOp updates the session's current state for a tensor. It is not differentiable in this implementation.

class AssignOp(BaseOp):

    def compute(self, sess, a, b):
        assert a.shape == b.shape, \
            'shapes must match to assign: {} != {}'.format(a.shape, b.shape)
        sess.state[self.inputs[0]] = b
        return b