from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import numpy as np
BaseOp represents an operation that performs computation on tensors.
Every operation consists of the following:

- A list of inputs, each converted to ensure they're all tensors.
- An output tensor to hold the result of the operation. (Its value may be
  None until the graph is evaluated.)
- A reference to the graph, so the operation can construct new operations
  when building gradients.

class BaseOp(object):
    def __init__(self, inputs, graph):
        self.inputs = [graph.convert(input_) for input_ in inputs]
        self.output = graph.tensor(op=self)
        self.graph = graph
The compute method receives the evaluated input tensors as arguments and
returns the result of performing its operation on them.
    def compute(self, sess, *args):
        raise NotImplementedError()
The gradient method computes the partial derivative w.r.t. each input to
the operation. (Most of the derivatives come from Wikipedia.)
    def gradient(self, grad):
        raise NotImplementedError()
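Before the concrete ops, it helps to see the interface they assume from the
rest of the library: graph.convert and graph.tensor in the constructor,
graph.square, graph.dot and graph.transpose in the gradient methods, and the
session's state dict in AssignOp at the end. The stub below is a minimal
sketch of those pieces for context only; the real Graph, Tensor and Session
classes live elsewhere, so treat the details here as assumptions rather than
the actual implementation.

# Assumed, minimal stubs of the pieces the ops below rely on. Only the
# calls used in this section are sketched; the real classes are richer.
class Tensor(object):
    def __init__(self, op=None, value=None):
        self.op = op        # the operation producing this tensor, if any
        self.value = value  # a constant value, if this tensor wraps one

class Graph(object):
    def tensor(self, op=None):
        return Tensor(op=op)

    def convert(self, value):
        # Pass tensors through unchanged; wrap plain values (numpy arrays,
        # Python numbers) so every op input is a tensor.
        if isinstance(value, Tensor):
            return value
        return Tensor(value=value)

    # Convenience constructors used by the gradient methods below; each one
    # builds the corresponding op (defined later in this section) and
    # returns its output tensor.
    def square(self, x):
        return SquareOp([x], graph=self).output

    def dot(self, a, b):
        return DotOp([a, b], graph=self).output

    def transpose(self, x):
        return TransposeOp([x], graph=self).output

class Session(object):
    def __init__(self, graph):
        self.graph = graph
        self.state = {}  # current values for tensors updated via AssignOp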
AddOp adds one tensor to another. Uses the sum rule to compute the partial
derivatives, so the incoming gradient passes through to both inputs
unchanged.

class AddOp(BaseOp):
    def compute(self, sess, a, b):
        return a + b

    def gradient(self, grad):
        return [grad, grad]
SubOp subtracts a tensor from another tensor. Also uses the sum rule to
compute the partial derivatives.

class SubOp(BaseOp):
    def compute(self, sess, a, b):
        return a - b

    def gradient(self, grad):
        return [grad, -grad]
MulOp multiplies a tensor by another tensor. Uses the product rule to
compute the partial derivatives.

class MulOp(BaseOp):
    def compute(self, sess, a, b):
        return a * b

    def gradient(self, grad):
        a, b = self.inputs
        return [grad * b, grad * a]
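As a quick sanity check of the product rule: for f = a * b the partials are
df/da = b and df/db = a, so the incoming gradient is scaled by the other
input, exactly what MulOp.gradient returns. A standalone numpy check
(independent of the graph machinery in this section) against central
differences:

import numpy as np

a, b = 3.0, -2.0
grad = 1.0  # incoming gradient d(loss)/df for f = a * b
grad_a, grad_b = grad * b, grad * a  # what MulOp.gradient returns

eps = 1e-6
fd_a = ((a + eps) * b - (a - eps) * b) / (2 * eps)  # numeric df/da
fd_b = (a * (b + eps) - a * (b - eps)) / (2 * eps)  # numeric df/db
print(np.isclose(grad_a, fd_a), np.isclose(grad_b, fd_b))  # True True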
DivOp divides a tensor by another tensor. Uses the quotient rule to compute
the partial derivatives.

class DivOp(BaseOp):
    def compute(self, sess, a, b):
        return a / b

    def gradient(self, grad):
        a, b = self.inputs
        return [grad / b, grad * (-a / self.graph.square(b))]
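The same kind of standalone check works for the quotient rule: for f = a / b
the partials are df/da = 1/b and df/db = -a/b**2, matching grad / b and
grad * (-a / square(b)) above.

import numpy as np

a, b, grad = 5.0, 4.0, 1.0
grad_a = grad / b              # d(a/b)/da = 1/b
grad_b = grad * (-a / b ** 2)  # d(a/b)/db = -a/b^2

eps = 1e-6
fd_a = ((a + eps) / b - (a - eps) / b) / (2 * eps)
fd_b = (a / (b + eps) - a / (b - eps)) / (2 * eps)
print(np.isclose(grad_a, fd_a), np.isclose(grad_b, fd_b))  # True True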
NegOp negates a tensor.

class NegOp(BaseOp):
    def compute(self, sess, x):
        return -x

    def gradient(self, grad):
        return [-grad]
DotOp computes the dot product between two tensors. Uses the product rule
to compute the partial derivatives. Note that here we need to transpose the
terms and perform a dot product, assuming matrices rather than scalars.

class DotOp(BaseOp):
    def compute(self, sess, a, b):
        return np.dot(a, b)

    def gradient(self, grad):
        a, b = self.inputs
        return [
            self.graph.dot(grad, self.graph.transpose(b)),
            self.graph.dot(self.graph.transpose(a), grad),
        ]
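The transposes are what make the shapes work out: for C = dot(A, B) with A
of shape (m, k) and B of shape (k, n), the upstream gradient has C's shape
(m, n), so dot(grad, B.T) has A's shape and dot(A.T, grad) has B's shape. A
standalone numpy check of the shapes, plus a finite-difference check of one
entry (using loss = C.sum(), so the upstream gradient is all ones):

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(3, 4)
B = rng.randn(4, 2)
grad = np.ones((3, 2))  # d(C.sum())/dC

grad_A = grad.dot(B.T)  # shape (3, 4), matches A
grad_B = A.T.dot(grad)  # shape (4, 2), matches B

eps = 1e-6
A_pert = A.copy()
A_pert[1, 2] += eps
fd = (A_pert.dot(B).sum() - A.dot(B).sum()) / eps  # numeric dL/dA[1, 2]
print(grad_A.shape, grad_B.shape)               # (3, 4) (4, 2)
print(np.isclose(grad_A[1, 2], fd, atol=1e-4))  # True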
SquareOp squares a tensor.

class SquareOp(BaseOp):
    def compute(self, sess, x):
        return np.square(x)

    def gradient(self, grad):
        x = self.inputs[0]
        return [grad * (2 * x)]
TransposeOp transposes a tensor.

class TransposeOp(BaseOp):
    def compute(self, sess, x):
        return np.transpose(x)

    def gradient(self, grad):
        return [self.graph.transpose(grad)]
SigmoidOp implements the sigmoid function and its derivative. Notice that
the derivative reuses the output of the operation, which saves
recomputation.

class SigmoidOp(BaseOp):
    def compute(self, sess, x):
        return 1 / (1 + np.exp(-x))

    def gradient(self, grad):
        y = self.output
        return [grad * (y * (1 - y))]
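The identity being used is sigma'(x) = sigma(x) * (1 - sigma(x)), which is
why the saved output y is all the gradient needs. A standalone numpy check
against central differences:

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

x = np.linspace(-4, 4, 9)
y = sigmoid(x)
analytic = y * (1 - y)  # what SigmoidOp.gradient uses (with grad = 1)

eps = 1e-6
numeric = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
print(np.allclose(analytic, numeric))  # True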
MeanOp computes the mean of a tensor. Note the gradient here is
intentionally incorrect because computing it requires knowing the shape of
the input and output tensors. Fortunately, gradients are fairly malleable
in optimization.

class MeanOp(BaseOp):
    def compute(self, sess, x):
        return np.mean(x)

    def gradient(self, grad):
        return [grad]
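For reference, the exact gradient of a full mean reduction spreads the
upstream gradient evenly over the input: each element receives grad / N,
where N is the number of input elements. A standalone numpy sketch of what
a shape-aware version would return (assuming the input's shape were
available at gradient time, which the implementation above avoids):

import numpy as np

x = np.arange(12.0).reshape(3, 4)
grad = 1.0  # upstream gradient w.r.t. the scalar mean

exact = np.full_like(x, grad / x.size)  # each element gets a 1/N share

eps = 1e-6
x_pert = x.copy()
x_pert[2, 1] += eps
fd = (x_pert.mean() - x.mean()) / eps  # numeric d(mean)/dx[2, 1]
print(np.isclose(exact[2, 1], fd))  # True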
GroupOp exploits the fact that each input to the operation is automatically
evaluated before computing the operation's output, allowing us to group
together the evaluation of multiple operations. Its input gradients come
from simply broadcasting the output gradient.

class GroupOp(BaseOp):
    def compute(self, sess, *args):
        return None

    def gradient(self, grad):
        return [grad] * len(self.inputs)
AssignOp updates the session's current state for a tensor. It is not
differentiable in this implementation.

class AssignOp(BaseOp):
    def compute(self, sess, a, b):
        assert a.shape == b.shape, \
            'shapes must match to assign: {} != {}'.format(a.shape, b.shape)
        sess.state[self.inputs[0]] = b
        return b
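The typical use of an assign op is the parameter update in gradient
descent: the new weight value overwrites whatever the session currently
stores for that tensor. Below is a standalone numpy sketch of that state
update, with a plain dict standing in for sess.state and a string key
standing in for the parameter tensor; it mirrors what AssignOp.compute does
rather than using the graph machinery above.

import numpy as np

state = {'w': np.zeros((2, 3))}  # stands in for sess.state
grad_w = np.ones((2, 3))         # pretend gradient for the parameter
learning_rate = 0.1

new_w = state['w'] - learning_rate * grad_w
assert state['w'].shape == new_w.shape, 'shapes must match to assign'
state['w'] = new_w  # the overwrite AssignOp.compute performs
print(state['w'])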