# Softplus derivative

Looking at the PyTorch sources, I see two implementations:

Caffe2 (no longer used?):

``````
const float nexpY = exp(-Y[i]);
dX[i] = dY[i] * (1 - nexpY);
``````

ATen (a=X, b=Y):

``````
scalar_t z = std::exp(b * beta);
return (b * beta) > threshold ? a : a * (z - scalar_t(1.)) / z;
``````
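As a side note on the threshold branch: with the defaults (beta=1, threshold=20), the backward simply passes grad_output through once b * beta exceeds the threshold. A quick check of that behavior (a sketch, assuming the default parameters):

```python
import torch
import torch.nn.functional as F

# pick an input well into the saturated region (x * beta > threshold=20)
x = torch.tensor([30.0], requires_grad=True)
y = F.softplus(x)  # forward also short-circuits here, returning x itself
g, = torch.autograd.grad(y.sum(), x)
print(g.item())  # 1.0 exactly, per the threshold branch quoted above
```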

The below-threshold case is equivalent to the Caffe2 one:

``````
dX = dY * (exp(Y) - 1) / exp(Y) = dY * (1 - exp(-Y))
``````

Notice that the Caffe2 version doesn’t need the X tensor. So, are there any problems with using that (in the most common case, with beta=1 and the Hessian not needed)? Is the Caffe2 implementation built-in/exported to Python?
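For what it’s worth, the identity behind this — softplus'(x) = sigmoid(x) = 1 - exp(-softplus(x)) — is easy to verify numerically (a quick sketch, assuming beta=1):

```python
import torch
import torch.nn.functional as F

x = torch.randn(1000, dtype=torch.double, requires_grad=True)
y = F.softplus(x)

# gradient of sum(y) w.r.t. x, as computed by autograd
g, = torch.autograd.grad(y.sum(), x)

# Caffe2-style formula: only the output Y is needed
g_from_y = 1 - torch.exp(-y.detach())

print(torch.allclose(g, g_from_y))                   # True
print(torch.allclose(g, torch.sigmoid(x.detach()))) # True
```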

To be honest, I don’t think `self` (the input to the forward) is actually used:

My bad, “a” above binds to grad_output. So I presume the capturing of “self” qualifies as a bug then.

On second look, it seems it is done for softplus_double_backward. Interesting tradeoff: a simpler (I assume) formula there vs. extra memory in the regular case…
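Incidentally, for beta=1 even the second derivative is expressible from the output alone: softplus''(x) = s * (1 - s) with s = 1 - exp(-Y), so saving X doesn’t look strictly necessary for the double backward either. A numeric sketch:

```python
import torch
import torch.nn.functional as F

x = torch.randn(100, dtype=torch.double, requires_grad=True)
y = F.softplus(x)

g, = torch.autograd.grad(y.sum(), x, create_graph=True)
h, = torch.autograd.grad(g.sum(), x)   # diagonal of the Hessian

s = 1 - torch.exp(-y.detach())          # sigmoid(x), recovered from Y only
print(torch.allclose(h, s * (1 - s)))   # True
```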

One thing to check is whether the JIT decomposes this more nicely (I don’t know, but it could, as it has its own differentiation scheme and eliminates unneeded variables).
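One way to peek at what the scripted version records is to print its graph (this only shows the forward IR; whether the autodiff graph drops the input is a separate question):

```python
import torch
from torch import jit
from torch.nn import functional as F

@jit.script
def jit_softplus(x):
    return F.softplus(x)

# the scripted function exposes its IR; the aten::softplus call
# and its inputs (x, beta, threshold) show up here
print(jit_softplus.graph)
```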

Nope. Here is my test script and alternative version.

``````
import torch
from torch import *
from torch import jit
from torch.nn import functional as F
import gc

# no effect on results
# torch._C._jit_set_profiling_mode(False)
# torch._C._jit_set_profiling_executor(False)

@jit.script
def jit_softplus(x):
    return F.softplus(x)

@jit.script
def fused_softplus(x):
    # explicit formula, so the fuser can differentiate it itself
    return torch.log1p(torch.exp(x))

class LowMemSoftplus(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        y = F.softplus(x)
        ctx.save_for_backward(y)  # keep the output, not the input
        return y

    @staticmethod
    def backward(ctx, dLoss_dOutput):
        softplus_of_x, = ctx.saved_tensors
        dOutput_dInput = softplus_of_x.neg().exp_()
        torch.sub(1.0, dOutput_dInput, out=dOutput_dInput)
        return dLoss_dOutput * dOutput_dInput

p = torch.randn(5000, device="cuda", requires_grad=True)

for title, f in {
    "nograd": lambda x: F.softplus(x.detach()),
    "eager": F.softplus,
    "jit": jit_softplus,
    "fused": fused_softplus,
    "custom": LowMemSoftplus.apply,
}.items():
    mem0 = torch.cuda.memory_allocated()

    x = p[:, None] @ p[None, :]  # non-leaf tensor with 25M elements

    y = f(x)
    torch.cuda.synchronize()

    del x, f
    gc.collect()
    print(title, ":", torch.cuda.memory_allocated() - mem0)

    del y
    gc.collect()
    assert torch.cuda.memory_allocated() == mem0
``````

output

``````
nograd : 100683776