
dropout, training

pull/203/head^2
George Hotz 2020-12-28 22:12:23 -05:00
parent 7b8fee038d
commit 51bf164b72
2 changed files with 10 additions and 5 deletions

File 1 of 2:

@@ -68,9 +68,9 @@ class TransformerBlock:
     weights = score.softmax()  # (bs, num_heads, T, T)
     attention = weights.dot(value).transpose(order=(0,2,1,3))  # (bs, T, num_heads, head_size)
-    x = inputs + attention.reshape(shape=(-1, embed_dim)).dot(self.final)
+    x = inputs + attention.reshape(shape=(-1, embed_dim)).dot(self.final).dropout(0.1)
     x = layernorm(x, embed_dim)
-    x = x + x.dot(self.ff1).relu().dot(self.ff2)
+    x = x + x.dot(self.ff1).relu().dot(self.ff2).dropout(0.1)
     x = layernorm(x, embed_dim)
     return x.reshape(shape=(bs, -1, embed_dim))
@@ -107,6 +107,7 @@ if __name__ == "__main__":
   optim = Adam(get_parameters(model), lr=0.001)
   train(model, X_train, Y_train, optim, 500, BS=16)
+  Tensor.training = False
   evaluate(model, X_test, Y_test, num_classes=10)
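Note on the training-script change: dropout reads a module-level flag (Tensor.training, added in the tensor hunk below), so it stays active during train() and must be switched off before evaluate(). A minimal standalone sketch of that flag pattern, using plain NumPy and hypothetical names (Flag, maybe_dropout) rather than tinygrad's actual classes:

import numpy as np

class Flag:
  # stand-in for the Tensor.training switch added in this commit
  training = True

def maybe_dropout(x, p=0.1):
  # random masking plus rescale while training, identity at evaluation time
  if not Flag.training:
    return x
  mask = np.random.binomial(1, 1.0 - p, size=x.shape).astype(x.dtype)
  return x * mask / (1.0 - p)

x = np.ones((2, 3), dtype=np.float32)
print(maybe_dropout(x))  # some entries zeroed, survivors scaled by 1/(1-p)
Flag.training = False
print(maybe_dropout(x))  # returned unchanged, mirroring the evaluate() call above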

File 2 of 2:

@@ -69,6 +69,7 @@ class Device: CPU, GPU, ANE = 0, 1, 2
 class Tensor:
   did_float_warning = False
+  training = True
   ops = defaultdict(dict)
   def __init__(self, data, device=Device.CPU, requires_grad=True):
@@ -234,9 +235,12 @@ class Tensor:
     return self.softmax().log()
   def dropout(self, p=0.5):
-    _mask = np.asarray(np.random.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype)
-    ret = self * Tensor(_mask, requires_grad=False, device=self.device)
-    return ret.div(1.0 - p)
+    if Tensor.training:
+      _mask = np.asarray(np.random.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype)
+      ret = self * Tensor(_mask, requires_grad=False, device=self.device)
+      return ret.div(1.0 - p)
+    else:
+      return self
   def abs(self):
     return self.relu() + (-1.0*self).relu()
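The rewritten dropout is the inverted variant: each element survives with probability 1 - p and the survivors are divided by 1 - p, so the expected activation is unchanged and no extra rescaling is needed at inference. A quick NumPy check of that property (plain NumPy, independent of tinygrad's Tensor):

import numpy as np

p = 0.1
x = np.ones(1_000_000, dtype=np.float32)

# Bernoulli mask: keep each element with probability 1 - p
mask = np.random.binomial(1, 1.0 - p, size=x.shape).astype(x.dtype)
dropped = x * mask / (1.0 - p)  # inverted dropout: rescale the survivors

print(x.mean(), dropped.mean())  # both ~1.0: the expectation is preserved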