From 35b37b1a5a64931bdf5040d98e9795bdf7f6a712 Mon Sep 17 00:00:00 2001
From: Jeff Moe
Date: Wed, 6 Dec 2023 14:45:18 -0700
Subject: [PATCH] nn init docstrings

---
 tinygrad/nn/__init__.py | 301 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 301 insertions(+)

diff --git a/tinygrad/nn/__init__.py b/tinygrad/nn/__init__.py
index b566434ad..96c08ba7d 100644
--- a/tinygrad/nn/__init__.py
+++ b/tinygrad/nn/__init__.py
@@ -6,6 +6,17 @@ from tinygrad.nn import optim, state  # noqa: F401


 class BatchNorm2d:
+    """
+    Applies 2D batch normalization: scales and shifts the input tensor x using the mean and variance computed over the batch (or the running statistics at inference time).
+
+    Attributes:
+        sz (int): The number of channels in the input tensor.
+        eps (float): A small constant added to the variance to prevent division by zero. Default is 1e-5.
+        affine (bool): If True, the layer learns and applies a per-channel scale and bias. Default is True.
+        track_running_stats (bool): If True, the layer tracks the running mean and variance. Default is True.
+        momentum (float): The value used for the moving average of the mean and variance. Default is 0.1.
+    """
+
     def __init__(
         self, sz: int, eps=1e-5, affine=True, track_running_stats=True, momentum=0.1
     ):
@@ -26,6 +37,15 @@ class BatchNorm2d:
         self.num_batches_tracked = Tensor.zeros(1, requires_grad=False)

     def __call__(self, x: Tensor):
+        """
+        Performs the forward pass of the batch normalization layer.
+
+        Args:
+            x (Tensor): The input tensor to be normalized.
+
+        Returns:
+            Tensor: The normalized output tensor.
+        """
         if Tensor.training:
             # This requires two full memory accesses to x
             # https://github.com/pytorch/pytorch/blob/c618dc13d2aa23625cb0d7ada694137532a4fa33/aten/src/ATen/native/cuda/Normalization.cuh
@@ -73,6 +93,34 @@ def Conv1d(
     groups=1,
     bias=True,
 ):
+    """
+    Creates a 1D convolution layer by wrapping this module's Conv2d class; kernel_size is converted into a 1-tuple before being passed on.
+
+    Args:
+        in_channels (int): Number of channels in the input signal.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int): Size of the convolving kernel.
+        stride (int, optional): Stride of the convolution. Default is 1.
+        padding (int, optional): Zero-padding added to both sides of the input. Default is 0.
+        dilation (int, optional): Spacing between kernel elements. Default is 1.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default is 1.
+        bias (bool, optional): If True, adds a learnable bias to the output. Default is True.
+
+    Returns:
+        Conv2d: A Conv2d layer configured to perform 1D convolution.
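+
+    Example (an illustrative sketch; the shapes below are assumptions, not taken from the patch):
+        >>> from tinygrad.tensor import Tensor
+        >>> conv = Conv1d(4, 8, 3)
+        >>> x = Tensor.randn(1, 4, 32)
+        >>> conv(x).shape  # (32 - 3) // 1 + 1 = 30 output positions
+        (1, 8, 30)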
+    """
     return Conv2d(
         in_channels,
         out_channels,
         (kernel_size,),
         stride,
         padding,
         dilation,
         groups,
         bias,
     )
@@ -86,6 +134,20 @@
 class Conv2d:
+    """
+    A 2D convolutional layer.
+
+    Attributes:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        kernel_size (tuple): Size of the convolving kernel.
+        stride (int): Stride of the convolution. Default is 1.
+        padding (int): Padding added to both sides of the input. Default is 0.
+        dilation (int): Spacing between kernel elements. Default is 1.
+        groups (int): Number of blocked connections from input channels to output channels. Default is 1.
+        bias (bool): If True, adds a learnable bias to the output. Default is True.
+    """
+
     def __init__(
         self,
         in_channels,
         out_channels,
         kernel_size,
         stride=1,
         padding=0,
         dilation=1,
         groups=1,
         bias=True,
     ):
@@ -116,6 +178,15 @@ class Conv2d:
         )

     def __call__(self, x: Tensor):
+        """
+        Forward pass through the layer.
+
+        Args:
+            x (Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            Tensor: Output tensor after convolution, of shape (batch_size, out_channels, new_height, new_width).
+        """
         return x.conv2d(
             self.weight,
             self.bias,
             groups=self.groups,
             stride=self.stride,
             dilation=self.dilation,
             padding=self.padding,
         )

     def initialize_weight(self, out_channels, in_channels, groups):
+        """
+        Initializes the weight tensor using the Kaiming uniform initialization method.
+
+        Args:
+            out_channels (int): Number of output channels.
+            in_channels (int): Number of input channels per group.
+            groups (int): Number of blocked connections from input channels to output channels.
+
+        Returns:
+            Tensor: Initialized weight tensor of shape (out_channels, in_channels // groups, *kernel_size).
+        """
         return Tensor.kaiming_uniform(
             out_channels, in_channels // groups, *self.kernel_size, a=math.sqrt(5)
         )
@@ -142,6 +224,36 @@ def ConvTranspose1d(
     groups=1,
     bias=True,
 ):
+    """
+    Creates a 1D transposed convolution layer by wrapping this module's ConvTranspose2d class; kernel_size is converted into a 1-tuple before being passed on.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        kernel_size (int): Size of the convolving kernel.
+        stride (int, optional): Stride of the convolution. Default is 1.
+        padding (int, optional): Zero-padding added to both sides of the input. Default is 0.
+        output_padding (int, optional): Additional size added to one side of the output shape. Default is 0.
+        dilation (int, optional): Spacing between kernel elements. Default is 1.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default is 1.
+        bias (bool, optional): If True, adds a learnable bias to the output. Default is True.
+
+    Returns:
+        ConvTranspose2d: A ConvTranspose2d layer configured to perform 1D transposed convolution.
+    """
     return ConvTranspose2d(
         in_channels,
         out_channels,
         (kernel_size,),
         stride,
         padding,
         output_padding,
         dilation,
         groups,
         bias,
     )
@@ -156,6 +268,21 @@
 class ConvTranspose2d(Conv2d):
+    """
+    A 2D transposed convolution layer, often used to upsample feature maps, for example in image segmentation tasks.
+
+    Attributes:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        kernel_size (tuple): The size of the convolving kernel.
+        stride (int): Stride of the convolution. Defaults to 1.
+        padding (int): Zero-padding added to both sides of the input. Defaults to 0.
+        output_padding (int): Additional size added to one side of the output shape. Defaults to 0.
+        dilation (int): Spacing between kernel elements. Defaults to 1.
+        groups (int): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool): If True, adds a learnable bias to the output. Defaults to True.
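+
+    Example (an illustrative sketch; the shapes below are assumptions, not taken from the patch):
+        >>> from tinygrad.tensor import Tensor
+        >>> up = ConvTranspose2d(8, 4, kernel_size=2, stride=2)
+        >>> x = Tensor.randn(1, 8, 16, 16)
+        >>> up(x).shape  # (16 - 1) * 2 + 2 - 2 * 0 + 0 = 32 per spatial dim
+        (1, 4, 32, 32)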
+    """
+
     def __init__(
         self,
         in_channels,
         out_channels,
         kernel_size,
         stride=1,
         padding=0,
         output_padding=0,
         dilation=1,
         groups=1,
         bias=True,
     ):
@@ -181,6 +308,17 @@ class ConvTranspose2d(Conv2d):
         self.output_padding = output_padding

     def __call__(self, x: Tensor):
+        """
+        Forward pass method for the ConvTranspose2d layer.
+
+        Args:
+            x (Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            Tensor: Output tensor of shape (batch_size, out_channels, height', width'), where, for the default dilation=1,
+                height' = (height - 1) * stride + kernel_size - 2 * padding + output_padding
+                width' = (width - 1) * stride + kernel_size - 2 * padding + output_padding
+        """
         return x.conv_transpose2d(
             self.weight,
             self.bias,
             groups=self.groups,
             stride=self.stride,
             dilation=self.dilation,
             padding=self.padding,
             output_padding=self.output_padding,
         )
@@ -192,13 +330,43 @@
     def initialize_weight(self, out_channels, in_channels, groups):
+        """
+        Initializes the weight tensor using the Kaiming uniform initialization method. Unlike Conv2d, the transposed layout is used: the leading dimension is in_channels.
+
+        Args:
+            out_channels (int): The number of output channels.
+            in_channels (int): The number of input channels.
+            groups (int): Number of blocked connections from input channels to output channels.
+
+        Returns:
+            Tensor: Initialized weight tensor with shape (in_channels, out_channels // groups, *kernel_size).
+        """
         return Tensor.kaiming_uniform(
             in_channels, out_channels // groups, *self.kernel_size, a=math.sqrt(5)
         )


 class Linear:
+    """
+    A linear (fully connected) transformation: y = x @ weight.T + bias.
+
+    Attributes:
+        weight (Tensor): The learnable weight tensor of shape (out_features, in_features).
+        bias (Optional[Tensor]): The learnable bias of shape (out_features,), or None if bias is disabled.
+    """
+
     def __init__(self, in_features, out_features, bias=True):
+        """
+        Initializes the Linear class with the given input and output features.
+
+        Args:
+            in_features (int): The number of input features.
+            out_features (int): The number of output features.
+            bias (bool, optional): If True, adds a learnable bias to the output. Defaults to True.
+        """
         self.weight = Tensor.kaiming_uniform(out_features, in_features, a=math.sqrt(5))
         # TODO: remove this once we can represent Tensor with int shape in typing
         assert isinstance(self.weight.shape[1], int), "does not support symbolic shape"
@@ -208,10 +376,30 @@ class Linear:
         )

     def __call__(self, x: Tensor):
+        """
+        Applies the linear transformation to the input tensor.
+
+        Args:
+            x (Tensor): The input tensor.
+
+        Returns:
+            Tensor: The output tensor after applying the linear transformation.
+        """
         return x.linear(self.weight.transpose(), self.bias)


 class GroupNorm:
+    """
+    Applies group normalization to the input tensor.
+
+    Attributes:
+        num_groups (int): The number of groups to separate the channels into.
+        num_channels (int): The number of channels in the input tensor.
+        eps (float): A small constant added to the variance to prevent division by zero. Default is 1e-5.
+        affine (bool): If True, this module has learnable per-channel affine parameters. Default is True.
+    """
+
     def __init__(
         self, num_groups: int, num_channels: int, eps: float = 1e-5, affine: bool = True
     ):
@@ -220,6 +408,15 @@ class GroupNorm:
         self.bias: Optional[Tensor] = Tensor.zeros(num_channels) if affine else None

     def __call__(self, x: Tensor):
+        """
+        Normalizes the input tensor.
+
+        Args:
+            x (Tensor): The input tensor of shape [batch_size, num_channels, ...].
+
+        Returns:
+            Tensor: The normalized tensor with the same shape as the input tensor.
+        """
         # reshape for layernorm to work as group norm
         # subtract mean and divide stddev
         x = (
             x.reshape(x.shape[0], self.num_groups, -1)
@@ -237,12 +434,30 @@
 class InstanceNorm:
+    """
+    Applies instance normalization to the input tensor.
+
+    Attributes:
+        num_features (int): The number of features (channels) in the input tensor.
+        eps (float): A small constant added to the variance to avoid division by zero. Default is 1e-5.
+        affine (bool): If True, apply learned per-channel scale and shift parameters. Default is True.
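+
+    Example (an illustrative sketch; the shapes below are assumptions, not taken from the patch):
+        >>> from tinygrad.tensor import Tensor
+        >>> norm = InstanceNorm(16)
+        >>> x = Tensor.randn(2, 16, 8, 8)
+        >>> norm(x).shape  # statistics are computed per sample and per channel
+        (2, 16, 8, 8)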
+    """
+
     def __init__(self, num_features: int, eps: float = 1e-5, affine: bool = True):
         self.num_features, self.eps = num_features, eps
         self.weight: Optional[Tensor] = Tensor.ones(num_features) if affine else None
         self.bias: Optional[Tensor] = Tensor.zeros(num_features) if affine else None

     def __call__(self, x: Tensor):
+        """
+        Normalizes the input tensor using instance normalization.
+
+        Args:
+            x (Tensor): The input tensor to be normalized.
+
+        Returns:
+            Tensor: The normalized tensor, with the same shape as the input. If affine is False, the result is returned without the learned scale and shift.
+        """
         x = (
             x.reshape(x.shape[0], self.num_features, -1)
             .layernorm(eps=self.eps)
             .reshape(x.shape)
         )
         if self.weight is None or self.bias is None:
             return x
@@ -256,12 +471,29 @@
 class LayerNorm:
+    """
+    Implements layer normalization.
+
+    Attributes:
+        normalized_shape (Union[int, Tuple[int, ...]]): The shape of the trailing dimensions of the input over which statistics are computed.
+        eps (float): A small constant added to the variance to prevent division by zero. Default is 1e-5.
+        elementwise_affine (bool): If True, apply learned scale and shift parameters to the output. Default is True.
+    """
+
     def __init__(
         self,
         normalized_shape: Union[int, Tuple[int, ...]],
         eps: float = 1e-5,
         elementwise_affine: bool = True,
     ):
+        """
+        Initializes the layer normalization instance.
+
+        Args:
+            normalized_shape (Union[int, Tuple[int, ...]]): The shape of the trailing dimensions of the input over which statistics are computed.
+            eps (float): A small constant added to the variance to prevent division by zero. Default is 1e-5.
+            elementwise_affine (bool): If True, apply learned scale and shift parameters to the output. Default is True.
+        """
         self.normalized_shape = (
             (normalized_shape,)
             if isinstance(normalized_shape, int)
@@ -279,6 +511,19 @@ class LayerNorm:
         )

     def __call__(self, x: Tensor):
+        """
+        Normalizes the input tensor using layer normalization.
+
+        Args:
+            x (Tensor): The input tensor to be normalized.
+
+        Returns:
+            Tensor: The normalized tensor. If elementwise_affine is True, the output is scaled by the learned weight and shifted by the learned bias.
+
+        Raises:
+            AssertionError: If the last dimensions of x do not match normalized_shape.
+        """
         assert (
             self.normalized_shape == x.shape[-len(self.normalized_shape) :]
         ), f"last dimensions of {x.shape} must match {self.normalized_shape}"
@@ -289,16 +534,72 @@
 class LayerNorm2d(LayerNorm):
+    """
+    LayerNorm2d class for 2D layer normalization.
+
+    A subclass of LayerNorm that normalizes the channel dimension of NCHW tensors by permuting to NHWC, applying layer normalization, and permuting back.
+    """
+
     def __call__(self, x):
+        """
+        Normalizes the input tensor x.
+
+        The dimensions of x are permuted from NCHW to NHWC, the parent class's __call__ method is applied to the result, and the output is permuted back to NCHW.
+
+        Args:
+            x (Tensor): The input tensor to be normalized.
+
+        Returns:
+            Tensor: The normalized tensor, with its dimensions permuted back to the input layout.
+        """
         return super().__call__(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)


 class Embedding:
+    """
+    Embedding class for word embeddings.
+
+    Converts input indices into their corresponding embedding vectors. It has methods for initializing the weights and performing the embedding lookups.
+
+    Attributes:
+        vocab_size (int): The size of the vocabulary.
+        weight (Tensor): The embedding weight matrix of shape (vocab_size, embed_size).
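+
+    Example (an illustrative sketch; the sizes below are assumptions, not taken from the patch):
+        >>> from tinygrad.tensor import Tensor
+        >>> emb = Embedding(vocab_size=1000, embed_size=64)
+        >>> idx = Tensor([[1, 2, 3]])
+        >>> emb(idx).shape  # one 64-dimensional vector per index
+        (1, 3, 64)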
+ """ + def __init__(self, vocab_size: int, embed_size: int): + """ + Initialize the Embedding object. + + This method initializes the vocab_size and embed_size attributes, as well as the weight tensor that holds the + embedding weights. The weight tensor is initialized using Glorot uniform initialization. + + Args: + vocab_size (int): The size of the vocabulary. + embed_size (int): The size of the embedding vectors. + """ self.vocab_size = vocab_size self.weight = Tensor.glorot_uniform(vocab_size, embed_size) def __call__(self, idx: Tensor) -> Tensor: + """ + Perform the embedding lookup for the input indices. + + This method performs an embedding lookup on the input tensor idx by creating a binary matrix that has ones at + positions where the vocabulary index matches the corresponding index in idx, and zeros elsewhere. This matrix is + then used to select weight vectors from the weight tensor using matrix multiplication. + + Args: + idx (Tensor): The input tensor of indices to be looked up in the embedding. + + Returns: + Tensor: The output tensor after performing the embedding lookup. + """ if not hasattr(self, "vocab_counter"): self.vocab_counter = Tensor.arange( self.vocab_size, requires_grad=False