nn init docstrings

2023-12-06 14:45:18 -07:00 · 2023-12-06 14:45:18 -07:00 · 35b37b1a5a
parent 45dbac0a02
commit 35b37b1a5a
1 changed files with 301 additions and 0 deletions
--- a/tinygrad/nn/init.py
+++ b/tinygrad/nn/init.py
@ -6,6 +6,17 @@ from tinygrad.nn import optim, state  # noqa: F401


 class BatchNorm2d:
+    """
+    This class is for 2D batch normalization. It normalizes the input tensor x using the calculated mean and variance, then scales and shifts the result when affine is enabled.
+
+    Attributes:
+        sz (int): The number of features to normalize (presumably the size of the channel dimension of the input).
+        eps (float): A small constant added to the variance to prevent division by zero. Default is 1e-5.
+        affine (bool): If True, the layer has learnable scale and bias parameters that are applied after normalization. Default is True.
+        track_running_stats (bool): A boolean value that when set to True, this module tracks the running mean and variance. Default is True.
+        momentum (float): The value used for the moving average of the mean and variance. Default is 0.1.
+    """
+
    def __init__(
        self, sz: int, eps=1e-5, affine=True, track_running_stats=True, momentum=0.1
    ):
@ -26,6 +37,15 @@ class BatchNorm2d:
        self.num_batches_tracked = Tensor.zeros(1, requires_grad=False)

    def __call__(self, x: Tensor):
+        """
+        This function performs the forward pass of the batch normalization layer.
+
+        Args:
+            x (Tensor): The input tensor to be normalized.
+
+        Returns:
+            Tensor: The normalized output tensor.
+        """
        if Tensor.training:
            # This requires two full memory accesses to x
            # https://github.com/pytorch/pytorch/blob/c618dc13d2aa23625cb0d7ada694137532a4fa33/aten/src/ATen/native/cuda/Normalization.cuh
@ -73,6 +93,34 @@ def Conv1d(
    groups=1,
    bias=True,
 ):
+    """
+    Function Signature: Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True)
+
+    **Description**:
+    This function is a thin wrapper around the Conv2d layer defined in this module (it does not use PyTorch). It takes in parameters like in_channels, out_channels, kernel_size, stride, padding, dilation, groups, and bias. The kernel_size is automatically converted into a tuple of size 1 for the Conv2d layer.
+
+    **Parameters**:
+
+    :param in_channels: Number of channels in the input image
+    :type in_channels: int
+    :param out_channels: Number of channels produced by the convolution
+    :type out_channels: int
+    :param kernel_size: Size of the convolving kernel
+    :type kernel_size: int
+    :param stride: Stride of the convolution, default is 1
+    :type stride: int, optional
+    :param padding: Zero-padding added to both sides of the input, default is 0
+    :type padding: int, optional
+    :param dilation: Spacing between the kernel points, default is 1
+    :type dilation: int, optional
+    :param groups: Number of blocked connections from input channels to output channels, default is 1
+    :type groups: int, optional
+    :param bias: If True, adds a learnable bias to the output, default is True
+    :type bias: bool, optional
+
+    **Returns**:
+    The Conv2d layer with converted parameters
+    """
    return Conv2d(
        in_channels,
        out_channels,
@ -86,6 +134,20 @@ def Conv1d(


 class Conv2d:
+    """
+    This class defines a 2D convolutional layer.
+
+    Attributes:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        kernel_size (tuple): Size of the convolving kernel.
+        stride (int): Stride of the convolution. Default is 1.
+        padding (int): Padding added to both sides of the input. Default is 0.
+        dilation (int): Spacing between kernel elements. Default is 1.
+        groups (int): Number of blocked connections from input channels to output channels. Default is 1.
+        bias (bool): If True, adds a learnable bias to the output. Default is True.
+    """
+
    def __init__(
        self,
        in_channels,
@ -116,6 +178,15 @@ class Conv2d:
        )

    def __call__(self, x: Tensor):
+        """
+        Forward pass through the layer.
+
+        Args:
+            x (Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
+            Tensor: Output tensor after convolution of shape (batch_size, out_channels, new_height, new_width).
+        """
        return x.conv2d(
            self.weight,
            self.bias,
@ -126,6 +197,17 @@ class Conv2d:
        )

    def initialize_weight(self, out_channels, in_channels, groups):
+        """
+        Initialize the weight tensor using the Kaiming uniform initialization method.
+
+        Args:
+            out_channels (int): Number of output channels.
+            in_channels (int): Total number of input channels; it is divided by groups to obtain the per-group count.
+            groups (int): Number of blocked connections from input channels to output channels.
+
+        Returns:
+            Tensor: Initialized weight tensor of shape (out_channels, in_channels // groups, *kernel_size).
+        """
        return Tensor.kaiming_uniform(
            out_channels, in_channels // groups, *self.kernel_size, a=math.sqrt(5)
        )
@ -142,6 +224,36 @@ def ConvTranspose1d(
    groups=1,
    bias=True,
 ):
+    """
+    Function Signature: ConvTranspose1d(in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, dilation=1, groups=1, bias=True)
+
+    **Description**:
+        This function creates a 1D convolutional transpose layer.
+
+    **Parameters**:
+        Arguments:
+            in_channels : int
+                Number of input channels.
+            out_channels : int
+                Number of output channels.
+            kernel_size : int
+                Size of the convolving kernel.
+            stride : int, default=1
+                Stride of the convolution.
+            padding : int, default=0
+                Zero-padding added to both sides of the input.
+            output_padding : int, default=0
+                Additional size added to one side of the output shape.
+            dilation : int, default=1
+                Spacing between kernel elements.
+            groups : int, default=1
+                Number of blocked connections from input channels to output channels.
+            bias : bool, default=True
+                If True, adds a learnable bias to the output.
+
+    **Returns**:
+        ConvTranspose2d object with specified attributes.
+    """
    return ConvTranspose2d(
        in_channels,
        out_channels,
@ -156,6 +268,21 @@ def ConvTranspose1d(


 class ConvTranspose2d(Conv2d):
+    """
+    This class defines a 2D transposed convolution layer, often used in image segmentation tasks.
+
+    Attributes:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        kernel_size (tuple): The size of the convolving kernel.
+        stride (int): Stride of the convolution. Defaults to 1.
+        padding (int): Zero-padding added to both sides of the input. Defaults to 0.
+        output_padding (int): Additional size added to one side of the output shape. Defaults to 0.
+        dilation (int): Spacing between kernel elements. Defaults to 1.
+        groups (int): Number of blocked connections from input channels to output channels. Defaults to 1.
+        bias (bool): If True, adds a learnable bias to the output. Defaults to True.
+    """
+
    def __init__(
        self,
        in_channels,
@ -181,6 +308,17 @@ class ConvTranspose2d(Conv2d):
        self.output_padding = output_padding

    def __call__(self, x: Tensor):
+        """
+        Forward pass method for the ConvTranspose2d layer.
+
+        Args:
+            x (Tensor): Input tensor of shape (batch_size, in_channels, height, width).
+
+        Returns:
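+            Tensor: Output tensor of shape (batch_size, out_channels, height', width'). Assuming the same convention as
+                PyTorch's ConvTranspose2d, height' and width' are computed by:
+                height' = (height - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1
+                width' = (width - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1
+                With dilation=1 this reduces to (size - 1) * stride + kernel_size - 2 * padding + output_padding.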
+        """
        return x.conv_transpose2d(
            self.weight,
            self.bias,
@ -192,13 +330,43 @@ class ConvTranspose2d(Conv2d):
        )

    def initialize_weight(self, out_channels, in_channels, groups):
+        """
+        Initializes the weight tensor using the Kaiming uniform initialization method.
+
+        Args:
+            out_channels (int): The number of output channels.
+            in_channels (int): The number of input channels.
+            groups (int): Number of blocked connections from input channels to output channels.
+
+        Returns:
+            Tensor: Initialized weight tensor with shape (in_channels, out_channels // groups, *kernel_size).
+        """
        return Tensor.kaiming_uniform(
            in_channels, out_channels // groups, *self.kernel_size, a=math.sqrt(5)
        )


 class Linear:
+    """
+    A class representing a linear transformation.
+
+    Attributes:
+        in_features (int): The number of input features.
+        out_features (int): The number of output features.
+        bias (bool, optional): If True, adds a learnable bias to the output. Defaults to True.
+
+    """
+
    def __init__(self, in_features, out_features, bias=True):
+        """
+        Initializes the Linear class with the given input and output features.
+
+        Args:
+            in_features (int): The number of input features.
+            out_features (int): The number of output features.
+            bias (bool, optional): If True, adds a learnable bias to the output. Defaults to True.
+
+        """
        self.weight = Tensor.kaiming_uniform(out_features, in_features, a=math.sqrt(5))
        # TODO: remove this once we can represent Tensor with int shape in typing
        assert isinstance(self.weight.shape[1], int), "does not support symbolic shape"
@ -208,10 +376,30 @@ class Linear:
        )

    def __call__(self, x: Tensor):
+        """
+        Applies the linear transformation to the input tensor.
+
+        Args:
+            x (Tensor): The input tensor.
+
+        Returns:
+            Tensor: The output tensor after applying the linear transformation.
+
+        """
        return x.linear(self.weight.transpose(), self.bias)


 class GroupNorm:
+    """
+    Apply group normalization to the input tensor.
+
+    Attributes:
+        num_groups (int): The number of groups to separate the channels into.
+        num_channels (int): The number of channels in the input tensor.
+        eps (float): A small constant added to the variance to prevent division by zero. Default is 1e-5.
+        affine (bool): A boolean value that when set to True, this module has learnable affine parameters. Default is True.
+    """
+
    def __init__(
        self, num_groups: int, num_channels: int, eps: float = 1e-5, affine: bool = True
    ):
@ -220,6 +408,15 @@ class GroupNorm:
        self.bias: Optional[Tensor] = Tensor.zeros(num_channels) if affine else None

    def __call__(self, x: Tensor):
+        """
+        Normalize the input tensor.
+
+        Args:
+            x (Tensor): The input tensor of shape [batch_size, num_channels, ...].
+
+        Returns:
+            Tensor: The normalized tensor with the same shape as the input tensor.
+        """
        # reshape for layernorm to work as group norm
        # subtract mean and divide stddev
        x = (
@ -237,12 +434,30 @@ class GroupNorm:


 class InstanceNorm:
+    """
+    Class that implements Instance Normalization for tensors.
+
+    Attributes:
+        num_features (int): The number of features in the tensor.
+        eps (float): A small constant passed to layernorm for numerical stability (presumably added to the variance to avoid division by zero). Default is 1e-5.
+        affine (bool): If True, apply learned scale and shift parameters. Default is True.
+    """
+
    def __init__(self, num_features: int, eps: float = 1e-5, affine: bool = True):
        self.num_features, self.eps = num_features, eps
        self.weight: Optional[Tensor] = Tensor.ones(num_features) if affine else None
        self.bias: Optional[Tensor] = Tensor.zeros(num_features) if affine else None

    def __call__(self, x: Tensor):
+        """
+        Normalize the input tensor using instance normalization.
+
+        Args:
+            x (Tensor): The input tensor to be normalized.
+
+        Returns:
+            Tensor: The normalized tensor. If affine is True, the learned scale and shift are applied to it; if affine is False, the normalized tensor is returned without them.
+        """
        x = (
            x.reshape(x.shape[0], self.num_features, -1)
            .layernorm(eps=self.eps)
@ -256,12 +471,29 @@ class InstanceNorm:


 class LayerNorm:
+    """
+    Implements layer normalization.
+
+    Attributes:
+        normalized_shape (Union[int, Tuple[int, ...]]): The shape of the trailing dimensions of the input over which normalization is applied. An int is stored as a one-element tuple.
+        eps (float): A small constant added to the variance to prevent division by zero. Default is 1e-5.
+        elementwise_affine (bool): If True, apply learned scale and shift parameters to the output. Default is True.
+    """
+
    def __init__(
        self,
        normalized_shape: Union[int, Tuple[int, ...]],
        eps: float = 1e-5,
        elementwise_affine: bool = True,
    ):
+        """
+        Initializes the layer normalization instance.
+
+        Args:
+            normalized_shape (Union[int, Tuple[int, ...]]): The shape of the trailing dimensions of the input over which normalization is applied. An int is converted to a one-element tuple.
+            eps (float): A small constant added to the variance to prevent division by zero. Default is 1e-5.
+            elementwise_affine (bool): If True, apply learned scale and shift parameters to the output. Default is True.
+        """
        self.normalized_shape = (
            (normalized_shape,)
            if isinstance(normalized_shape, int)
@ -279,6 +511,19 @@ class LayerNorm:
        )

    def __call__(self, x: Tensor):
+        """
+        Normalizes the input tensor using layer normalization.
+
+        Args:
+            x (Tensor): The input tensor to be normalized.
+
+        Returns:
+            Tensor: The normalized tensor. If elementwise_affine is True, then the output tensor is multiplied by
+                    learned scale and added to the shift parameter.
+
+        Raises:
+            AssertionError: If the last dimensions of x do not match normalized_shape.
+        """
        assert (
            self.normalized_shape == x.shape[-len(self.normalized_shape) :]
        ), f"last dimensions of {x.shape} must match {self.normalized_shape}"
@ -289,16 +534,72 @@ class LayerNorm:


 class LayerNorm2d(LayerNorm):
+    """
+    LayerNorm2d class for 2D Layer Normalization.
+
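+    This class is a subclass of the LayerNorm class. Its __call__ method moves dimension 1 of the input tensor x to the
+    last position, applies the parent class's normalization, and then restores the original dimension order.
+
+    Attributes:
+        None beyond those inherited from LayerNorm. The tensor x is an argument of __call__, not an attribute.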
+    """
+
    def __call__(self, x):
+        """
+        Normalize the input tensor x.
+
+        This method is used to normalize the input tensor x by permuting its dimensions and calling the parent class's
+        __call__ method on the result. The dimensions are then permuted back for the output.
+
+        Args:
+            x (Tensor): The input tensor to be normalized.
+
+        Returns:
+            Tensor: The normalized tensor after permuting its dimensions back.
+        """
        return super().__call__(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)


 class Embedding:
+    """
+    Embedding class for word embeddings.
+
+    This class is used to convert input indices into their corresponding embedding vectors. It has methods for initializing
+    the weights and performing the embedding lookups.
+
+    Attributes:
+        vocab_size (int): The size of the vocabulary.
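+        embed_size (int): The size of the embedding vectors. This is a constructor argument that sets the second dimension of weight; it is not stored as a separate attribute.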
+        weight (Tensor): The tensor containing the embedding weights.
+    """
+
    def __init__(self, vocab_size: int, embed_size: int):
+        """
+        Initialize the Embedding object.
+
+        This method stores the vocab_size attribute and creates the weight tensor that holds the embedding weights.
+        The weight tensor is created from (vocab_size, embed_size) using Glorot uniform initialization.
+
+        Args:
+            vocab_size (int): The size of the vocabulary.
+            embed_size (int): The size of the embedding vectors.
+        """
        self.vocab_size = vocab_size
        self.weight = Tensor.glorot_uniform(vocab_size, embed_size)

    def __call__(self, idx: Tensor) -> Tensor:
+        """
+        Perform the embedding lookup for the input indices.
+
+        This method performs an embedding lookup on the input tensor idx by creating a binary matrix that has ones at
+        positions where the vocabulary index matches the corresponding index in idx, and zeros elsewhere. This matrix is
+        then used to select weight vectors from the weight tensor using matrix multiplication.
+
+        Args:
+            idx (Tensor): The input tensor of indices to be looked up in the embedding.
+
+        Returns:
+            Tensor: The output tensor after performing the embedding lookup.
+        """
        if not hasattr(self, "vocab_counter"):
            self.vocab_counter = Tensor.arange(
                self.vocab_size, requires_grad=False