more tensor docstrings

parent 67a6e588fb
commit e76c916978
@@ -51,6 +51,7 @@ class Function:
         requires_grad (Union[bool, None]): Indicates whether the output tensor requires gradient computation.
         parents (List[Tensor]): The parent tensors for which gradients can be computed.
     """

     def __init__(self, device: str, *tensors: Tensor):
         self.device = device
         self.needs_input_grad = [t.requires_grad for t in tensors]
|
@@ -113,7 +114,7 @@ import tinygrad.mlops as mlops


 class Tensor:
-    '''
+    """
     This class represents a tensor, which is the fundamental unit of data in tinygrad.
     It can be used for various mathematical operations and machine learning applications.
@@ -123,7 +124,8 @@ class Tensor:
         training (ClassVar[bool]): Class variable to track if the tensor is in training mode or not.
         no_grad (ClassVar[bool]): Class variable to track if gradient computation is disabled or not.
         default_type (ClassVar[DType]): Default data type for tensors.
-    '''
+    """

     __slots__ = "lazydata", "requires_grad", "grad", "_ctx"
     __deletable__ = ("_ctx",)
     training: ClassVar[bool] = False
@@ -148,7 +150,7 @@ class Tensor:
         dtype: Optional[DType] = None,
         requires_grad: Optional[bool] = None,
     ):
-        '''
+        """
         Constructs a new tensor from the given data with the specified device and data type.

         Args:
@@ -156,7 +158,7 @@ class Tensor:
             device (Optional[str]): Device where the tensor will be stored.
             dtype (Optional[DType]): Data type of the tensor.
             requires_grad (Optional[bool]): Flag indicating if gradient computation is required or not.
-        '''
+        """
         assert dtype is None or isinstance(dtype, DType), f"invalid dtype {dtype}"
         device = Device.canonicalize(device)
         # tensors have gradients, buffers do not
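For reference, constructing a tensor with these arguments looks like this (a minimal usage sketch; the data and flags are illustrative, not taken from the diff):

from tinygrad.tensor import Tensor

t = Tensor([1.0, 2.0, 3.0], requires_grad=True)  # device and dtype fall back to the defaults when omitted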
|
@@ -820,6 +822,7 @@ class Tensor:
         Returns:
             List[Tensor]: A list of tensors in topological order (deepest first).
         """

         def _deepwalk(node, visited, nodes):
             visited.add(node)
             if getattr(node, "_ctx", None):
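The walk is a depth-first post-order over `_ctx.parents`; a self-contained sketch of the same idea (names assumed for illustration, not the in-tree code):

def deepwalk(node):
    visited, order = set(), []
    def walk(n):
        visited.add(n)
        if getattr(n, "_ctx", None):
            for parent in n._ctx.parents:
                if parent not in visited:
                    walk(parent)
            order.append(n)  # appended only after its parents, so parents land first
    walk(node)
    return order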
|
@@ -1043,6 +1046,7 @@ class Tensor:
         Returns:
             Tensor: The tensor item corresponding to the given index or indices.
         """

         def normalize_int(e, i, dim_sz):
             """
             Normalize an integer index based on its dimension size.
@@ -1055,6 +1059,7 @@ class Tensor:
         Returns:
             int: The normalized integer index.
         """

         def normalize_int(e, i, dim_sz):
             """
             Normalize an integer index based on its dimension size.
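Normalizing here means mapping Python-style negative indices into range; a hedged sketch of the behaviour the docstring describes (the real helper may differ in details such as the error message):

def normalize_int(e, i, dim_sz):
    if -dim_sz <= e < dim_sz:
        return e % dim_sz  # e.g. index -1 on a size-4 dimension becomes 3
    raise IndexError(f"index {e} is out of bounds for dimension {i} with size {dim_sz}")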
|
@@ -1937,12 +1942,12 @@ class Tensor:
     def max_pool2d(self, kernel_size=(2, 2), stride=None, dilation=1):
         """
         Perform a max pooling operation on the input tensor.

         Args:
             kernel_size (tuple): The size of the sliding window for each dimension of the input tensor. Default is (2, 2).
             stride (tuple or None): The stride of the sliding window for each dimension of the input tensor. If not provided, it defaults to kernel_size.
             dilation (int): The spacing between the kernel points. Default is 1.

         Returns:
             Tensor: The max pooled tensor.
         """
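A short usage sketch for the signature above (input laid out as (batch, channels, height, width), as in the surrounding conv code):

from tinygrad.tensor import Tensor

x = Tensor.ones(1, 1, 4, 4)
y = x.max_pool2d(kernel_size=(2, 2))  # stride defaults to kernel_size, so this yields shape (1, 1, 2, 2)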
|
@@ -2106,9 +2111,9 @@ class Tensor:
         def apply_matrix(mat, t, dim=0):
             """
             Apply a 3x3 matrix to a 4x4 matrix in Winograd's F(4x4,3x3) algorithm.

             This method is used for applying a 3x3 matrix to a 4x4 matrix as part of the Winograd F(4x4,3x3) convolution algorithm. The function recursively applies the transformation until it reaches the specified dimension.

             :param mat: A list of lists representing the 3x3 matrix.
             :type mat: List[List[int]]
             :param t: A tensor to which the matrix will be applied.
@@ -2117,7 +2122,7 @@ class Tensor:
             :type dim: int, optional
             :return: The transformed tensor after applying the matrix.
             :rtype: Tensor

             Attributes:
                 HWI (tuple): A tuple representing the input size of Winograd's F(4x4,3x3) algorithm. Default is (6,).
                 HWO (tuple): A tuple representing the output size of Winograd's F(4x4,3x3) algorithm. Default is (4,).
@@ -2299,12 +2304,12 @@ class Tensor:
         def fix(x: Tensor):
             """
             Fix tensor by reshaping and transposing it.

             This function takes a tensor x as input, reshapes it based on the dimensions of 'ret'
             except for the last two dimensions, multiplies these dimensions together with the product
             of the last two dimensions of 'ret', and finally transposes the tensor based on the axis
             dimension.

             :param x: The input tensor to be fixed.
             :type x: Tensor
             :return: The reshaped, sliced, and transposed tensor.
@@ -2612,7 +2617,7 @@ class Tensor:
     def sign(self):
         """
         Calculate and return the element-wise sign of the tensor.

         For each element in the tensor, this function determines whether it is positive or negative, assigning 1 to positive elements and -1 to negative elements. The result is returned as a new tensor with the same shape as the original tensor.

         Returns:
@@ -2623,7 +2628,7 @@ class Tensor:
     def reciprocal(self):
         """
         Calculate and return the element-wise reciprocal of the tensor.

         For each element in the tensor, this function calculates its reciprocal (1 divided by the element value). The result is returned as a new tensor with the same shape as the original tensor.

         Returns:
@@ -2634,45 +2639,250 @@ class Tensor:
     # ***** activation functions (unary) *****

     def elu(self, alpha=1.0):
+        """
+        Calculate the Exponential Linear Unit (ELU) activation function.
+
+        This method calculates the ELU function for each element in `self`. The ELU function is defined as:
+            f(x) = alpha * (exp(x) - 1)    if x <= 0
+            f(x) = x                       if x > 0
+
+        Parameters:
+            alpha (float): A scaling factor for the negative part of the function, default is 1.0.
+
+        Returns:
+            Tensor: The transformed tensor after applying the ELU function element-wise.
+
+        Attributes:
+            relu (method): A method that applies the Rectified Linear Unit (ReLU) function to the data in `self`. ReLU replaces all negative values with zero and keeps positive values unchanged.
+            exp (method): A method that computes the exponential of all elements in `self`. The exponential is applied element-wise.
+        """
         return self.relu() - alpha * (1 - self.exp()).relu()

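The implementation composes two relus rather than branching; a quick plain-Python check (not tinygrad code) that the composed form matches the piecewise definition:

import math

relu = lambda v: max(0.0, v)
elu_ref = lambda x, a=1.0: x if x > 0 else a * (math.exp(x) - 1)
elu_via_relu = lambda x, a=1.0: relu(x) - a * relu(1 - math.exp(x))

for x in (-2.0, -0.5, 0.0, 0.5, 2.0):
    assert abs(elu_ref(x) - elu_via_relu(x)) < 1e-12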
     def celu(self, alpha=1.0):
+        """
+        Calculate the Continuously Differentiable Exponential Linear Unit (CELU) activation function.
+
+        This method calculates the CELU function for each element in `self`. The CELU function is defined as:
+            f(x) = alpha * (exp(x / alpha) - 1)    if x <= 0
+            f(x) = x                               if x > 0
+
+        Parameters:
+            alpha (float): A scaling factor for the negative part of the function, default is 1.0.
+
+        Returns:
+            Tensor: The transformed tensor after applying the CELU function element-wise.
+
+        Attributes:
+            maximum (method): A method that takes the element-wise maximum of `self` and another tensor or scalar.
+            exp (method): A method that computes the exponential of all elements in `self`. The exponential is applied element-wise.
+            minimum (method): A method that takes the element-wise minimum of `self` and another tensor or scalar.
+        """
         return self.maximum(0) + (alpha * ((self / alpha).exp() - 1)).minimum(0)

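The same style of check for CELU: the max/min composition agrees with the piecewise definition.

import math

celu_ref = lambda x, a=1.0: x if x > 0 else a * (math.exp(x / a) - 1)
celu_via_minmax = lambda x, a=1.0: max(x, 0.0) + min(a * (math.exp(x / a) - 1), 0.0)

for x in (-2.0, -0.5, 0.0, 0.5, 2.0):
    assert abs(celu_ref(x) - celu_via_minmax(x)) < 1e-12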
     def swish(self):
+        """
+        Calculate the Swish activation function.
+
+        This method calculates the Swish function for each element in `self`. The Swish function is defined as:
+            f(x) = x * sigmoid(x)
+
+        Returns:
+            Tensor: The transformed tensor after applying the Swish function element-wise.
+
+        Attributes:
+            sigmoid (method): A method that applies the Sigmoid function to the data in `self`. The Sigmoid function is defined as:
+                f(x) = 1 / (1 + exp(-x))
+        """
         return self * self.sigmoid()

     def silu(self):
+        """
+        Calculate the Sigmoid-Weighted Linear Unit (SiLU) activation function, also known as the swish function.
+
+        This method calculates the SiLU function for each element in `self` via the Swish function. The SiLU function is defined as:
+            f(x) = x * sigmoid(x)
+
+        Returns:
+            Tensor: The transformed tensor after applying the SiLU function element-wise.
+
+        Attributes:
+            swish (method): A method that applies the Swish function to the data in `self`. The Swish function is defined as:
+                f(x) = x * sigmoid(x)
+        """
         return self.swish()  # The SiLU function is also known as the swish function.

     def relu6(self):
+        """
+        Calculate the Rectified Linear Unit 6 (ReLU6) activation function.
+
+        This method calculates the ReLU6 function for each element in `self`. The ReLU6 function is defined as:
+            f(x) = min(max(0, x), 6)
+
+        Returns:
+            Tensor: The transformed tensor after applying the ReLU6 function element-wise.
+
+        Attributes:
+            relu (method): A method that applies the Rectified Linear Unit (ReLU) function to the data in `self`. The ReLU function is defined as:
+                f(x) = max(0, x)
+        """
         return self.relu() - (self - 6).relu()

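Here too the clamp is built from two relus; a small plain-Python check of the identity relu(x) - relu(x - 6) == min(max(0, x), 6):

relu = lambda v: max(0.0, v)
for x in (-1.0, 0.0, 3.0, 6.0, 9.0):
    assert relu(x) - relu(x - 6) == min(max(0.0, x), 6.0)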
     def hardswish(self):
+        """
+        Calculate the Hard Swish activation function.
+
+        This method calculates the Hard Swish function for each element in `self`. The Hard Swish function is defined as:
+            f(x) = x * min(max(x + 3, 0), 6) / 6
+
+        Returns:
+            Tensor: The transformed tensor after applying the Hard Swish function element-wise.
+
+        Attributes:
+            relu6 (method): A method that applies the Rectified Linear Unit 6 (ReLU6) function to the data in `self`. The ReLU6 function is defined as:
+                f(x) = min(max(0, x), 6)
+        """
         return self * (self + 3).relu6() * (1 / 6)

     def tanh(self):
+        """
+        Calculate the Hyperbolic Tangent (tanh) activation function.
+
+        This method calculates the tanh function for each element in `self`, using the identity:
+            tanh(x) = 2 * sigmoid(2 * x) - 1
+
+        Returns:
+            Tensor: The transformed tensor after applying the tanh function element-wise.
+
+        Attributes:
+            sigmoid (method): A method that applies the Sigmoid function to the data in `self`. The Sigmoid function is defined as:
+                f(x) = 1 / (1 + exp(-x))
+        """
         return 2.0 * ((2.0 * self).sigmoid()) - 1.0

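A numeric check of that identity against math.tanh:

import math

sigmoid = lambda v: 1 / (1 + math.exp(-v))
for x in (-3.0, -1.0, 0.0, 1.0, 3.0):
    assert abs(math.tanh(x) - (2 * sigmoid(2 * x) - 1)) < 1e-12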
     def sinh(self):
+        """
+        Calculate the Hyperbolic Sine (sinh) activation function.
+
+        This method calculates the sinh function for each element in `self`. The sinh function is defined as:
+            f(x) = (exp(x) - exp(-x)) / 2
+
+        Returns:
+            Tensor: The transformed tensor after applying the sinh function element-wise.
+
+        Attributes:
+            exp (method): A method that applies the Exponential function to the data in `self`. The Exponential function is defined as:
+                f(x) = e^x
+            neg (method): A method that applies the Negation operation to the data in `self`. The Negation operation returns an element-wise negative of `self`.
+        """
         return (self.exp() - self.neg().exp()) / 2

     def cosh(self):
+        """
+        Calculate the Hyperbolic Cosine (cosh) activation function.
+
+        This method calculates the cosh function for each element in `self`. The cosh function is defined as:
+            f(x) = (exp(x) + exp(-x)) / 2
+
+        Returns:
+            Tensor: The transformed tensor after applying the cosh function element-wise.
+
+        Attributes:
+            exp (method): A method that applies the Exponential function to the data in `self`. The Exponential function is defined as:
+                f(x) = e^x
+            neg (method): A method that applies the Negation operation to the data in `self`. The Negation operation returns an element-wise negative of `self`.
+        """
         return (self.exp() + self.neg().exp()) / 2

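Both exp-based forms match the math module directly:

import math

for x in (-2.0, 0.0, 1.5):
    assert abs(math.sinh(x) - (math.exp(x) - math.exp(-x)) / 2) < 1e-9
    assert abs(math.cosh(x) - (math.exp(x) + math.exp(-x)) / 2) < 1e-9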
     def atanh(self):
+        """
+        Calculate the Inverse Hyperbolic Tangent (atanh) activation function.
+
+        This method calculates the atanh function for each element in `self`. The atanh function is defined as:
+            f(x) = log((1 + x) / (1 - x)) / 2
+
+        Returns:
+            Tensor: The transformed tensor after applying the atanh function element-wise.
+
+        Attributes:
+            log (method): A method that applies the Natural Logarithm function to the data in `self`. The Natural Logarithm function is defined as:
+                f(x) = ln(x)
+        """
         return ((1 + self) / (1 - self)).log() / 2

     def asinh(self):
+        """
+        Calculate the Inverse Hyperbolic Sine (asinh) activation function.
+
+        This method calculates the asinh function for each element in `self`. The asinh function is defined as:
+            f(x) = log(x + sqrt(1 + x^2))
+
+        Returns:
+            Tensor: The transformed tensor after applying the asinh function element-wise.
+
+        Attributes:
+            log (method): A method that applies the Natural Logarithm function to the data in `self`. The Natural Logarithm function is defined as:
+                f(x) = ln(x)
+            square (method): A method that squares each element in `self`. The Square operation returns an element-wise square of `self`.
+            sqrt (method): A method that applies the Square Root function to the data in `self`. The Square Root function is defined as:
+                f(x) = sqrt(x)
+        """
         return (self + (self.square() + 1).sqrt()).log()

     def acosh(self):
+        """
+        Calculate the Inverse Hyperbolic Cosine (acosh) activation function.
+
+        This method calculates the acosh function for each element in `self`. The acosh function is defined as:
+            f(x) = log(x + sqrt((x - 1) * (x + 1)))
+
+        Returns:
+            Tensor: The transformed tensor after applying the acosh function element-wise.
+
+        Attributes:
+            log (method): A method that applies the Natural Logarithm function to the data in `self`. The Natural Logarithm function is defined as:
+                f(x) = ln(x)
+            square (method): A method that squares each element in `self`. The Square operation returns an element-wise square of `self`.
+            sqrt (method): A method that applies the Square Root function to the data in `self`. The Square Root function is defined as:
+                f(x) = sqrt(x)
+        """
         return (self + (self.square() - 1).sqrt()).log()

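The three log-based inverse forms agree with the math module on their domains (atanh on (-1, 1), acosh on [1, inf)):

import math

for x in (-0.9, 0.0, 0.9):
    assert abs(math.atanh(x) - math.log((1 + x) / (1 - x)) / 2) < 1e-12
for x in (-2.0, 0.0, 2.0):
    assert abs(math.asinh(x) - math.log(x + math.sqrt(x * x + 1))) < 1e-12
for x in (1.0, 2.0, 10.0):
    assert abs(math.acosh(x) - math.log(x + math.sqrt(x * x - 1))) < 1e-12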
     def hardtanh(self, min_val=-1, max_val=1):
+        """
+        Apply the HardTanh activation function.
+
+        This method applies the HardTanh function to each element in `self`. The HardTanh function is defined as:
+            f(x) = max_val    if x > max_val
+            f(x) = min_val    if x < min_val
+            f(x) = x          otherwise
+
+        Args:
+            min_val (float): The minimum value of the output range. Defaults to -1.
+            max_val (float): The maximum value of the output range. Defaults to 1.
+
+        Returns:
+            Tensor: The transformed tensor after applying the HardTanh function element-wise.
+
+        Attributes:
+            clip (method): A method that clips `self` to the range [min_val, max_val]: elements below min_val are set to min_val, elements above max_val are set to max_val, and elements already inside the range are left unchanged.
+        """
         return self.clip(min_val, max_val)

     def gelu(self):
+        """
+        Apply the Gaussian Error Linear Unit (GELU) activation function.
+
+        This method applies the tanh approximation of the GELU function to each element in `self`:
+            f(x) = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
+
+        Returns:
+            Tensor: The transformed tensor after applying the GELU function element-wise.
+
+        Attributes:
+            tanh (method): A method that applies the Hyperbolic Tangent function to the data in `self`. The Hyperbolic Tangent function is defined as:
+                f(x) = tanh(x)
+        """
         return (
             0.5
             * self

@@ -2680,18 +2890,83 @@ class Tensor:
         )

     def quick_gelu(self):
+        """
+        Apply a faster approximation of the Gaussian Error Linear Unit (GELU) activation function.
+
+        This method applies an approximate GELU function to each element in `self`. The approximation is defined as:
+            f(x) = x * sigmoid(1.702 * x)
+
+        Returns:
+            Tensor: The transformed tensor after applying the approximate GELU function element-wise.
+
+        Attributes:
+            sigmoid (method): A method that applies the Sigmoid function to the data in `self`. The Sigmoid function is defined as:
+                f(x) = 1 / (1 + exp(-x))
+        """
         return self * (self * 1.702).sigmoid()

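A quick comparison of the two GELU variants above (an approximation, not an identity):

import math

sigmoid = lambda v: 1 / (1 + math.exp(-v))
gelu_tanh = lambda x: 0.5 * x * (1 + math.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x ** 3)))
quick_gelu = lambda x: x * sigmoid(1.702 * x)

for x in (-2.0, -0.5, 0.0, 0.5, 2.0):
    assert abs(gelu_tanh(x) - quick_gelu(x)) < 0.05  # close, but only approximately equal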
     def leakyrelu(self, neg_slope=0.01):
+        """
+        Apply the Leaky ReLU activation function.
+
+        This method applies the Leaky ReLU function to each element in `self`. The Leaky ReLU function is defined as:
+            f(x) = max(x, neg_slope * x)
+
+        Args:
+            neg_slope (float): The negative slope parameter for the Leaky ReLU function. Default is 0.01.
+
+        Returns:
+            Tensor: The transformed tensor after applying the Leaky ReLU function element-wise.
+
+        Attributes:
+            relu (method): A method that applies the Rectified Linear Unit (ReLU) function to the data in `self`. The ReLU function is defined as:
+                f(x) = max(0, x)
+        """
         return self.relu() - (-neg_slope * self).relu()

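The two-relu form equals max(x, neg_slope * x) for slopes in [0, 1]; a plain-Python spot check:

relu = lambda v: max(0.0, v)
for x in (-5.0, -1.0, 0.0, 2.0):
    assert relu(x) - relu(-0.01 * x) == max(x, 0.01 * x)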
     def mish(self):
+        """
+        Apply the Mish activation function.
+
+        This method applies the Mish function to each element in `self`. The Mish function is defined as:
+            f(x) = x * tanh(softplus(x))
+
+        Returns:
+            Tensor: The transformed tensor after applying the Mish function element-wise.
+
+        Attributes:
+            softplus (method): A method that applies the Softplus function to the data in `self`. The Softplus function is defined as:
+                f(x) = log(1 + exp(x))
+            tanh (method): A method that applies the hyperbolic tangent function to the data in `self`. The hyperbolic tangent function is defined as:
+                f(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+        """
         return self * self.softplus().tanh()

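Mish behaves like a smooth relu: exactly zero at zero, and close to x for large positive x.

import math

mish = lambda x: x * math.tanh(math.log(1 + math.exp(x)))
assert mish(0.0) == 0.0
assert abs(mish(10.0) - 10.0) < 1e-3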
     def softplus(self, beta=1):
+        """
+        Apply the Softplus function.
+
+        This method applies the Softplus function to each element in `self`. The Softplus function is defined as:
+            f(x) = (1 / beta) * log(1 + exp(beta * x))
+
+        Args:
+            beta (float): The beta parameter for the Softplus function. Default is 1.
+
+        Returns:
+            Tensor: The transformed tensor after applying the Softplus function element-wise.
+        """
         return (1 / beta) * (1 + (self * beta).exp()).log()

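Softplus is a smooth approximation of relu; it tends to x for large x and to 0 for very negative x:

import math

softplus = lambda x, beta=1.0: (1 / beta) * math.log(1 + math.exp(beta * x))
assert abs(softplus(10.0) - 10.0) < 1e-4
assert abs(softplus(-10.0)) < 1e-4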
     def softsign(self):
+        """
+        Apply the Softsign function.
+
+        This method applies the Softsign function to each element in `self`. The Softsign function is defined as:
+            f(x) = x / (1 + |x|)
+
+        Returns:
+            Tensor: The transformed tensor after applying the Softsign function element-wise.
+        """
         return self / (1 + self.abs())

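Like tanh, softsign squashes its input into (-1, 1), just with slower (polynomial) saturation:

softsign = lambda x: x / (1 + abs(x))
assert all(-1 < softsign(x) < 1 for x in (-100.0, -1.0, 0.0, 1.0, 100.0))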
     # ***** broadcasted binary mlops *****