diff --git a/doc/library/tensor/conv.rst b/doc/library/tensor/conv.rst
index 5ee238c265..3d456f28d7 100644
--- a/doc/library/tensor/conv.rst
+++ b/doc/library/tensor/conv.rst
@@ -1,11 +1,68 @@
-=========================================
-:mod:`tensor.conv` -- Tensor Convolutions
-=========================================
+.. _libdoc_tensor_conv:
 
-.. module:: tensor.conv
+==========================================================
+:mod:`conv` -- Ops for convolutional neural nets
+==========================================================
+
+.. module:: conv
    :platform: Unix, Windows
-   :synopsis: Tensor Convolutions
-.. moduleauthor:: LISA, PyMC Developers, PyTensor Developers
+   :synopsis: ops for signal processing
+.. moduleauthor:: LISA
+
+
+The recommended user interfaces are:
+
+- :func:`pytensor.tensor.conv.conv2d` for 2D convolution
+- :func:`pytensor.tensor.conv.conv3d` for 3D convolution
+
+PyTensor will automatically use the fastest implementation in many cases.
+On the CPU, the implementation is a GEMM-based one.
+
+This auto-tuning has the inconvenience that the first call is much
+slower, because it tries and times each implementation it has. If you
+benchmark, it is therefore important to exclude the first call from your
+timing.
+
+Implementation Details
+======================
+
+This section gives more implementation details. Most of the time you do
+not need to read it: PyTensor will select an implementation for you.
+
+
+- Implemented operators for neural network 2D / image convolution:
+  - :func:`conv.conv2d `.
+    The legacy 2D convolution; do not use it in new code.
+
+    For each element in a batch, it first creates a
+    `Toeplitz `_ matrix.
+    Then, it performs a ``gemm`` call to multiply this Toeplitz matrix and the filters
+    (hence the name: MM is for matrix multiplication).
+    It needs extra memory for the Toeplitz matrix, which is a 2D matrix of shape
+    ``(number of channels * filter width * filter height, output width * output height)``.
+
+  - :func:`CorrMM `
+    This is a CPU-only 2D correlation implementation taken from
+    `Caffe's C++ implementation `_.
+    It does not flip the kernel.
+
+- Implemented operators for neural network 3D / video convolution:
+  - :func:`Corr3dMM `
+    This is a CPU-only 3D correlation implementation based on
+    the 2D version (:func:`CorrMM `).
+    It does not flip the kernel. As it provides a gradient, you can use it as a
+    replacement for ``conv3d``. For convolutions run on the CPU,
+    ``conv3d`` will be replaced by ``Corr3dMM``.
+
+  - :func:`conv3d2d `
+    Another ``conv3d`` implementation that reuses ``conv2d`` with data reshaping.
+    It is faster than ``conv3d`` in some corner cases. It flips the kernel.
+
+.. autofunction:: pytensor.tensor.conv.conv2d
+.. autofunction:: pytensor.tensor.conv.conv2d_transpose
+.. autofunction:: pytensor.tensor.conv.conv3d
+.. autofunction:: pytensor.tensor.conv.conv3d2d.conv3d
+.. autofunction:: pytensor.tensor.conv.conv.conv2d
 
-.. automodule:: pytensor.tensor.conv
+.. automodule:: pytensor.tensor.conv.abstract_conv
     :members:
diff --git a/doc/tutorial/conv_arithmetic.rst b/doc/tutorial/conv_arithmetic.rst
new file mode 100644
index 0000000000..14edaf2c0f
--- /dev/null
+++ b/doc/tutorial/conv_arithmetic.rst
@@ -0,0 +1,1080 @@
+.. _conv_arithmetic:
+
+===============================
+Convolution arithmetic tutorial
+===============================
+
+.. note::
+
+    This tutorial is adapted from an existing `convolution arithmetic guide
+    `_ [#]_, with an added emphasis on
+    Aesara's interface.
+ + Also, note that the signal processing community has a different nomenclature + and a well established literature on the topic, but for this tutorial + we will stick to the terms used in the machine learning community. For a + signal processing point of view on the subject, see for instance *Winograd, + Shmuel. Arithmetic complexity of computations. Vol. 33. Siam, 1980*. + +About this tutorial +=================== + +Learning to use convolutional neural networks (CNNs) for the first time is +generally an intimidating experience. A convolutional layer's output shape is +affected by the shape of its input as well as the choice of kernel shape, zero +padding and strides, and the relationship between these properties is not +trivial to infer. This contrasts with fully-connected layers, whose output size +is independent of the input size. Additionally, so-called transposed +convolutional layers (also known as fractionally strided convolutional layers, +or -- wrongly -- as deconvolutions) have been employed in more and more work as +of late, and their relationship with convolutional layers has been explained +with various degrees of clarity. + +The relationship between a convolution operation's input shape, kernel size, +stride, padding and its output shape can be confusing at times. + +The tutorial's objective is threefold: + +* Explain the relationship between convolutional layers and transposed + convolutional layers. +* Provide an intuitive understanding of the relationship between input shape, + kernel shape, zero padding, strides and output shape in convolutional and + transposed convolutional layers. +* Clarify Aesara's API on convolutions. + +Refresher: discrete convolutions +================================ + +The bread and butter of neural networks is *affine transformations*: a +vector is received as input and is multiplied with a matrix to produce an +output (to which a bias vector is usually added before passing the result +through a nonlinearity). This is applicable to any type of input, be it an +image, a sound clip or an unordered collection of features: whatever their +dimensionality, their representation can always be flattened into a vector +before the transformation. + +Images, sound clips and many other similar kinds of data have an intrinsic +structure. More formally, they share these important properties: + +* They are stored as multi-dimensional arrays. +* They feature one or more axes for which ordering matters (e.g., width and + height axes for an image, time axis for a sound clip). +* One axis, called the channel axis, is used to access different views of the + data (e.g., the red, green and blue channels of a color image, or the left + and right channels of a stereo audio track). + +These properties are not exploited when an affine transformation is applied; in +fact, all the axes are treated in the same way and the topological information +is not taken into account. Still, taking advantage of the implicit structure of +the data may prove very handy in solving some tasks, like computer vision and +speech recognition, and in these cases it would be best to preserve it. This is +where discrete convolutions come into play. + +A discrete convolution is a linear transformation that preserves this notion of +ordering. It is sparse (only a few input units contribute to a given output +unit) and reuses parameters (the same weights are applied to multiple locations +in the input). + +Here is an example of a discrete convolution: + +.. 
figure:: conv_arithmetic_figures/numerical_no_padding_no_strides.* + :figclass: align-center + +The light blue grid is called the *input feature map*. A *kernel* (shaded area) +of value + +.. math:: + + \begin{pmatrix} + 0 & 1 & 2 \\ + 2 & 2 & 0 \\ + 0 & 1 & 2 + \end{pmatrix} + +slides across the input feature map. At each location, the product between each +element of the kernel and the input element it overlaps is computed and the +results are summed up to obtain the output in the current location. The final +output of this procedure is a matrix called *output feature map* (in green). + +This procedure can be repeated using different kernels to form as many output +feature maps (a.k.a. *output channels*) as desired. Note also that to keep the +drawing simple a single input feature map is being represented, but it is not +uncommon to have multiple feature maps stacked one onto another (an example of +this is what was referred to earlier as *channels* for images and sound clips). + +.. note:: + + While there is a distinction between convolution and cross-correlation from + a signal processing perspective, the two become interchangeable when the + kernel is learned. For the sake of simplicity and to stay consistent with + most of the machine learning literature, the term *convolution* will be + used in this tutorial. + +If there are multiple input and output feature maps, the collection of kernels +form a 4D array (``output_channels, input_channels, filter_rows, +filter_columns``). For each output channel, each input channel is convolved with +a distinct part of the kernel and the resulting set of feature maps is summed +elementwise to produce the corresponding output feature map. The result of this +procedure is a set of output feature maps, one for each output channel, that is +the output of the convolution. + + +The convolution depicted above is an instance of a 2-D convolution, but can be +generalized to N-D convolutions. For instance, in a 3-D convolution, the kernel +would be a *cuboid* and would slide across the height, width and depth of the +input feature map. + +The collection of kernels defining a discrete convolution has a shape +corresponding to some permutation of :math:`(n, m, k_1, \ldots, k_N)`, where + +.. math:: + + \begin{split} + n &\equiv \text{number of output feature maps},\\ + m &\equiv \text{number of input feature maps},\\ + k_j &\equiv \text{kernel size along axis $j$}. + \end{split} + +The following properties affect the output size :math:`o_j` of a convolutional +layer along axis :math:`j`: + +* :math:`i_j`: input size along axis :math:`j`, +* :math:`k_j`: kernel size along axis :math:`j`, +* :math:`s_j`: stride (distance between two consecutive positions of the + kernel) along axis :math:`j`, +* :math:`p_j`: zero padding (number of zeros concatenated at the beginning and + at the end of an axis) along axis `j`. + +For instance, here is a :math:`3 \times 3` kernel applied to a +:math:`5 \times 5` input padded with a :math:`1 \times 1` border of zeros using +:math:`2 \times 2` strides: + +.. figure:: conv_arithmetic_figures/numerical_padding_strides.* + :figclass: align-center + +The analysis of the relationship between convolutional layer properties is eased +by the fact that they don't interact across axes, i.e., the choice of kernel +size, stride and zero padding along axis :math:`j` only affects the output size +of axis :math:`j`. 
Because of that, this section will focus on the following +simplified setting: + +* 2-D discrete convolutions (:math:`N = 2`), +* square inputs (:math:`i_1 = i_2 = i`), +* square kernel size (:math:`k_1 = k_2 = k`), +* same strides along both axes (:math:`s_1 = s_2 = s`), +* same zero padding along both axes (:math:`p_1 = p_2 = p`). + +This facilitates the analysis and the visualization, but keep in mind that the +results outlined here also generalize to the N-D and non-square cases. + +Aesara terminology +================== + +Aesara has its own terminology, which differs slightly from the convolution +arithmetic guide's. Here's a simple conversion table for the two: + ++------------------+----------------------------------------------------------------------------------------------------+ +| Aesara | Convolution arithmetic | ++==================+====================================================================================================+ +| ``filters`` | 4D collection of kernels | ++------------------+----------------------------------------------------------------------------------------------------+ +| ``input_shape`` | (batch size (``b``), input channels (``c``), input rows (``i1``), input columns (``i2``)) | ++------------------+----------------------------------------------------------------------------------------------------+ +| ``filter_shape`` | (output channels (``c1``), input channels (``c2``), filter rows (``k1``), filter columns (``k2``)) | ++------------------+----------------------------------------------------------------------------------------------------+ +| ``border_mode`` | ``'valid'``, ``'half'``, ``'full'`` or (:math:`p_1`, :math:`p_2`) | ++------------------+----------------------------------------------------------------------------------------------------+ +| ``subsample`` | (``s1``, ``s2``) | ++------------------+----------------------------------------------------------------------------------------------------+ + +For instance, the convolution shown above would correspond to the following +Aesara call: + +.. code-block:: python + + output = aesara.tensor.nnet.conv2d( + input, filters, input_shape=(1, 1, 5, 5), filter_shape=(1, 1, 3, 3), + border_mode=(1, 1), subsample=(2, 2)) + +Convolution arithmetic +====================== + +No zero padding, unit strides +----------------------------- + +The simplest case to analyze is when the kernel just slides across every +position of the input (i.e., :math:`s = 1` and :math:`p = 0`). +Here is an example for :math:`i = 4` and :math:`k = 3`: + +.. figure:: conv_arithmetic_figures/no_padding_no_strides.* + :figclass: align-center + +One way of defining the output size in this case is by the number of possible +placements of the kernel on the input. Let's consider the width axis: the kernel +starts on the leftmost part of the input feature map and slides by steps of one +until it touches the right side of the input. The size of the output will be +equal to the number of steps made, plus one, accounting for the initial position +of the kernel. The same logic applies for the height axis. + +More formally, the following relationship can be inferred: + +.. admonition:: Relationship 1 + + For any :math:`i` and :math:`k`, and for :math:`s = 1` and :math:`p = 0`, + + .. math:: + + o = (i - k) + 1. + + This translates to the following Aesara code: + + .. 
code-block:: python + + output = aesara.tensor.nnet.conv2d( + input, filters, input_shape=(b, c2, i1, i2), filter_shape=(c1, c2, k1, k2), + border_mode=(0, 0), subsample=(1, 1)) + # output.shape[2] == (i1 - k1) + 1 + # output.shape[3] == (i2 - k2) + 1 + +Zero padding, unit strides +-------------------------- + +To factor in zero padding (i.e., only restricting to :math:`s = 1`), let's +consider its effect on the effective input size: padding with :math:`p` zeros +changes the effective input size from :math:`i` to :math:`i + 2p`. In the +general case, Relationship 1 can then be used to infer the following +relationship: + +.. admonition:: Relationship 2 + + For any :math:`i`, :math:`k` and :math:`p`, and for :math:`s = 1`, + + .. math:: + + o = (i - k) + 2p + 1. + + This translates to the following Aesara code: + + .. code-block:: python + + output = aesara.tensor.nnet.conv2d( + input, filters, input_shape=(b, c2, i1, i2), filter_shape=(c1, c2, k1, k2), + border_mode=(p1, p2), subsample=(1, 1)) + # output.shape[2] == (i1 - k1) + 2 * p1 + 1 + # output.shape[3] == (i2 - k2) + 2 * p2 + 1 + +Here is an example for :math:`i = 5`, :math:`k = 4` and :math:`p = 2`: + +.. figure:: conv_arithmetic_figures/arbitrary_padding_no_strides.* + :figclass: align-center + +Special cases +------------- + +In practice, two specific instances of zero padding are used quite extensively +because of their respective properties. Let's discuss them in more detail. + +Half (same) padding +^^^^^^^^^^^^^^^^^^^ +Having the output size be the same as the input size (i.e., :math:`o = i`) can +be a desirable property: + +.. admonition:: Relationship 3 + + For any :math:`i` and for :math:`k` odd (:math:`k = 2n + 1, \quad n \in + \mathbb{N}`), :math:`s = 1` and :math:`p = \lfloor k / 2 \rfloor = n`, + + .. math:: + + \begin{split} + o &= i + 2 \lfloor k / 2 \rfloor - (k - 1) \\ + &= i + 2n - 2n \\ + &= i. + \end{split} + + This translates to the following Aesara code: + + .. code-block:: python + + output = aesara.tensor.nnet.conv2d( + input, filters, input_shape=(b, c2, i1, i2), filter_shape=(c1, c2, k1, k2), + border_mode='half', subsample=(1, 1)) + # output.shape[2] == i1 + # output.shape[3] == i2 + +This is sometimes referred to as *half* (or *same*) padding. Here is an example +for :math:`i = 5`, :math:`k = 3` and (therefore) :math:`p = 1`: + +.. figure:: conv_arithmetic_figures/same_padding_no_strides.* + :figclass: align-center + +Note that half padding also works for even-valued :math:`k` and for :math:`s > +1`, but in that case the property that the output size is the same as the input +size is lost. Some frameworks also implement the ``same`` convolution slightly +differently (e.g., in Keras :math:`o = (i + s - 1) // s`). + +Full padding +^^^^^^^^^^^^ + +While convolving a kernel generally *decreases* the output size with respect to +the input size, sometimes the opposite is required. This can be achieved with +proper zero padding: + +.. admonition:: Relationship 4 + + For any :math:`i` and :math:`k`, and for :math:`p = k - 1` and + :math:`s = 1`, + + .. math:: + + \begin{split} + o &= i + 2(k - 1) - (k - 1) \\ + &= i + (k - 1). + \end{split} + + This translates to the following Aesara code: + + .. 
code-block:: python + + output = aesara.tensor.nnet.conv2d( + input, filters, input_shape=(b, c2, i1, i2), filter_shape=(c1, c2, k1, k2), + border_mode='full', subsample=(1, 1)) + # output.shape[2] == i1 + (k1 - 1) + # output.shape[3] == i2 + (k2 - 1) + +This is sometimes referred to as *full* padding, because in this setting every +possible partial or complete superimposition of the kernel on the input feature +map is taken into account. Here is an example for :math:`i = 5`, :math:`k = 3` +and (therefore) :math:`p = 2`: + +.. figure:: conv_arithmetic_figures/full_padding_no_strides.* + :figclass: align-center + +No zero padding, non-unit strides +--------------------------------- + +All relationships derived so far only apply for unit-strided convolutions. +Incorporating non unitary strides requires another inference leap. To facilitate +the analysis, let's momentarily ignore zero padding (i.e., :math:`s > 1` and +:math:`p = 0`). Here is an example for :math:`i = 5`, :math:`k = 3` and :math:`s += 2`: + +.. figure:: conv_arithmetic_figures/no_padding_strides.* + :figclass: align-center + +Once again, the output size can be defined in terms of the number of possible +placements of the kernel on the input. Let's consider the width axis: the kernel +starts as usual on the leftmost part of the input, but this time it slides by +steps of size :math:`s` until it touches the right side of the input. The size +of the output is again equal to the number of steps made, plus one, accounting +for the initial position of the kernel. The same logic applies for the height +axis. + +From this, the following relationship can be inferred: + +.. admonition:: Relationship 5 + + For any :math:`i`, :math:`k` and :math:`s`, and for :math:`p = 0`, + + .. math:: + + o = \left\lfloor \frac{i - k}{s} \right\rfloor + 1. + + This translates to the following Aesara code: + + .. code-block:: python + + output = aesara.tensor.nnet.conv2d( + input, filters, input_shape=(b, c2, i1, i2), filter_shape=(c1, c2, k1, k2), + border_mode=(0, 0), subsample=(s1, s2)) + # output.shape[2] == (i1 - k1) // s1 + 1 + # output.shape[3] == (i2 - k2) // s2 + 1 + +The floor function accounts for the fact that sometimes the last +possible step does *not* coincide with the kernel reaching the end of the +input, i.e., some input units are left out. + +Zero padding, non-unit strides +------------------------------ + +The most general case (convolving over a zero padded input using non-unit +strides) can be derived by applying Relationship 5 on an effective input of size +:math:`i + 2p`, in analogy to what was done for Relationship 2: + +.. admonition:: Relationship 6 + + For any :math:`i`, :math:`k`, :math:`p` and :math:`s`, + + .. math:: + + o = \left\lfloor \frac{i + 2p - k}{s} \right\rfloor + 1. + + This translates to the following Aesara code: + + .. code-block:: python + + output = aesara.tensor.nnet.conv2d( + input, filters, input_shape=(b, c2, i1, i2), filter_shape=(c1, c2, k1, k2), + border_mode=(p1, p2), subsample=(s1, s2)) + # output.shape[2] == (i1 - k1 + 2 * p1) // s1 + 1 + # output.shape[3] == (i2 - k2 + 2 * p2) // s2 + 1 + +As before, the floor function means that in some cases a convolution will +produce the same output size for multiple input sizes. More specifically, if +:math:`i + 2p - k` is a multiple of :math:`s`, then any input size :math:`j = i ++ a, \quad a \in \{0,\ldots,s - 1\}` will produce the same output size. Note +that this ambiguity applies only for :math:`s > 1`. 
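+
+As a quick sanity check, here is a minimal plain-Python sketch (it is not part
+of Aesara's API; the helper name ``conv_output_size`` is only illustrative)
+that applies Relationship 6 and shows how two different input sizes can map to
+the same output size:
+
+.. code-block:: python
+
+    def conv_output_size(i, k, s, p):
+        # Relationship 6: o = floor((i + 2p - k) / s) + 1
+        return (i + 2 * p - k) // s + 1
+
+    # With k = 3, s = 2 and p = 1, inputs of size 5 and 6 give the same output size.
+    assert conv_output_size(5, 3, 2, 1) == 3
+    assert conv_output_size(6, 3, 2, 1) == 3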
+ +Here is an example for :math:`i = 5`, :math:`k = 3`, :math:`s = 2` and :math:`p += 1`: + +.. figure:: conv_arithmetic_figures/padding_strides.* + :figclass: align-center + +Here is an example for :math:`i = 6`, :math:`k = 3`, :math:`s = 2` and :math:`p += 1`: + +.. figure:: conv_arithmetic_figures/padding_strides_odd.* + :figclass: align-center + +Interestingly, despite having different input sizes these convolutions share the +same output size. While this doesn't affect the analysis for *convolutions*, +this will complicate the analysis in the case of *transposed convolutions*. + +Transposed convolution arithmetic +================================= + +The need for transposed convolutions generally arises from the desire to use a +transformation going in the opposite direction of a normal convolution, i.e., +from something that has the shape of the output of some convolution to +something that has the shape of its input while maintaining a connectivity +pattern that is compatible with said convolution. For instance, one might use +such a transformation as the decoding layer of a convolutional autoencoder or to +project feature maps to a higher-dimensional space. + +Once again, the convolutional case is considerably more complex than the +fully-connected case, which only requires to use a weight matrix whose shape +has been transposed. However, since every convolution boils down to an +efficient implementation of a matrix operation, the insights gained from the +fully-connected case are useful in solving the convolutional case. + +Like for convolution arithmetic, the dissertation about transposed convolution +arithmetic is simplified by the fact that transposed convolution properties +don't interact across axes. + +This section will focus on the following setting: + +* 2-D transposed convolutions (:math:`N = 2`), +* square inputs (:math:`i_1 = i_2 = i`), +* square kernel size (:math:`k_1 = k_2 = k`), +* same strides along both axes (:math:`s_1 = s_2 = s`), +* same zero padding along both axes (:math:`p_1 = p_2 = p`). + +Once again, the results outlined generalize to the N-D and non-square cases. + +Convolution as a matrix operation +--------------------------------- + +Take for example the convolution presented in the *No zero padding, unit +strides* subsection: + +.. figure:: conv_arithmetic_figures/no_padding_no_strides.* + :figclass: align-center + +If the input and output were to be unrolled into vectors from left to right, top +to bottom, the convolution could be represented as a sparse matrix +:math:`\mathbf{C}` where the non-zero elements are the elements :math:`w_{i,j}` +of the kernel (with :math:`i` and :math:`j` being the row and column of the +kernel respectively): + +.. math:: + + \begin{pmatrix} + w_{0,0} & 0 & 0 & 0 \\ + w_{0,1} & w_{0,0} & 0 & 0 \\ + w_{0,2} & w_{0,1} & 0 & 0 \\ + 0 & w_{0,2} & 0 & 0 \\ + w_{1,0} & 0 & w_{0,0} & 0 \\ + w_{1,1} & w_{1,0} & w_{0,1} & w_{0,0} \\ + w_{1,2} & w_{1,1} & w_{0,2} & w_{0,1} \\ + 0 & w_{1,2} & 0 & w_{0,2} \\ + w_{2,0} & 0 & w_{1,0} & 0 \\ + w_{2,1} & w_{2,0} & w_{1,1} & w_{1,0} \\ + w_{2,2} & w_{2,1} & w_{1,2} & w_{1,1} \\ + 0 & w_{2,2} & 0 & w_{1,2} \\ + 0 & 0 & w_{2,0} & 0 \\ + 0 & 0 & w_{2,1} & w_{2,0} \\ + 0 & 0 & w_{2,2} & w_{2,1} \\ + 0 & 0 & 0 & w_{2,2} \\ + \end{pmatrix}^T + +(*Note: the matrix has been transposed for formatting purposes.*) This linear +operation takes the input matrix flattened as a 16-dimensional vector and +produces a 4-dimensional vector that is later reshaped as the :math:`2 \times 2` +output matrix. 
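+
+To make this unrolled view concrete, here is a small NumPy sketch (purely
+illustrative and not Aesara code; the helper name ``conv_matrix`` is chosen
+here for the example) that builds :math:`\mathbf{C}` for a :math:`3 \times 3`
+kernel sliding over a :math:`4 \times 4` input and checks it against the
+direct computation:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def conv_matrix(kernel, input_shape):
+        # Unroll a 'valid', unit-stride 2D convolution (as drawn above) into a
+        # dense matrix; real implementations would exploit its sparsity.
+        k1, k2 = kernel.shape
+        i1, i2 = input_shape
+        o1, o2 = i1 - k1 + 1, i2 - k2 + 1
+        C = np.zeros((o1 * o2, i1 * i2))
+        for r in range(o1):
+            for c in range(o2):
+                patch = np.zeros(input_shape)
+                patch[r:r + k1, c:c + k2] = kernel
+                C[r * o2 + c] = patch.ravel()
+        return C
+
+    x = np.arange(16.0).reshape(4, 4)
+    w = np.arange(9.0).reshape(3, 3)
+    C = conv_matrix(w, x.shape)  # shape (4, 16)
+
+    direct = np.array([[(x[r:r + 3, c:c + 3] * w).sum() for c in range(2)]
+                       for r in range(2)])
+    assert np.allclose((C @ x.ravel()).reshape(2, 2), direct)
+
+    # Multiplying by C.T maps a 4-dimensional vector back to a 16-dimensional one.
+    assert (C.T @ np.ones(4)).shape == (16,)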
+ +Using this representation, the backward pass is easily obtained by transposing +:math:`\mathbf{C}`; in other words, the error is backpropagated by multiplying +the loss with :math:`\mathbf{C}^T`. This operation takes a 4-dimensional vector +as input and produces a 16-dimensional vector as output, and its connectivity +pattern is compatible with :math:`\mathbf{C}` by construction. + +Notably, the kernel :math:`\mathbf{w}` defines both the matrices +:math:`\mathbf{C}` and :math:`\mathbf{C}^T` used for the forward and backward +passes. + +Transposed convolution +---------------------- + +Let's now consider what would be required to go the other way around, i.e., map +from a 4-dimensional space to a 16-dimensional space, while keeping the +connectivity pattern of the convolution depicted above. This operation is known +as a *transposed convolution*. + +Transposed convolutions -- also called *fractionally strided convolutions* -- +work by swapping the forward and backward passes of a convolution. One way to +put it is to note that the kernel defines a convolution, but whether it's a +direct convolution or a transposed convolution is determined by how the forward +and backward passes are computed. + +For instance, the kernel :math:`\mathbf{w}` defines a convolution whose forward +and backward passes are computed by multiplying with :math:`\mathbf{C}` and +:math:`\mathbf{C}^T` respectively, but it *also* defines a transposed +convolution whose forward and backward passes are computed by multiplying with +:math:`\mathbf{C}^T` and :math:`(\mathbf{C}^T)^T = \mathbf{C}` respectively. + +.. note:: + + The transposed convolution operation can be thought of as the gradient of + *some* convolution with respect to its input, which is usually how + transposed convolutions are implemented in practice. + + Finally note that it is always possible to implement a transposed + convolution with a direct convolution. The disadvantage is that it usually + involves adding many columns and rows of zeros to the input, resulting in a + much less efficient implementation. + +Building on what has been introduced so far, this section will proceed somewhat +backwards with respect to the convolution arithmetic section, deriving the +properties of each transposed convolution by referring to the direct +convolution with which it shares the kernel, and defining the equivalent direct +convolution. + +No zero padding, unit strides, transposed +----------------------------------------- + +The simplest way to think about a transposed convolution is by computing the +output shape of the direct convolution for a given input shape first, and then +inverting the input and output shapes for the transposed convolution. + +Let's consider the convolution of a :math:`3 \times 3` kernel on a :math:`4 +\times 4` input with unitary stride and no padding (i.e., :math:`i = 4`, +:math:`k = 3`, :math:`s = 1` and :math:`p = 0`). As depicted in the convolution +below, this produces a :math:`2 \times 2` output: + +.. figure:: conv_arithmetic_figures/no_padding_no_strides.* + :figclass: align-center + +The transpose of this convolution will then have an output of shape :math:`4 +\times 4` when applied on a :math:`2 \times 2` input. + +Another way to obtain the result of a transposed convolution is to apply an +equivalent -- but much less efficient -- direct convolution. 
The example +described so far could be tackled by convolving a :math:`3 \times 3` kernel over +a :math:`2 \times 2` input padded with a :math:`2 \times 2` border of zeros +using unit strides (i.e., :math:`i' = 2`, :math:`k' = k`, :math:`s' = 1` and +:math:`p' = 2`), as shown here: + +.. figure:: conv_arithmetic_figures/no_padding_no_strides_transposed.* + :figclass: align-center + +Notably, the kernel's and stride's sizes remain the same, but the input of the +equivalent (direct) convolution is now zero padded. + +.. note:: + + Although equivalent to applying the transposed matrix, this visualization + adds a lot of zero multiplications in the form of zero padding. This is done + here for illustration purposes, but it is inefficient, and software + implementations will normally not perform the useless zero multiplications. + +One way to understand the logic behind zero padding is to consider the +connectivity pattern of the transposed convolution and use it to guide the +design of the equivalent convolution. For example, the top left pixel of the +input of the direct convolution only contribute to the top left pixel of the +output, the top right pixel is only connected to the top right output pixel, +and so on. + +To maintain the same connectivity pattern in the equivalent convolution it is +necessary to zero pad the input in such a way that the first (top-left) +application of the kernel only touches the top-left pixel, i.e., the padding +has to be equal to the size of the kernel minus one. + +Proceeding in the same fashion it is possible to determine similar observations +for the other elements of the image, giving rise to the following relationship: + +.. admonition:: Relationship 7 + + A convolution described by :math:`s = 1`, :math:`p = 0` and :math:`k` has an + associated transposed convolution described by :math:`k' = k`, :math:`s' = + s` and :math:`p' = k - 1` and its output size is + + .. math:: + + o' = i' + (k - 1). + + In other words, + + .. code-block:: python + + input = aesara.tensor.nnet.abstract_conv.conv2d_grad_wrt_inputs( + output, filters, filter_shape=(c1, c2, k1, k2), border_mode=(0, 0), + subsample=(1, 1)) + # input.shape[2] == output.shape[2] + (k1 - 1) + # input.shape[3] == output.shape[3] + (k2 - 1) + +Interestingly, this corresponds to a fully padded convolution with unit strides. + +Zero padding, unit strides, transposed +-------------------------------------- + +Knowing that the transpose of a non-padded convolution is equivalent to +convolving a zero padded input, it would be reasonable to suppose that the +transpose of a zero padded convolution is equivalent to convolving an input +padded with *less* zeros. + +It is indeed the case, as shown in here for :math:`i = 5`, :math:`k = 4` and +:math:`p = 2`: + +.. figure:: conv_arithmetic_figures/arbitrary_padding_no_strides_transposed.* + :figclass: align-center + +Formally, the following relationship applies for zero padded convolutions: + +.. _Relationship8: + +.. admonition:: Relationship 8 + + A convolution described by :math:`s = 1`, :math:`k` and :math:`p` has an + associated transposed convolution described by :math:`k' = k`, :math:`s' = + s` and :math:`p' = k - p - 1` and its output size is + + .. math:: + + o' = i' + (k - 1) - 2p. + + In other words, + + .. 
code-block:: python + + input = aesara.tensor.nnet.abstract_conv.conv2d_grad_wrt_inputs( + output, filters, filter_shape=(c1, c2, k1, k2), border_mode=(p1, p2), + subsample=(1, 1)) + # input.shape[2] == output.shape[2] + (k1 - 1) - 2 * p1 + # input.shape[3] == output.shape[3] + (k2 - 1) - 2 * p2 + +Special cases +------------- + +Half (same) padding, transposed +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By applying the same inductive reasoning as before, it is reasonable to expect +that the equivalent convolution of the transpose of a half padded convolution +is itself a half padded convolution, given that the output size of a half +padded convolution is the same as its input size. Thus the following relation +applies: + +.. admonition:: Relationship 9 + + A convolution described by :math:`k = 2n + 1, \quad n \in \mathbb{N}`, + :math:`s = 1` and :math:`p = \lfloor k / 2 \rfloor = n` has an associated + transposed convolution described by :math:`k' = k`, :math:`s' = s` and + :math:`p' = p` and its output size is + + .. math:: + + \begin{split} + o' &= i' + (k - 1) - 2p \\ + &= i' + 2n - 2n \\ + &= i'. + \end{split} + + In other words, + + .. code-block:: python + + input = aesara.tensor.nnet.abstract_conv.conv2d_grad_wrt_inputs( + output, filters, filter_shape=(c1, c2, k1, k2), border_mode='half', + subsample=(1, 1)) + # input.shape[2] == output.shape[2] + # input.shape[3] == output.shape[3] + +Here is an example for :math:`i = 5`, :math:`k = 3` and (therefore) :math:`p = +1`: + +.. figure:: conv_arithmetic_figures/same_padding_no_strides_transposed.* + :figclass: align-center + +Full padding, transposed +^^^^^^^^^^^^^^^^^^^^^^^^ + +Knowing that the equivalent convolution of the transpose of a non-padded +convolution involves full padding, it is unsurprising that the equivalent of +the transpose of a fully padded convolution is a non-padded convolution: + +.. admonition:: Relationship 10 + + A convolution described by :math:`s = 1`, :math:`k` and :math:`p = k - 1` + has an associated transposed convolution described by :math:`k' = k`, + :math:`s' = s` and :math:`p' = 0` and its output size is + + .. math:: + + \begin{split} + o' &= i' + (k - 1) - 2p \\ + &= i' - (k - 1) + \end{split} + + In other words, + + .. code-block:: python + + input = aesara.tensor.nnet.abstract_conv.conv2d_grad_wrt_inputs( + output, filters, filter_shape=(c1, c2, k1, k2), border_mode='full', + subsample=(1, 1)) + # input.shape[2] == output.shape[2] - (k1 - 1) + # input.shape[3] == output.shape[3] - (k2 - 1) + +Here is an example for :math:`i = 5`, :math:`k = 3` and (therefore) :math:`p = +2`: + +.. figure:: conv_arithmetic_figures/full_padding_no_strides_transposed.* + :figclass: align-center + +No zero padding, non-unit strides, transposed +--------------------------------------------- + +Using the same kind of inductive logic as for zero padded convolutions, one +might expect that the transpose of a convolution with :math:`s > 1` involves an +equivalent convolution with :math:`s < 1`. As will be explained, this is a valid +intuition, which is why transposed convolutions are sometimes called +*fractionally strided convolutions*. + +Here is an example for :math:`i = 5`, :math:`k = 3` and :math:`s = 2`: + +.. figure:: conv_arithmetic_figures/no_padding_strides_transposed.* + :figclass: align-center + +This should help understand what fractional strides involve: zeros +are inserted *between* input units, which makes the kernel move around at +a slower pace than with unit strides. + +.. 
note:: + + Doing so is inefficient and real-world implementations avoid useless + multiplications by zero, but conceptually it is how the transpose of a + strided convolution can be thought of. + +For the moment, it will be assumed that the convolution is non-padded (:math:`p += 0`) and that its input size :math:`i` is such that :math:`i - k` is a multiple +of :math:`s`. In that case, the following relationship holds: + +.. _Relationship11: + +.. admonition:: Relationship 11 + + A convolution described by :math:`p = 0`, :math:`k` and :math:`s` and whose + input size is such that :math:`i - k` is a multiple of :math:`s`, has an + associated transposed convolution described by :math:`\tilde{i}'`, :math:`k' + = k`, :math:`s' = 1` and :math:`p' = k - 1`, where :math:`\tilde{i}'` is the + size of the stretched input obtained by adding :math:`s - 1` zeros between + each input unit, and its output size is + + .. math:: + + o' = s (i' - 1) + k. + + In other words, + + .. code-block:: python + + input = aesara.tensor.nnet.abstract_conv.conv2d_grad_wrt_inputs( + output, filters, filter_shape=(c1, c2, k1, k2), border_mode=(0, 0), + subsample=(s1, s2)) + # input.shape[2] == s1 * (output.shape[2] - 1) + k1 + # input.shape[3] == s2 * (output.shape[3] - 1) + k2 + +Zero padding, non-unit strides, transposed +------------------------------------------ + +When the convolution's input size :math:`i` is such that :math:`i + 2p - k` is a +multiple of :math:`s`, the analysis can extended to the zero padded case by +combining :ref:`Relationship 8 ` and +:ref:`Relationship 11 `: + +.. admonition:: Relationship 12 + + A convolution described by :math:`k`, :math:`s` and :math:`p` and whose + input size :math:`i` is such that :math:`i + 2p - k` is a multiple of + :math:`s` has an associated transposed convolution described by + :math:`\tilde{i}'`, :math:`k' = k`, :math:`s' = 1` and :math:`p' = k - p - + 1`, where :math:`\tilde{i}'` is the size of the stretched input obtained by + adding :math:`s - 1` zeros between each input unit, and its output size is + + .. math:: + + o' = s (i' - 1) + k - 2p. + + In other words, + + .. code-block:: python + + o_prime1 = s1 * (output.shape[2] - 1) + k1 - 2 * p1 + o_prime2 = s2 * (output.shape[3] - 1) + k2 - 2 * p2 + input = aesara.tensor.nnet.abstract_conv.conv2d_grad_wrt_inputs( + output, filters, input_shape=(b, c1, o_prime1, o_prime2), + filter_shape=(c1, c2, k1, k2), border_mode=(p1, p2), + subsample=(s1, s2)) + +Here is an example for :math:`i = 5`, :math:`k = 3`, :math:`s = 2` and :math:`p += 1`: + +.. figure:: conv_arithmetic_figures/padding_strides_transposed.* + :figclass: align-center + +The constraint on the size of the input :math:`i` can be relaxed by introducing +another parameter :math:`a \in \{0, \ldots, s - 1\}` that allows to distinguish +between the :math:`s` different cases that all lead to the same :math:`i'`: + +.. admonition:: Relationship 13 + + A convolution described by :math:`k`, :math:`s` and :math:`p` has an + associated transposed convolution described by :math:`a`, + :math:`\tilde{i}'`, :math:`k' = k`, :math:`s' = 1` and :math:`p' = k - p - + 1`, where :math:`\tilde{i}'` is the size of the stretched input obtained by + adding :math:`s - 1` zeros between each input unit, and :math:`a = (i + 2p - + k) \mod s` represents the number of zeros added to the top and right edges + of the input, and its output size is + + .. math:: + + o' = s (i' - 1) + a + k - 2p. + + In other words, + + .. 
code-block:: python + + o_prime1 = s1 * (output.shape[2] - 1) + a1 + k1 - 2 * p1 + o_prime2 = s2 * (output.shape[3] - 1) + a2 + k2 - 2 * p2 + input = aesara.tensor.nnet.abstract_conv.conv2d_grad_wrt_inputs( + output, filters, input_shape=(b, c1, o_prime1, o_prime2), + filter_shape=(c1, c2, k1, k2), border_mode=(p1, p2), + subsample=(s1, s2)) + +Here is an example for :math:`i = 6`, :math:`k = 3`, :math:`s = 2` and :math:`p += 1`: + +.. figure:: conv_arithmetic_figures/padding_strides_odd_transposed.* + :figclass: align-center + +Miscellaneous convolutions +========================== + +Dilated convolutions +-------------------- + +Those familiar with the deep learning literature may have noticed the term +"dilated convolutions" (or "atrous convolutions", from the French expression +*convolutions à trous*) appear in recent papers. Here we attempt to provide an +intuitive understanding of dilated convolutions. For a more in-depth description +and to understand in what contexts they are applied, see `Chen et al. (2014) +`_ [#]_; `Yu and Koltun (2015) +`_ [#]_. + +Dilated convolutions "inflate" the kernel by inserting spaces between the kernel +elements. The dilation "rate" is controlled by an additional hyperparameter +:math:`d`. Implementations may vary, but there are usually :math:`d - 1` spaces +inserted between kernel elements such that :math:`d = 1` corresponds to a +regular convolution. + +To understand the relationship tying the dilation rate :math:`d` and the output +size :math:`o`, it is useful to think of the impact of :math:`d` on the +*effective kernel size*. A kernel of size :math:`k` dilated by a factor +:math:`d` has an effective size + +.. math:: + + \hat{k} = k + (k - 1)(d - 1). + +This can be combined with Relationship 6 to form the following relationship for +dilated convolutions: + +.. admonition:: Relationship 14 + + For any :math:`i`, :math:`k`, :math:`p` and :math:`s`, and for a dilation + rate :math:`d`, + + .. math:: + + o = \left\lfloor \frac{i + 2p - k - (k - 1)(d - 1)}{s} \right\rfloor + 1. + + This translates to the following Aesara code using the ``filter_dilation`` + parameter: + + .. code-block:: python + + output = aesara.tensor.nnet.conv2d( + input, filters, input_shape=(b, c2, i1, i2), filter_shape=(c1, c2, k1, k2), + border_mode=(p1, p2), subsample=(s1, s2), filter_dilation=(d1, d2)) + # output.shape[2] == (i1 + 2 * p1 - k1 - (k1 - 1) * (d1 - 1)) // s1 + 1 + # output.shape[3] == (i2 + 2 * p2 - k2 - (k2 - 1) * (d2 - 1)) // s2 + 1 + +Here is an example for :math:`i = 7`, :math:`k = 3`, :math:`d = 2`, :math:`s = +1` and :math:`p = 0`: + +.. figure:: conv_arithmetic_figures/dilation.* + :figclass: align-center + +.. [#] Dumoulin, Vincent, and Visin, Francesco. "A guide to convolution + arithmetic for deep learning". arXiv preprint arXiv:1603.07285 (2016) +.. [#] Chen, Liang-Chieh, Papandreou, George, Kokkinos, Iasonas, Murphy, Kevin + and Yuille, Alan L. "Semantic image segmentation with deep convolutional + nets and fully connected CRFs". arXiv preprint arXiv:1412.7062 (2014). +.. [#] Yu, Fisher and Koltun, Vladlen. "Multi-scale context aggregation by + dilated convolutions". arXiv preprint arXiv:1511.07122 (2015) + +Grouped Convolutions +-------------------- + +In grouped convolutions with :math:`n` number of groups, the input and kernel +are split by their channels to form :math:`n` distinct groups. Each group +performs convolutions independent of the other groups to give :math:`n` +different outputs. 
These individual outputs are then concatenated to form
+the final output. A few examples of works using grouped convolutions are
+`Krizhevsky et al. (2012) `_ [#]_ and `Xie et al. (2016) `_ [#]_.
+
+A special case of grouped convolution is when :math:`n` equals the number of input
+channels. This is called a depth-wise (or channel-wise) convolution.
+Depth-wise convolutions also form one part of separable convolutions.
+
+An example of using grouped convolutions:
+
+  .. code-block:: python
+
+    output = aesara.tensor.nnet.conv2d(
+        input, filters, input_shape=(b, c2, i1, i2), filter_shape=(c1, c2 // n, k1, k2),
+        border_mode=(p1, p2), subsample=(s1, s2), filter_dilation=(d1, d2), num_groups=n)
+    # output.shape[0] == b
+    # output.shape[1] == c1
+    # output.shape[2] == (i1 + 2 * p1 - k1 - (k1 - 1) * (d1 - 1)) // s1 + 1
+    # output.shape[3] == (i2 + 2 * p2 - k2 - (k2 - 1) * (d2 - 1)) // s2 + 1
+
+.. [#] Alex Krizhevsky, Ilya Sutskever, Geoffrey E. Hinton. "ImageNet
+   Classification with Deep Convolutional Neural Networks".
+   Advances in Neural Information Processing Systems 25 (NIPS 2012)
+.. [#] Saining Xie, Ross Girshick, Piotr Dollár, Zhuowen Tu, Kaiming He.
+   "Aggregated Residual Transformations for Deep Neural Networks".
+   arXiv preprint arXiv:1611.05431 (2016).
+
+Separable Convolutions
+----------------------
+
+A separable convolution consists of two consecutive convolution operations.
+The first is a depth-wise convolution, which convolves each channel of the
+input separately. Its output is then given as input to a point-wise
+convolution, a special case of the general convolution with :math:`1 \times 1`
+filters, which mixes the channels to give the final output.
+
+As shown in the diagram below, modified from `Vanhoucke(2014)`_ [#]_, the
+depth-wise convolution is performed with :math:`c2` single-channel depth-wise
+filters, giving an intermediate output with :math:`c2` channels: each of the
+:math:`n` input channels is convolved with its own kernels and contributes
+:math:`c2 / n` channels to the intermediate output, where :math:`n` is the
+number of input channels. The intermediate output then goes through a
+point-wise convolution with :math:`c3` :math:`1 \times 1` filters, which mixes
+its channels to produce the final output.
+
+.. image:: conv_arithmetic_figures/sep2D.jpg
+   :align: center
+
+Separable convolutions are used as follows:
+
+  .. code-block:: python
+
+    output = aesara.tensor.nnet.separable_conv2d(
+        input, depthwise_filters, pointwise_filters, num_channels=c1,
+        input_shape=(b, c1, i1, i2), depthwise_filter_shape=(c2, 1, k1, k2),
+        pointwise_filter_shape=(c3, c2, 1, 1), border_mode=(p1, p2),
+        subsample=(s1, s2), filter_dilation=(d1, d2))
+    # output.shape[0] == b
+    # output.shape[1] == c3
+    # output.shape[2] == (i1 + 2 * p1 - k1 - (k1 - 1) * (d1 - 1)) // s1 + 1
+    # output.shape[3] == (i2 + 2 * p2 - k2 - (k2 - 1) * (d2 - 1)) // s2 + 1
+
+.. _Vanhoucke(2014):
+    http://scholar.google.co.in/scholar_url?url=http://vincent.vanhoucke.com/
+    publications/vanhoucke-iclr14.pdf&hl=en&sa=X&scisig=AAGBfm0x0bgnudAqSVgZ
+    ALfu8uPjYOIWwQ&nossl=1&oi=scholarr&ved=0ahUKEwjLreLjr_DVAhULwI8KHWmHAM8QgAMIJigAMAA
+
+.. [#] Vincent Vanhoucke. "Learning Visual Representations at Scale",
+   International Conference on Learning Representations (2014).
+
+
+Quick reference
+===============
+
+.. 
admonition:: Convolution relationship + + A convolution specified by + + * input size :math:`i`, + * kernel size :math:`k`, + * stride :math:`s`, + * padding size :math:`p`, + + has an output size given by + + .. math:: + + o = \left\lfloor \frac{i + 2p - k}{s} \right\rfloor + 1. + + In Aesara, this translates to + + .. code-block:: python + + output = aesara.tensor.nnet.conv2d( + input, filters, input_shape=(b, c2, i1, i2), filter_shape=(c1, c2, k1, k2), + border_mode=(p1, p2), subsample=(s1, s2)) + # output.shape[2] == (i1 + 2 * p1 - k1) // s1 + 1 + # output.shape[3] == (i2 + 2 * p2 - k2) // s2 + 1 + +.. admonition:: Transposed convolution relationship + + A transposed convolution specified by + + * input size :math:`i`, + * kernel size :math:`k`, + * stride :math:`s`, + * padding size :math:`p`, + + has an output size given by + + .. math:: + + o = s (i - 1) + a + k - 2p, \quad a \in \{0, \ldots, s - 1\} + + where :math:`a` is a user-specified quantity used to distinguish between the + :math:`s` different possible output sizes. + + Unless :math:`s = 1`, Aesara requires that :math:`a` is implicitly passed + via an ``input_shape`` argument. For instance, if :math:`i = 3`, + :math:`k = 4`, :math:`s = 2`, :math:`p = 0` and :math:`a = 1`, then + :math:`o = 2 (3 - 1) + 1 + 4 = 9` and the Aesara code would look like + + .. code-block:: python + + input = aesara.tensor.nnet.abstract_conv.conv2d_grad_wrt_inputs( + output, filters, input_shape=(9, 9), filter_shape=(c1, c2, 4, 4), + border_mode='valid', subsample=(2, 2)) diff --git a/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides.gif b/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides.gif new file mode 100644 index 0000000000..9fbbaeaa5b Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides.pdf b/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides.pdf new file mode 100644 index 0000000000..fd64bfd20d Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides_transposed.gif b/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides_transposed.gif new file mode 100644 index 0000000000..0f98e5e6ba Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides_transposed.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides_transposed.pdf b/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides_transposed.pdf new file mode 100644 index 0000000000..a05052197b Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides_transposed.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/dilation.gif b/doc/tutorial/conv_arithmetic_figures/dilation.gif new file mode 100644 index 0000000000..2595596770 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/dilation.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/dilation.pdf b/doc/tutorial/conv_arithmetic_figures/dilation.pdf new file mode 100644 index 0000000000..e3c9489c54 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/dilation.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides.gif b/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides.gif new file mode 100644 index 0000000000..0d01515963 Binary 
files /dev/null and b/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides.pdf b/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides.pdf new file mode 100644 index 0000000000..8dbc6fc4fa Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides_transposed.gif b/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides_transposed.gif new file mode 100644 index 0000000000..bd75e5bdaa Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides_transposed.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides_transposed.pdf b/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides_transposed.pdf new file mode 100644 index 0000000000..2096fb6655 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides_transposed.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides.gif b/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides.gif new file mode 100644 index 0000000000..72649cf853 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides.pdf b/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides.pdf new file mode 100644 index 0000000000..79ea24f46f Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides_transposed.gif b/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides_transposed.gif new file mode 100644 index 0000000000..e1856b5d03 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides_transposed.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides_transposed.pdf b/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides_transposed.pdf new file mode 100644 index 0000000000..4bf9240ca6 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides_transposed.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/no_padding_strides.gif b/doc/tutorial/conv_arithmetic_figures/no_padding_strides.gif new file mode 100644 index 0000000000..03d8efcf41 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/no_padding_strides.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/no_padding_strides.pdf b/doc/tutorial/conv_arithmetic_figures/no_padding_strides.pdf new file mode 100644 index 0000000000..b62837c08d Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/no_padding_strides.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/no_padding_strides_transposed.gif b/doc/tutorial/conv_arithmetic_figures/no_padding_strides_transposed.gif new file mode 100644 index 0000000000..ab575a3183 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/no_padding_strides_transposed.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/no_padding_strides_transposed.pdf b/doc/tutorial/conv_arithmetic_figures/no_padding_strides_transposed.pdf new file mode 100644 index 0000000000..c6c78e7dff Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/no_padding_strides_transposed.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/numerical_no_padding_no_strides.gif 
b/doc/tutorial/conv_arithmetic_figures/numerical_no_padding_no_strides.gif new file mode 100644 index 0000000000..7f46404284 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/numerical_no_padding_no_strides.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/numerical_no_padding_no_strides.pdf b/doc/tutorial/conv_arithmetic_figures/numerical_no_padding_no_strides.pdf new file mode 100644 index 0000000000..835bec90fc Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/numerical_no_padding_no_strides.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/numerical_padding_strides.gif b/doc/tutorial/conv_arithmetic_figures/numerical_padding_strides.gif new file mode 100644 index 0000000000..f58f35fdd6 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/numerical_padding_strides.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/numerical_padding_strides.pdf b/doc/tutorial/conv_arithmetic_figures/numerical_padding_strides.pdf new file mode 100644 index 0000000000..f8a107eeff Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/numerical_padding_strides.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/padding_strides.gif b/doc/tutorial/conv_arithmetic_figures/padding_strides.gif new file mode 100644 index 0000000000..d587f220b9 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/padding_strides.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/padding_strides.pdf b/doc/tutorial/conv_arithmetic_figures/padding_strides.pdf new file mode 100644 index 0000000000..49ba33997f Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/padding_strides.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/padding_strides_odd.gif b/doc/tutorial/conv_arithmetic_figures/padding_strides_odd.gif new file mode 100644 index 0000000000..e28eec844c Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/padding_strides_odd.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/padding_strides_odd.pdf b/doc/tutorial/conv_arithmetic_figures/padding_strides_odd.pdf new file mode 100644 index 0000000000..82e299b02d Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/padding_strides_odd.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/padding_strides_odd_transposed.gif b/doc/tutorial/conv_arithmetic_figures/padding_strides_odd_transposed.gif new file mode 100644 index 0000000000..e8ef8511a5 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/padding_strides_odd_transposed.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/padding_strides_odd_transposed.pdf b/doc/tutorial/conv_arithmetic_figures/padding_strides_odd_transposed.pdf new file mode 100644 index 0000000000..c0a0380b55 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/padding_strides_odd_transposed.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/padding_strides_transposed.gif b/doc/tutorial/conv_arithmetic_figures/padding_strides_transposed.gif new file mode 100644 index 0000000000..8e457f5d0e Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/padding_strides_transposed.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/padding_strides_transposed.pdf b/doc/tutorial/conv_arithmetic_figures/padding_strides_transposed.pdf new file mode 100644 index 0000000000..79bfb921e6 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/padding_strides_transposed.pdf differ diff --git 
a/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides.gif b/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides.gif new file mode 100644 index 0000000000..71e296f9be Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides.pdf b/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides.pdf new file mode 100644 index 0000000000..d1a68d5a7b Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides_transposed.gif b/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides_transposed.gif new file mode 100644 index 0000000000..71e296f9be Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides_transposed.gif differ diff --git a/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides_transposed.pdf b/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides_transposed.pdf new file mode 100644 index 0000000000..a9f4a7851e Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides_transposed.pdf differ diff --git a/doc/tutorial/conv_arithmetic_figures/sep2D.jpg b/doc/tutorial/conv_arithmetic_figures/sep2D.jpg new file mode 100644 index 0000000000..e91fd47641 Binary files /dev/null and b/doc/tutorial/conv_arithmetic_figures/sep2D.jpg differ diff --git a/doc/tutorial/index.rst b/doc/tutorial/index.rst index 0482f590d0..709d20599a 100644 --- a/doc/tutorial/index.rst +++ b/doc/tutorial/index.rst @@ -42,6 +42,7 @@ Advanced sparse prng + conv_arithmetic Advanced configuration and debugging ------------------------------------ diff --git a/pytensor/link/jax/dispatch/__init__.py b/pytensor/link/jax/dispatch/__init__.py index 00976f221c..5da81bf80c 100644 --- a/pytensor/link/jax/dispatch/__init__.py +++ b/pytensor/link/jax/dispatch/__init__.py @@ -14,6 +14,7 @@ import pytensor.link.jax.dispatch.scalar import pytensor.link.jax.dispatch.scan import pytensor.link.jax.dispatch.shape +import pytensor.link.jax.dispatch.signal import pytensor.link.jax.dispatch.slinalg import pytensor.link.jax.dispatch.sort import pytensor.link.jax.dispatch.sparse diff --git a/pytensor/link/jax/dispatch/signal/__init__.py b/pytensor/link/jax/dispatch/signal/__init__.py new file mode 100644 index 0000000000..9264ff44bd --- /dev/null +++ b/pytensor/link/jax/dispatch/signal/__init__.py @@ -0,0 +1 @@ +import pytensor.link.jax.dispatch.signal.conv diff --git a/pytensor/link/jax/dispatch/signal/conv.py b/pytensor/link/jax/dispatch/signal/conv.py new file mode 100644 index 0000000000..1c124065e2 --- /dev/null +++ b/pytensor/link/jax/dispatch/signal/conv.py @@ -0,0 +1,14 @@ +import jax + +from pytensor.link.jax.dispatch import jax_funcify +from pytensor.tensor.signal.conv import Conv1d + + +@jax_funcify.register(Conv1d) +def jax_funcify_Conv1d(op, node, **kwargs): + mode = op.mode + + def conv1d(data, kernel): + return jax.numpy.convolve(data, kernel, mode=mode) + + return conv1d diff --git a/pytensor/link/numba/dispatch/__init__.py b/pytensor/link/numba/dispatch/__init__.py index 56a3e2c9b2..1fefb1d06d 100644 --- a/pytensor/link/numba/dispatch/__init__.py +++ b/pytensor/link/numba/dispatch/__init__.py @@ -9,9 +9,11 @@ import pytensor.link.numba.dispatch.random import pytensor.link.numba.dispatch.scan import pytensor.link.numba.dispatch.scalar +import pytensor.link.numba.dispatch.signal import 
pytensor.link.numba.dispatch.slinalg import pytensor.link.numba.dispatch.sparse import pytensor.link.numba.dispatch.subtensor import pytensor.link.numba.dispatch.tensor_basic + # isort: on diff --git a/pytensor/link/numba/dispatch/signal/__init__.py b/pytensor/link/numba/dispatch/signal/__init__.py new file mode 100644 index 0000000000..db4834d67d --- /dev/null +++ b/pytensor/link/numba/dispatch/signal/__init__.py @@ -0,0 +1 @@ +import pytensor.link.numba.dispatch.signal.conv diff --git a/pytensor/link/numba/dispatch/signal/conv.py b/pytensor/link/numba/dispatch/signal/conv.py new file mode 100644 index 0000000000..b1c63a440c --- /dev/null +++ b/pytensor/link/numba/dispatch/signal/conv.py @@ -0,0 +1,16 @@ +import numpy as np + +from pytensor.link.numba.dispatch import numba_funcify +from pytensor.link.numba.dispatch.basic import numba_njit +from pytensor.tensor.signal.conv import Conv1d + + +@numba_funcify.register(Conv1d) +def numba_funcify_Conv1d(op, node, **kwargs): + mode = op.mode + + @numba_njit + def conv1d(data, kernel): + return np.convolve(data, kernel, mode=mode) + + return conv1d diff --git a/pytensor/tensor/__init__.py b/pytensor/tensor/__init__.py index 88d3f33199..c6b421d003 100644 --- a/pytensor/tensor/__init__.py +++ b/pytensor/tensor/__init__.py @@ -116,6 +116,7 @@ def _get_vector_length_Constant(op: Op | Variable, var: Constant) -> int: # isort: off from pytensor.tensor import linalg from pytensor.tensor import special +from pytensor.tensor import signal # For backward compatibility from pytensor.tensor import nlinalg diff --git a/pytensor/tensor/conv/abstract_conv.py b/pytensor/tensor/conv/abstract_conv.py index 2bcfa0a551..84d5ac3f87 100644 --- a/pytensor/tensor/conv/abstract_conv.py +++ b/pytensor/tensor/conv/abstract_conv.py @@ -675,7 +675,7 @@ def abstract_conv2d( stack of 2D inputs with a set of 2D filters. The implementation is modelled after Convolutional Neural Networks (CNN). - Refer to :func:`nnet.conv2d ` for a more detailed documentation. + Refer to :func:`conv.conv2d ` for a more detailed documentation. """ input = as_tensor_variable(input) diff --git a/tests/tensor/conv/c_code/corr3d_gemm.c b/pytensor/tensor/conv/c_code/corr3d_gemm.c similarity index 100% rename from tests/tensor/conv/c_code/corr3d_gemm.c rename to pytensor/tensor/conv/c_code/corr3d_gemm.c diff --git a/tests/tensor/conv/c_code/corr_gemm.c b/pytensor/tensor/conv/c_code/corr_gemm.c similarity index 100% rename from tests/tensor/conv/c_code/corr_gemm.c rename to pytensor/tensor/conv/c_code/corr_gemm.c diff --git a/pytensor/tensor/conv/conv.py b/pytensor/tensor/conv/conv.py new file mode 100644 index 0000000000..667ae61126 --- /dev/null +++ b/pytensor/tensor/conv/conv.py @@ -0,0 +1,2637 @@ +""" +Contains an Op for convolving input images with a set of filters. This was +developed especially for Convolutional Neural Networks. + +For related ops, including downsampling and subsampling, see +tensor.signal and tensor.signal.pool. + +See especially conv2d(). 
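+
+A minimal usage sketch (shapes below are illustrative only; the ``conv2d``
+wrapper defined here is the legacy interface and emits a deprecation
+warning when called)::
+
+    import numpy as np
+    import pytensor
+    import pytensor.tensor as pt
+    from pytensor.tensor.conv.conv import conv2d
+
+    images = pt.tensor4("images", dtype="float64")    # (batch, channels, rows, cols)
+    filters = pt.tensor4("filters", dtype="float64")  # (nkern, channels, rows, cols)
+    out = conv2d(images, filters, border_mode="valid")
+    f = pytensor.function([images, filters], out)
+    f(np.random.rand(2, 1, 8, 8), np.random.rand(3, 1, 3, 3))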
+""" + + +import logging +import warnings + +import numpy as np + + +try: + from scipy.signal.signaltools import _bvalfromboundary, _valfrommode + from scipy.signal.sigtools import _convolve2d +except ImportError: + from scipy.signal._signaltools import _bvalfromboundary, _valfrommode + from scipy.signal._sigtools import _convolve2d + +import pytensor +from pytensor.graph.basic import Apply +from pytensor.link.c.op import OpenMPOp +from pytensor.tensor import blas +from pytensor.tensor.basic import as_tensor_variable, get_scalar_constant_value +from pytensor.tensor.conv.abstract_conv import ( + get_conv_output_shape, + get_conv_shape_1axis, +) +from pytensor.tensor.exceptions import NotScalarConstantError +from pytensor.tensor.shape import specify_broadcastable +from pytensor.tensor.type import discrete_dtypes, tensor + + +__docformat__ = "restructuredtext en" +_logger = logging.getLogger("pytensor.tensor.conv") + + +def conv2d( + input, + filters, + image_shape=None, + filter_shape=None, + border_mode="valid", + subsample=(1, 1), + **kargs, +): + """Build the symbolic graph for convolving a stack of input images with a set of filters. + + The implementation is modelled after Convolutional Neural Networks + (CNN). It is simply a wrapper to the `ConvOp` but provides a much cleaner + interface. + + This is deprecated. + + Parameters + ---------- + input : symbolic 4D tensor + Mini-batch of feature map stacks, of shape + (batch size, stack size, nb row, nb col) + see the optional parameter image_shape + filters: symbolic 4D tensor + Set of filters used in CNN layer of shape + (nb filters, stack size, nb row, nb col) + see the optional parameter filter_shape + border_mode : {'valid', 'full'} + 'valid'only apply filter to complete patches of the image. Generates + output of shape: image_shape - filter_shape + 1. + 'full' zero-pads image to multiple of filter shape to generate output + of shape: image_shape + filter_shape - 1. + subsample: tuple of len 2 + Factor by which to subsample the output. Also called strides elsewhere. + image_shape: None, tuple/list of len 4 of int, None or Constant variable + The shape of the input parameter. + Optional, used for optimization like loop unrolling + You can put None for any element of the list to tell that this element + is not constant. + filter_shape : None, tuple/list of len 4 of int, None or Constant variable + Optional, used for optimization like loop unrolling + You can put None for any element of the list + to tell that this element is not constant. + kwargs + Kwargs are passed onto ConvOp. Can be used to set the following: + unroll_batch, unroll_kern, unroll_patch, openmp (see ConvOp doc). + + openmp: By default have the same value as + config.openmp. For small image, filter, + batch size, nkern and stack size, it can be + faster to disable manually openmp. A fast and + incomplete test show that with image size + 6x6, filter size 4x4, batch size==1, + n kern==1 and stack size==1, it is faster + to disable it in valid mode. But if we + grow the batch size to 10, it is faster + with openmp on a core 2 duo. + + Returns + ------- + symbolic 4D tensor + Set of feature maps generated by convolutional layer. Tensor is + of shape (batch size, nb filters, output row, output col). + + """ + + warnings.warn( + "pytensor.tensor.nnet.conv.conv2d is deprecated." + " Use pytensor.tensor.nnet.conv2d instead.", + DeprecationWarning, + ) + + # accept Constant value for image_shape and filter_shape. 
+ if image_shape is not None: + image_shape = list(image_shape) + for i in range(len(image_shape)): + if image_shape[i] is not None: + try: + image_shape[i] = get_scalar_constant_value( + as_tensor_variable(image_shape[i]) + ) + except NotScalarConstantError: + raise NotScalarConstantError( + "The convolution need that the shape" + " information are constant values. We got" + " {image_shape[i]} for the image_shape parameter" + ) + assert image_shape[i].dtype in discrete_dtypes + image_shape[i] = int(image_shape[i]) + if filter_shape is not None: + filter_shape = list(filter_shape) + for i in range(len(filter_shape)): + if filter_shape[i] is not None: + try: + filter_shape[i] = get_scalar_constant_value( + as_tensor_variable(filter_shape[i]) + ) + except NotScalarConstantError: + raise NotScalarConstantError( + "The convolution need that the shape" + " information are constant values. We got" + " {filter_shape[i]} for the filter_shape " + "parameter" + ) + assert filter_shape[i].dtype in discrete_dtypes + filter_shape[i] = int(filter_shape[i]) + + if image_shape and filter_shape: + try: + if image_shape[1] is not None and filter_shape[1] is not None: + assert image_shape[1] == filter_shape[1] + except Exception: + print("image ", image_shape, " filters ", filter_shape) + raise + + if filter_shape is not None: + nkern = filter_shape[0] + kshp = filter_shape[2:] + else: + nkern, kshp = None, None + + if image_shape is not None: + bsize = image_shape[0] + imshp = image_shape[1:] + else: + bsize, imshp = None, None + + op = ConvOp( + output_mode=border_mode, + dx=subsample[0], + dy=subsample[1], + imshp=imshp, + kshp=kshp, + nkern=nkern, + bsize=bsize, + **kargs, + ) + + return op(input, filters) + + +class ConvOp(OpenMPOp): + r""" + This Op serves a dual purpose: it can implement a vanilla 2D convolution + (as taught in any signal processing class) or implement the + convolutional layers found in Convolutional Neural Networks. + + In this setting, a set of 3D images is convolved with a set of 3D kernels, + with the particularity that their leading dimensions are of equal length. + Vanilla 2D convolution is treated as a special case of this. + + The input parameter represents a mini-batch of multiple images. Its shape is: + batch size x num. input feature maps x image height x image width + + The kernel parameter represents a set of 3D kernels. Its shape is: + number of filters x num. input images x filter height x filter width + + The output of ConvOp is a 4D tensor, generated as follows: + output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k + where b is the mini-batch index, k the filter index and * is the + convolution operator. + + The constructor initializes a ConvOp with given output_mode (full/valid). + All other parameters are optional and are only used to generate more + optimized c code, or to enable graph optimizers to optimally replace the + ConvOp. + + NOTES ON OPTIMIZATION: + There are two types of optimization. The first is the selection of the + fastest algo when bsize and nkern are provided with imshp and kshp. + By default we try to select the fastest version. You can specify it + with the unroll_batch, unroll_kern, and unroll_patch parameter. + + The second type of optimization is hardcoding some dimensions into the + code when all shape are know. + This make a significant difference for the 'full' output_mode. 
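+
+    For instance, with every shape known, the unrolled code path can be
+    requested explicitly (values below are illustrative only)::
+
+        ConvOp(imshp=(1, 28, 28), kshp=(5, 5), nkern=16, bsize=32,
+               unroll_batch=4, unroll_kern=4, output_mode="valid")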
+ + Sometimes, the fastest implementation on x86-64 uses + {unroll_batch=4, unroll_kern=4, unroll_patch=False} + with all other shape parameters being provided. + + For optimizing other architectures, see: + Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance + Matrix Multiplication, (mr x nr). ACM Transactions on Mathematical + Software, May 2008. + Figure 12: (mr x nr). For x86 use 2x4, itanium 8x8, etc. + + Parameters + ---------- + output_mode : {'valid', 'full'} + 'valid' gives an output smaller then the image. + 'full' gives an output bigger then the image. + See 'border_mode' in conv2d's doc. + + Optional parameters: (will generate more optimal c code) + + imshp : tuple of len 2 or 3: 2 for 2d image, 3 for a stack of 2d images. + Stacksize, nb image row, nb image col. + kshp : tuple of len 2 + Nb kernel row, nb kernel col. + nkern : int + The number of kernel. + bsize : int + The size of the minibatch. + dx : int + Patch stride rows. + dy : int + Patch stride cols + + Params which select the version of code used: + + unroll_patch : bool + Use a version of c_code that unroll the patch loop that don't + request all shape information to work, but if all shape information + are present, will use it to hardcode the value in the code for + faster code. + unroll_batch : int + Use a version of c_code that unroll the batch (by unroll_batch) + and the nkern (by unroll_kern) loop. The size must by a multiple + of bsize or nkern respectively. + unroll_kern : int + Use a version of c_code that unroll the batch + (by unroll_batch) and the nkern(by unroll_kern) loop. The size + must by a multiple of bsize or nkern respectively. + The 3 following parameters are used internally when we generate + the gradient when dx!=1 or dy!=1. + + imshp_logical + Default None. None value is equivalent to imshp value. + When imshp_logical != imshp, it tell we need to insert 0 in + the image before we do the convolution. For example, when dx==dy==2 + and the image is [[1, 2], [3, 4]], we should make as if the image + was [[1, 0, 2, 0], [0, 0, 0, 0], [3, 0, 4, 0], [0, 0, 0, 0]]. + Our python code insert the zero, but the c code optimize it. + imshp_logical != imshp when taking the grad again the weights or + the image when the output_mode is full and `dx != 1` or `dy != 1`. + kshp_logical + Idem but for kshp and used for the grad again the + weights when the output_mode is valid and `dx != 1` or `dy != 1`. + kshp_logical_top_aligned + Used in the same case. Default to True. + Set to False in the grad again the weight when the + output_mode is full. + + """ + + __attrnames = [ + "imshp", + "kshp", + "nkern", + "bsize", + "dx", + "dy", + "out_mode", + "unroll_batch", + "unroll_kern", + "unroll_patch", + "imshp_logical", + "kshp_logical", + "kshp_logical_top_aligned", + ] + """These attributes uniquely identify the behaviour of this op for + given inputs. Do not set openmp here. + """ + + # the value of speed_unroll_batch_kern,speed_unroll_patch_noshape,speed_unroll_patch_shape + # have bean calculated on maggie36 when their is only 1 session logged on and only this was running. + # It is an Intel(R) Xeon(R) CPU E5430 @ 2.66GHz. It is computer with pytensor/tensor/nnet/tests/speed_test_conv.py + # and took 5 minutes to run. + # TODO: we should compute this table for each computer/os as this can change. + # I saw on one computer that the speed with the shape can be slower than without! + # using the real shape and the same dtype could also help. 
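+    # __init__ uses these timings to pick a code version: among the
+    # entries whose (unroll_batch, unroll_kern) divide bsize and nkern,
+    # it keeps the one with the smallest time for the requested output
+    # mode and compares it against the unroll_patch timings.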
+ + # unroll_batch, unroll_kern, valid time, full time + speed_unroll_batch_kern = [ + (1, 1, 2.4661250114440918, 6.5472931861877441), + (1, 2, 1.5869178771972656, 5.1499760150909424), + (1, 3, 1.4270510673522949, 3.6593470573425293), + (1, 4, 1.3373479843139648, 3.3451821804046631), + (1, 5, 1.2818830013275146, 3.1444568634033203), + (1, 6, 1.2521560192108154, 3.0256359577178955), + (1, 10, 1.2134110927581787, 2.9174180030822754), + (2, 1, 1.657214879989624, 4.5261678695678711), + (2, 2, 1.2123160362243652, 2.9747390747070312), + (2, 3, 1.0758891105651855, 2.5690360069274902), + (2, 4, 1.0683329105377197, 2.4233770370483398), + (2, 5, 1.0955719947814941, 2.3999948501586914), + (2, 6, 1.5935721397399902, 2.6878271102905273), + (2, 10, 1.8511250019073486, 3.2417428493499756), + (3, 1, 1.5948119163513184, 3.631148099899292), + (3, 2, 1.0761330127716064, 2.6011371612548828), + (3, 3, 1.0551531314849854, 2.4200370311737061), + (3, 4, 1.3930759429931641, 2.5211219787597656), + (3, 5, 1.4330689907073975, 2.5704989433288574), + (3, 6, 1.362138032913208, 2.5964410305023193), + (3, 10, 1.6582000255584717, 2.9907989501953125), + (4, 1, 1.4793620109558105, 3.3473429679870605), + (4, 2, 1.0671560764312744, 2.4171769618988037), + (4, 3, 1.2569692134857178, 2.2807950973510742), + (4, 4, 1.3456289768218994, 2.6219108104705811), + (4, 5, 1.4055080413818359, 2.4606490135192871), + (4, 6, 1.372107982635498, 2.551663875579834), + (4, 10, 1.599470853805542, 2.9172940254211426), + (5, 1, 1.4115700721740723, 3.2077109813690186), + (5, 2, 1.0635769367218018, 2.2648060321807861), + (5, 3, 1.3842809200286865, 2.6135518550872803), + (5, 4, 1.3470511436462402, 2.3852400779724121), + (5, 5, 1.3539440631866455, 2.5245928764343262), + (5, 6, 1.4037849903106689, 2.5985310077667236), + (5, 10, 1.6120610237121582, 2.8127608299255371), + (6, 1, 1.3623628616333008, 3.021122932434082), + (6, 2, 1.1697649955749512, 2.6285450458526611), + (6, 3, 1.2980999946594238, 2.4746189117431641), + (6, 4, 1.3739941120147705, 2.5579929351806641), + (6, 5, 1.3967819213867188, 2.5522029399871826), + (6, 6, 1.4279270172119141, 2.6127138137817383), + (6, 10, 1.605496883392334, 2.864037036895752), + (10, 1, 1.6401121616363525, 2.970099925994873), + (10, 2, 1.46710205078125, 2.7231831550598145), + (10, 3, 1.4193780422210693, 2.6087639331817627), + (10, 4, 1.4657118320465088, 2.6246678829193115), + (10, 5, 1.5052611827850342, 2.6542458534240723), + (10, 6, 1.5214400291442871, 2.7243161201477051), + (10, 10, 1.6116268634796143, 2.956165075302124), + ] + + # valid time, full time + speed_unroll_patch_noshape = [2.0109100341796875, 5.8175678253173828] + # valid time, full time + speed_unroll_patch_shape = [1.2967290878295898, 5.5283889770507812] + + @staticmethod + def has_all_shape(imshp, kshp, nkern=1, bsize=1): + return ( + nkern is not None + and bsize is not None + and all(shp is not None for shp in imshp) + and all(shp is not None for shp in kshp) + ) + + @staticmethod + def getOutputShape(inshp, kshp, stride=(1, 1), mode="valid"): + """ + Computes the output dimensions of convolving an image of shape "inshp" + with kernels of shape "kshp". Accepts symbolic or integer shapes. + Propagates `None`s (for unknown shapes). + + Parameters + ---------- + inshp + (rows,cols) of input image. + kshp + (rows,cols) of filters. + mode: {'valid', 'full'} + See 'border_mode' in conv2d's doc. + + Returns + ------- + object + (rows,cols) of output image. 
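+
+        For example, a 7x7 input convolved with a 3x3 kernel at unit
+        strides gives a 5x5 output in 'valid' mode and a 9x9 output in
+        'full' mode.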
+ + """ + # The formula would be ceil((i + s * k - s * 1) / float(d)), + # with s=1 for mode=='full' and s=-1 for mode=='valid'. + # To support symbolic shapes, we express this with integer arithmetic. + warnings.warn( + "`getOutputShape` is deprecated; use `get_conv_output_shape` instead.", + DeprecationWarning, + stacklevel=2, + ) + return tuple( + get_conv_shape_1axis(i, k, mode, d) for i, k, d in zip(inshp, kshp, stride) + ) + + def __init__( + self, + imshp=None, + kshp=None, + nkern=None, + bsize=None, + dx=1, + dy=1, + output_mode="valid", + unroll_batch=None, + unroll_kern=None, + unroll_patch=None, + imshp_logical=None, + kshp_logical=None, + kshp_logical_top_aligned=True, + verbose=False, + openmp=None, + ): + # Expand unknown image / kernel shapes into tuples of Nones + if imshp is None: + imshp = (None, None, None) + else: + imshp = tuple(imshp) + if kshp is None: + kshp = (None, None) + else: + kshp = tuple(kshp) + + # Check imshp and kshp dimensionality + if len(imshp) == 2: + imshp = (1,) + imshp + elif len(imshp) != 3: + raise ValueError(f"len(imshp) must be 2 or 3, got {len(imshp)}") + if len(kshp) != 2: + raise ValueError(f"len(kshp) must be 2, got {len(kshp)}") + + # We must continue to consider None as 1 for backward compatibility. + if dx is None: + dx = 1 + if dy is None: + dy = 1 + + if int(dx) != dx: + raise TypeError("ConvOp.__init__ param dx must be an int", dx) + dx = int(dx) + + if int(dy) != dy: + raise TypeError("ConvOp.__init__ param dy must be an int", dy) + dy = int(dy) + + all_shape = self.has_all_shape(imshp, kshp, nkern, bsize) + if (unroll_batch or unroll_kern) and not all_shape: + raise ValueError( + "In ConvOp, when using unroll_batch and" + " unroll_nkern, all shape are needed" + ) + + # Init the openmp attribute + super().__init__(openmp=openmp) + if not all_shape or self.openmp: + # Only this version is parallelized + unroll_patch = True + self.verbose = verbose + self.imshp = imshp + self.kshp = kshp + self.nkern = nkern + self.bsize = bsize + self.dx = dx + self.dy = dy + + # a triple + if imshp_logical is None: + self.imshp_logical = self.imshp + else: + imshp_logical = tuple(imshp_logical) + if len(imshp_logical) != 3: + raise ValueError( + f"len(imshp_logical) must be 3, got {len(imshp_logical)}" + ) + self.imshp_logical = imshp_logical + + # a pair + if kshp_logical is None: + self.kshp_logical = self.kshp + else: + kshp_logical = tuple(kshp_logical) + if len(kshp_logical) != 2: + raise ValueError( + f"len(kshp_logical) must be 2, got {len(kshp_logical)}" + ) + self.kshp_logical = kshp_logical + + # a bool + self.kshp_logical_top_aligned = kshp_logical_top_aligned + + self.unroll_batch = unroll_batch + self.unroll_kern = unroll_kern + self.unroll_patch = unroll_patch + + if self.unroll_batch and not self.unroll_kern: + self.unroll_kern = 1 + if self.unroll_kern and not self.unroll_batch: + self.unroll_batch = 1 + + # downcast unroll_batch if not a divisor of batch size + if ( + self.unroll_batch is not None + and self.unroll_batch > 0 + and self.bsize % self.unroll_batch != 0 + ): + if self.bsize <= self.unroll_batch: + self.unroll_batch = self.bsize + else: + # find the maximum value under unroll_batch that would work + new = self.unroll_batch + assert new >= 1 + while self.bsize % new != 0: + new -= 1 + + warnstr = ( + "In ConvOp.__init__(): " + f"unroll_batch({self.unroll_batch}) must be 0 or a divisor of" + f" bsize({self.bsize}). We revert it to {new}. This" + " won't change the result, but may make it slower." 
+ ) + _logger.warning(warnstr) + + self.unroll_batch = new + + # downcast unroll_kern if not a divisor of nb of kernel + if ( + self.unroll_kern is not None + and self.unroll_kern > 0 + and self.nkern % self.unroll_kern != 0 + ): + if self.nkern <= self.unroll_kern: + self.unroll_kern = self.nkern + else: + # find the maximum value under unroll_kern that would work + new = self.unroll_kern + assert new >= 1 + while self.nkern % new != 0: + new -= 1 + + warnstr = ( + "In ConvOp.__init__(): " + f"unroll_kern({self.unroll_kern}) must be 0 or a divisor of" + f" nkern({self.nkern}). We revert it to {new}. This" + " won't change the result, but may make it slower." + ) + _logger.warning(warnstr) + self.unroll_kern = new + + self.outshp = get_conv_output_shape( + (None,) + self.imshp_logical, + ( + None, + None, + ) + + self.kshp_logical, + output_mode, + (dx, dy), + )[2:] + self.fulloutshp = get_conv_output_shape( + (None,) + self.imshp_logical, + ( + None, + None, + ) + + self.kshp_logical, + output_mode, + (1, 1), + )[2:] + + self.out_mode = output_mode + + if self.out_mode not in ("valid", "full"): + raise NotImplementedError(f"Mode {self.out_mode} not implemented") + + if any((shp is not None) and (shp <= 0) for shp in self.outshp): + raise ValueError( + "Bad size for the output shape. Verify that [post-" + f"supersampling] input shape ({self.imshp_logical}) and kern" + f" shape({self.kshp_logical}) are ok. (Hint: kerns must fit inside" + " image in valid mode)" + ) + + if ( + self.unroll_kern is None + and self.unroll_batch is None + and self.unroll_patch is None + ): + # no version specified. Find the faster we have + if self.bsize is None and self.nkern is None: + self.unroll_patch = True + elif self.bsize is not None and self.nkern is not None: + bsize = self.bsize + nkern = self.nkern + mode_idx = 0 + if self.out_mode != "valid": + mode_idx = 1 + if self.has_all_shape(self.imshp, self.kshp): + time_unroll_patch = self.speed_unroll_patch_shape[mode_idx] + else: + time_unroll_patch = self.speed_unroll_patch_noshape[mode_idx] + time_unroll_batch_kern = 9999999 + for i in range(len(self.speed_unroll_batch_kern)): + if ( + bsize % self.speed_unroll_batch_kern[i][0] == 0 + and nkern % self.speed_unroll_batch_kern[i][1] == 0 + ): + if ( + self.speed_unroll_batch_kern[i][2 + mode_idx] + < time_unroll_batch_kern + ): + time_unroll_batch_kern = self.speed_unroll_batch_kern[i][ + 2 + mode_idx + ] + time_unroll_batch_kern_idx = i + if time_unroll_patch < time_unroll_batch_kern: + self.unroll_patch = True + else: + self.unroll_batch = self.speed_unroll_batch_kern[ + time_unroll_batch_kern_idx + ][0] + self.unroll_kern = self.speed_unroll_batch_kern[ + time_unroll_batch_kern_idx + ][1] + self.unroll_patch = False + + _logger.debug( + "AUTO FIND VERSION OF C_CODE OF CONV OP %s %s %s %s %s %s %s", + self.unroll_batch, + self.unroll_kern, + self.unroll_patch, + self.bsize, + self.nkern, + time_unroll_patch, + time_unroll_batch_kern, + ) + + self._rehash() + + def __eq__(self, other): + if type(self) != type(other): + return False + for a in self.__attrnames: + if getattr(self, a) != getattr(other, a): + return False + return True + + def __setstate__(self, d): + super().__setstate__(d) + self._rehash() + + def _rehash(self): + hashval = hash(type(self)) + for a in self.__attrnames: + hashval = hashval ^ hash(getattr(self, a)) + self.__hashval = hashval + + def __hash__(self): + return self.__hashval + + def __str__(self): + return ( + "ConvOp{" + + ",".join(str((a, getattr(self, a))) for a in 
self.__attrnames) + + "}" + ) + + def flops(self, inputs, outputs): + """ + Useful with the hack in profiling to print the MFlops. + + """ + images, kerns = inputs + (out,) = outputs + assert images[1] == kerns[1] + flops = 0 + if self.out_mode == "valid": + # nb mul and add by output pixel + flops = kerns[2] * kerns[3] * 2 + # nb flops by output image + flops *= out[2] * out[3] + # nb patch multiplied + flops *= images[1] * kerns[0] * images[0] + else: + flops = ( + images[0] + * kerns[0] + * images[1] + * kerns[2] + * kerns[3] + * images[2] + * images[3] + * 2 + ) + return flops + + def make_node(self, inputs, kerns): + # TODO: find a way to make ConvOp work for N-D (after NIPS09) + """ + Parameters + ---------- + inputs + 4 dim: batches x stacksize x rows x cols. + kerns + 4 dim: nkern x stackidx x rows x cols. + + """ + _inputs = as_tensor_variable(inputs) + _kerns = as_tensor_variable(kerns) + # TODO: lift this restriction by upcasting either inputs or kerns + if _inputs.ndim != 4: + raise TypeError( + "ConvOp (make_node) requires input be a 4D tensor;" + f' received "{inputs}" ({_inputs.ndim} dims)' + ) + if _kerns.ndim != 4: + raise TypeError("make_node requires 4D tensor of kernels") + if _inputs.type.dtype != _kerns.type.dtype: + raise NotImplementedError( + "The image and the kernel must have the same type." + "inputs({_inputs.dtype}), kerns({_kerns.dtype})" + ) + out_shape = ( + _inputs.type.shape[0], + _kerns.type.shape[0], + self.outshp[0], + self.outshp[1], + ) + out_shape = tuple(1 if s == 1 else None for s in out_shape) + output = tensor( + dtype=_inputs.type.dtype, + shape=out_shape, + ) + + return Apply(self, [_inputs, _kerns], [output]) + + def infer_shape(self, fgraph, node, input_shapes): + imshp = input_shapes[0] # 4D image shape + kshp = input_shapes[1] # 4D filter shape + bsize, imshp = imshp[0], list(imshp[1:]) + nkern, kshp = kshp[0], list(kshp[2:]) + # replace symbolic shapes with known shapes + if self.bsize is not None: + bsize = self.bsize + for i in (0, 1, 2): + if self.imshp_logical[i] is not None: + imshp[i] = self.imshp_logical[i] + if self.nkern is not None: + nkern = self.nkern + for i in (0, 1): + if self.kshp_logical[i] is not None: + kshp[i] = self.kshp_logical[i] + # infer output shape from what we have + res = get_conv_output_shape( + (bsize,) + tuple(imshp), + ( + nkern, + None, + ) + + tuple(kshp), + self.out_mode, + (self.dx, self.dy), + ) + return [res] + + def perform(self, node, inp, out): + """ + By default if len(img2d.shape)==3, we TODO + + """ + img2d, filtersflipped = inp + (z,) = out + + # TODO: move these back out to global scope when they no longer + # cause an atexit error + imshp = self.imshp + if any(x is None for x in imshp): + imshp = tuple(img2d.shape[1:]) + if imshp != img2d.shape[1:]: + raise ValueError( + "The image shape provided at build time " + "is different from the one passed at run time", + imshp, + img2d.shape[1:], + ) + kshp = self.kshp + if any(x is None for x in kshp): + kshp = tuple(filtersflipped.shape[2:]) + if kshp != filtersflipped.shape[2:]: + raise ValueError( + "The filter shape provided at build time " + "is different from the one passed at run time", + kshp, + filtersflipped.shape[2:], + ) + bsize = self.bsize + if bsize is None: + bsize = img2d.shape[0] + elif bsize != img2d.shape[0]: + raise ValueError( + "The batch size provided at build time " + "is different from the one passed at run time", + bsize, + img2d.shape[0], + ) + nkern = self.nkern + if nkern is None: + nkern = filtersflipped.shape[0] + 
elif nkern != filtersflipped.shape[0]: + raise ValueError( + "The number of filters provided at build time " + "is different from the one passed at run time", + nkern, + filtersflipped.shape[0], + ) + + imshp_logical = self.imshp_logical + if imshp_logical[0] is None: + imshp_logical = (imshp[0],) + imshp_logical[1:] + if imshp_logical[1] is None: + imshp_logical = (imshp_logical[0], imshp[1], imshp_logical[2]) + if imshp_logical[2] is None: + imshp_logical = imshp_logical[:2] + (imshp[2],) + assert all(x is not None for x in imshp_logical) + + kshp_logical = self.kshp_logical + if kshp_logical[0] is None: + kshp_logical = (kshp[0], kshp_logical[1]) + if kshp_logical[1] is None: + kshp_logical = (kshp_logical[0], kshp[1]) + assert all(x is not None for x in kshp_logical) + + if all(shp is not None for shp in self.fulloutshp): + fulloutshp = tuple(self.fulloutshp) + else: + fulloutshp = get_conv_output_shape( + (None,) + imshp_logical, + ( + None, + None, + ) + + kshp_logical, + self.out_mode, + (1, 1), + )[2:] + + if ( + z[0] is None + or z[0].shape + != ( + bsize, + nkern, + ) + + fulloutshp + ): + z[0] = np.zeros( + ( + bsize, + nkern, + ) + + fulloutshp, + dtype=img2d.dtype, + ) + zz = z[0] + + stacklen = imshp[0] + + img2d = img2d.reshape((bsize,) + imshp) + filtersflipped = filtersflipped.reshape((nkern, stacklen) + kshp) + + if self.imshp != self.imshp_logical: + # assuming that to get from imshp to imshp logical we insert zeros in missing spots + rstride = int(np.ceil(imshp_logical[1] / float(imshp[1]))) + cstride = int(np.ceil(imshp_logical[2] / float(imshp[2]))) + buf = np.zeros((bsize,) + imshp_logical, dtype=img2d.dtype) + buf[:, :, ::rstride, ::cstride] = img2d + img2d = buf + del buf, rstride, cstride + + if kshp != kshp_logical: + rstride = int(np.ceil(kshp_logical[0] / float(kshp[0]))) + cstride = int(np.ceil(kshp_logical[1] / float(kshp[1]))) + buf = np.zeros( + (nkern, stacklen) + self.kshp_logical, dtype=filtersflipped.dtype + ) + if self.kshp_logical_top_aligned: + roffset = coffset = 0 + else: + roffset = ( + kshp_logical[0] - (kshp[0] * rstride) - 1 + rstride + ) % rstride + coffset = ( + kshp_logical[1] - (kshp[1] * cstride) - 1 + cstride + ) % cstride + assert roffset >= 0 + assert coffset >= 0 + buf[:, :, roffset::rstride, coffset::cstride] = filtersflipped + filtersflipped = buf + del buf, rstride, cstride + + val = _valfrommode(self.out_mode) + bval = _bvalfromboundary("fill") + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", np.ComplexWarning) + for b in range(bsize): + for n in range(nkern): + zz[b, n, ...].fill(0) + for im0 in range(stacklen): + # some cast generates a warning here + zz[b, n, ...] += _convolve2d( + img2d[b, im0, ...], + filtersflipped[n, im0, ...], + 1, + val, + bval, + 0, + ) + + if False: + if False and self.out_mode == "full": + img2d2 = np.zeros( + ( + bsize, + stacklen, + imshp[1] + 2 * kshp[0] - 2, + imshp[2] + 2 * kshp[1] - 2, + ) + ) + img2d2[ + :, + :, + kshp[0] - 1 : kshp[0] - 1 + imshp[1], + kshp[1] - 1 : kshp[1] - 1 + imshp[2], + ] = img2d + img2d = img2d2 + # N_image_shape = image_data.shape + + for b in range(bsize): + for n in range(nkern): + zz[b, n, ...].fill(0) + for im0 in range(stacklen): + for row in range(0, zz.shape[2], self.dx): + for col in range(0, zz.shape[3], self.dy): + zz[b, n, row, col] += ( + img2d[ + b, im0, row : row + kshp[0], col : col + kshp[1] + ] + * filtersflipped[n, im0, ::-1, ::-1] + ).sum() + + # We copy it to remove the Stride mismatch warning from DEBUG_MODE. 
+ # The copy make that we return an object with the same stride as the c version. + # The copy don't affect the performance during our experience as in that case we + # execute the c version which is much faster. + if self.dx > 1 or self.dy > 1: + zz = zz[:, :, 0 :: self.dx, 0 :: self.dy].copy() + z[0] = zz + + def R_op(self, inputs, eval_points): + rval = None + if eval_points[0] is not None: + rval = self.make_node(eval_points[0], inputs[1]).outputs[0] + if eval_points[1] is not None: + if rval is None: + rval = self.make_node(inputs[0], eval_points[1]).outputs[0] + else: + rval += self.make_node(inputs[0], eval_points[1]).outputs[0] + return [rval] + + def grad(self, inp, grads): + inputs, kerns = inp + (gz,) = grads + + if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical: + raise NotImplementedError("todo") + + if self.out_mode == "valid" and (self.dx, self.dy) != (1, 1): + raise NotImplementedError( + "ERROR: ConvOp.grad is now disabled for 'valid' convolutions with" + " stride != (1, 1); call pytensor.tensor.conv.conv2d() instead." + ) + + if self.dx not in (1, 2) or self.dy not in (1, 2): + raise NotImplementedError( + "ERROR: We disable ConvOp.grad now when output_mode is not" + " 'valid' and dx or dy are greater than 2, as there is a bug" + " in it. See `abstract_conv2d <>`_ for a version that support this." + ) + + all_shape = self.has_all_shape(self.imshp, self.kshp, self.nkern, self.bsize) + + if not all_shape and (self.dx != 1 or self.dy != 1): + raise ValueError( + "ConvOp.grad when dx!=1 or dy!=1 we must have all " + "the optional shape information" + ) + + # Determine gradient on kernels ######## + assert inputs.ndim == 4 and kerns.ndim == 4 + + newin = inputs.dimshuffle((1, 0, 2, 3)) + newgz = gz.dimshuffle((1, 0, 2, 3)) + + if self.out_mode == "valid": + (img, filters) = (newin, newgz) + kshp_logical = self.fulloutshp + kshp_logical_top_aligned = False + imshp_logical = None + (bsize, nkern) = (self.imshp[0], self.nkern) + imshp = (self.bsize, self.imshp[1], self.imshp[2]) + kshp = self.outshp + elif self.out_mode == "full": + (img, filters) = (newgz, newin) + kshp_logical = None + kshp_logical_top_aligned = True + imshp_logical = (self.bsize, self.fulloutshp[0], self.fulloutshp[1]) + (bsize, nkern) = (self.nkern, self.imshp[0]) + imshp = (self.bsize, self.outshp[0], self.outshp[1]) + kshp = self.imshp[1:] + else: + raise NotImplementedError( + "Only [full,valid] modes are currently supported." 
+ ) + + filters = filters[:, :, ::-1, ::-1] # flip them + + dw = ConvOp( + imshp, + kshp, + nkern, + bsize, + 1, + 1, + output_mode="valid", + unroll_batch=None, + unroll_kern=None, + unroll_patch=None, + imshp_logical=imshp_logical, + kshp_logical=kshp_logical, + kshp_logical_top_aligned=kshp_logical_top_aligned, + verbose=self.verbose, + ) + + dw = dw(img, filters) + + if all_shape: + assert all(o == k for o, k in zip(dw.owner.op.outshp, self.kshp)) + if self.out_mode == "valid": + # before DimShuffle, dw is of shape visdim x nkern x kshp[0] x kshp[1] + dw = dw.dimshuffle((1, 0, 2, 3)) + dw = dw[:, :, ::-1, ::-1] + + # Determine gradient on inputs ######## + mode = "valid" + if self.out_mode != "full": + mode = "full" + + filters = kerns.dimshuffle((1, 0, 2, 3)) + filters = filters[:, :, ::-1, ::-1] + + nkern = self.imshp[0] + imshp = (self.nkern, self.outshp[0], self.outshp[1]) + imshp_logical = (self.nkern, self.fulloutshp[0], self.fulloutshp[1]) + + din = ConvOp( + imshp, + self.kshp, + nkern, + self.bsize, + 1, + 1, + output_mode=mode, + unroll_batch=None, + unroll_kern=None, + unroll_patch=None, + imshp_logical=imshp_logical, + kshp_logical=None, + verbose=self.verbose, + ) + + din = din(gz, filters) + + assert all( + o is None or o == i for o, i in zip(din.owner.op.outshp, self.imshp[1:]) + ) + + # din and dw should have the same broadcasting pattern as the + # parameters they are the gradient of (resp. inputs and kerns). + if din.type.broadcastable != inputs.type.broadcastable: + din = specify_broadcastable( + din, *(ax for (ax, b) in enumerate(inputs.type.broadcastable) if b) + ) + if dw.type.broadcastable != kerns.type.broadcastable: + dw = specify_broadcastable( + dw, *(ax for (ax, b) in enumerate(kerns.type.broadcastable) if b) + ) + return [din, dw] + + def c_headers(self, **kwargs): + return ["", "", ""] + + def c_code_cache_version(self): + return (15, self.openmp, blas.blas_header_version()) + + def c_support_code(self, **kwargs): + return ( + """ +#define STRIDES(arr) (PyArray_STRIDES(arr)) +#define FULL 2 +#define SAME 1 +#define VALID 0 +#define MOD % +using namespace std; +""" + + blas.blas_header_text() + ) + + def use_blas(self): + """Return True if we will generate code that use gemm.""" + # the gemm version only support that case + if self.out_mode == "valid" and self.dx == 0 and self.dy == 0: + # We use a faster version in those case. + if ( + self.imshp != self.imshp_logical + or self.kshp != self.kshp_logical + or self.unroll_patch + or self.unroll_batch > 0 + or self.unroll_kern > 0 + ): + return False + return True + return False + + def c_libraries(self, **kwargs): + if self.use_blas(): + return blas.ldflags() + return [] + + def c_no_compile_args(self, **kwargs): + # when the ksph==(1,1) gcc 4.3.0 segfault during the + # compilation with -O3. 
This don't happen at -O2 + if pytensor.link.c.cmodule.gcc_version() in ["4.3.0"] and self.kshp == (1, 1): + return ["-O3"] + else: + return [] + + def c_compile_args(self, **kwargs): + ret = [] + + if self.use_blas(): + ret = blas.ldflags(libs=False, flags=True) + if pytensor.link.c.cmodule.gcc_version() in ["4.3.0"] and self.kshp == (1, 1): + ret += ["-O2"] + # Add the -fopenmp flags + ret += super().c_compile_args(**kwargs) + + return ret + + def c_lib_dirs(self, **kwargs): + if self.use_blas(): + return blas.ldflags(libs=False, libs_dir=True) + return [] + + def c_header_dirs(self, **kwargs): + if self.use_blas(): + return blas.ldflags(libs=False, include_dir=True) + return [] + + def c_code(self, node, name, inp, out, sub): + img2d, filtersflipped = inp + (z,) = out + if node.inputs[0].type.dtype != node.inputs[1].type.dtype: + raise NotImplementedError() + assert node.inputs[0].type.dtype == node.inputs[1].type.dtype + d = locals() + d.update(sub) + + all_shape = self.has_all_shape( + self.imshp, self.kshp, self.nkern, self.bsize + ) and self.has_all_shape(self.imshp_logical, self.kshp_logical) + + d["self_out_mode"] = self.out_mode + d["self_dx"] = self.dx + d["self_dy"] = self.dy + d["mode"] = self.out_mode.upper() + d["affectation"] = "=" + + # Default values, will be overridden if the shape info is provided + d["self_bsize"] = f"PyArray_DIMS({d['img2d']})[0]" + d["self_nkern"] = f"PyArray_DIMS({d['filtersflipped']})[0]" + d["self_outshp0"] = "-1" + d["self_outshp1"] = "-1" + d["self_imshp0"] = f"PyArray_DIMS({d['img2d']})[1]" + d["self_imshp1"] = f"PyArray_DIMS({d['img2d']})[2]" + d["self_imshp2"] = f"PyArray_DIMS({d['img2d']})[3]" + d["self_kshp0"] = f"PyArray_DIMS({d['filtersflipped']})[2]" + d["self_kshp1"] = f"PyArray_DIMS({d['filtersflipped']})[3]" + d["assert_size"] = "" + + # Override the default value if we have it + if self.kshp[0] is not None: + expected = d["self_kshp0"] + value = self.kshp[0] + d[ + "assert_size" + ] += """ +if(%(value)s != %(expected)s){ + PyErr_Format(PyExc_ValueError, + "The hardcoded shape for the number of rows in the filter " + "(%%ld) isn't the run time shape (%%ld).", + (long)%(value)s, (long)%(expected)s); + %(fail)s; +} + """ % dict( + expected=expected, value=value, **sub + ) + d["self_kshp0"] = self.kshp[0] + if self.kshp[1] is not None: + expected = d["self_kshp1"] + value = self.kshp[1] + d[ + "assert_size" + ] += """ +if(%(value)s != %(expected)s){ + PyErr_Format(PyExc_ValueError, + "The hardcoded shape for the number of columns in the filter " + "(%%ld) isn't the run time shape (%%ld).", + (long)%(value)s, (long)%(expected)s); + %(fail)s; +} + """ % dict( + expected=expected, value=value, **sub + ) + d["self_kshp1"] = self.kshp[1] + if self.outshp[0] is not None: + expected = "dim_zz[0]" + value = self.outshp[0] + d[ + "assert_size" + ] += """ +if(%(value)s != %(expected)s){ + PyErr_Format(PyExc_ValueError, + "The hardcoded shape for the number of rows in the output " + "(%%ld) isn't the run time shape (%%ld).", + (long)%(value)s, (long)%(expected)s); + %(fail)s; +} + """ % dict( + expected=expected, value=value, **sub + ) + d["self_outshp0"] = self.outshp[0] + if self.outshp[1] is not None: + expected = "dim_zz[1]" + value = self.outshp[1] + d[ + "assert_size" + ] += """ +if(%(value)s != %(expected)s){ + PyErr_Format(PyExc_ValueError, + "The hardcoded shape for the number of columns in the output " + "(%%ld) isn't the run time shape (%%ld).", + (long)%(value)s, (long)%(expected)s); + %(fail)s; +} + """ % dict( + expected=expected, 
value=value, **sub + ) + d["self_outshp1"] = self.outshp[1] + if self.imshp[0] is not None: + expected = d["self_imshp0"] + value = self.imshp[0] + d[ + "assert_size" + ] += """ +if(%(value)s != %(expected)s){ + PyErr_Format(PyExc_ValueError, + "The hardcoded shape for the image stack size (%%ld) " + "isn't the run time shape (%%ld).", + (long)%(value)s, (long)%(expected)s); + %(fail)s; +} + """ % dict( + expected=expected, value=value, **sub + ) + expected = "kerns_dim[1]" + value = self.imshp[0] + d[ + "assert_size" + ] += """ +if(%(value)s != %(expected)s){ + PyErr_Format(PyExc_ValueError, + "The hardcoded shape for the kernel stack size (%%ld) " + "isn't the run time shape (%%ld).", + (long)%(value)s, (long)%(expected)s); + %(fail)s; +} + """ % dict( + expected=expected, value=value, **sub + ) + d["self_imshp0"] = self.imshp[0] + if self.imshp[1] is not None: + expected = d["self_imshp1"] + value = self.imshp[1] + d[ + "assert_size" + ] += """ +if(%(value)s != %(expected)s){ + PyErr_Format(PyExc_ValueError, + "The hardcoded shape for the number of rows in the image " + "(%%ld) isn't the run time shape (%%ld).", + (long)%(value)s, (long)%(expected)s); + %(fail)s; +} + """ % dict( + expected=expected, value=value, **sub + ) + d["self_imshp1"] = self.imshp[1] + if self.imshp[2] is not None: + expected = d["self_imshp2"] + value = self.imshp[2] + d[ + "assert_size" + ] += """ +if(%(value)s != %(expected)s){ + PyErr_Format(PyExc_ValueError, + "The hardcoded shape for the number of columns in the image " + "(%%ld) isn't the run time shape (%%ld).", + (long)%(value)s, (long)%(expected)s); + %(fail)s; +} + """ % dict( + expected=expected, value=value, **sub + ) + d["self_imshp2"] = self.imshp[2] + if self.bsize is not None: + expected = d["self_bsize"] + value = self.bsize + d[ + "assert_size" + ] += """ +if(%(value)s != %(expected)s){ + PyErr_Format(PyExc_ValueError, + "The hardcoded shape for the batch size (%%ld) " + "isn't the run time shape (%%ld).", + (long)%(value)s, (long)%(expected)s); + %(fail)s; +} + """ % dict( + expected=expected, value=value, **sub + ) + d["self_bsize"] = self.bsize + if self.nkern is not None: + expected = d["self_nkern"] + value = self.nkern + d[ + "assert_size" + ] += """ +if(%(value)s != %(expected)s){ + PyErr_Format(PyExc_ValueError, + "The hardcoded shape for the number of kernels in the filter " + "(%%ld) isn't the run time shape (%%ld).", + (long)%(value)s, (long)%(expected)s); + %(fail)s; +} + """ % dict( + expected=expected, value=value, **sub + ) + d["self_nkern"] = self.nkern + + # Other hard coded stuff only if we have all shapes + if all_shape: + d["self_kshp_logical_r"] = self.kshp_logical[0] + d["self_kshp_logical_c"] = self.kshp_logical[1] + d["self_kshp_logical_stride_r"] = int( + np.ceil(self.kshp_logical[0] / float(self.kshp[0])) + ) + d["self_kshp_logical_stride_c"] = int( + np.ceil(self.kshp_logical[1] / float(self.kshp[1])) + ) + d["self_imshp_logical_r"] = self.imshp_logical[1] + # numpy.B. 1 not 0 + d["self_imshp_logical_c"] = self.imshp_logical[2] + # numpy.B. 
2 not 1 + d["self_imshp_logical_stride_r"] = int( + np.ceil(self.imshp_logical[1] / float(self.imshp[1])) + ) + d["self_imshp_logical_stride_c"] = int( + np.ceil(self.imshp_logical[2] / float(self.imshp[2])) + ) + if self.imshp[0] != 1: + d["affectation"] = "+=" + d["all_shape"] = "1" + d["dim_zz_const"] = "const" + d["dim_zz_affect"] = "" + else: + d["affectation"] = "+=" + d["all_shape"] = "0" + d["dim_zz_const"] = "" + d["dim_zz_affect"] = ( + """ + if (mode == FULL) { + dim_zz[0] = (int)ceil((dim_im[0]+dim_ker0-1)/float(%(self_dx)s)); + dim_zz[1] = (int)ceil((dim_im[1]+dim_ker1-1)/float(%(self_dy)s)); + } else { + dim_zz[0] = (int)ceil((dim_im[0]-dim_ker0+1)/float(%(self_dx)s)); + dim_zz[1] = (int)ceil((dim_im[1]-dim_ker1+1)/float(%(self_dy)s)); + } +""" + % d + ) + d["assert_size"] += ( + """ +// Check the stack size of the filter and images are equals +if(kerns_dim[1] != img2d_dim[1]){ + PyErr_Format(PyExc_ValueError, + "the filter stack size (%%ld) and image stack size (%%ld) differ", + (long)kerns_dim[1], (long)img2d_dim[1]); + %(fail)s; +} + """ + % sub + ) + + if self.kshp_logical_top_aligned: + d["self_kshp_logical_offset_r"] = 0 + d["self_kshp_logical_offset_c"] = 0 + elif all_shape: + rstride = d["self_kshp_logical_stride_r"] + cstride = d["self_kshp_logical_stride_c"] + d["self_kshp_logical_offset_r"] = ( + self.kshp_logical[0] - (self.kshp[0] * rstride) - 1 + rstride + ) % rstride + d["self_kshp_logical_offset_c"] = ( + self.kshp_logical[1] - (self.kshp[1] * cstride) - 1 + cstride + ) % cstride + del rstride, cstride + + if node.inputs[0].type.dtype == "float32": + d["type"] = "float" + elif node.inputs[0].type.dtype == "float64": + d["type"] = "double" + else: + raise NotImplementedError( + f"Type {node.inputs[0].type.dtype} not implemented" + ) + d["gemm"] = "dgemm_" + if d["type"] != "double": + d["gemm"] = "sgemm_" + + if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical: + if self.verbose: + _logger.debug( + "return imshp!=imshp_logical or" + " self.kshp != self.kshp_logical shape version" + ) + return _conv_op_code_a % d + + if self.unroll_patch: + if self.verbose: + _logger.debug("return unroll patch version. all_shape=%s", all_shape) + return _conv_op_code_unroll_patch % d + if (self.unroll_batch is not None and self.unroll_batch > 0) or ( + self.unroll_kern is not None and self.unroll_kern > 0 + ): + assert self.unroll_batch > 0 + assert self.unroll_kern > 0 + if self.verbose: + _logger.debug( + "return unrolled batch (%s) and kern code (%s)", + str(self.unroll_batch), + str(self.unroll_kern), + ) + return gen_conv_code_unroll_batch_kern( + d, self.unroll_batch, self.unroll_kern + ) + + # TODO: should we choose the unroll size automatically with the bigger divisor under 5? 
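+        # Fallback when no specialised template was returned above: the
+        # GEMM-based template only handles 'valid' mode with dx == dy == 0
+        # (mirroring use_blas()); everything else goes through the generic
+        # template.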
+ if self.out_mode == "valid" and self.dx == 0 and self.dy == 0: + if self.verbose: + _logger.debug("return gemm version") + return _conv_op_code_valid_gemm % d + else: + if self.verbose: + _logger.debug("return no gemm version") + return _conv_op_code_a % d + + +_conv_op_code_a = """ +const int mode=%(mode)s; +int typenum=0, typenum_f=0; +PyArrayObject *ain1=NULL, *ain2=NULL; +PyArrayObject *filtersflipped_arr=NULL, *img2d_arr=NULL, *z_arr=NULL; +const %(type)s fill_value = 0; + +int type_im=PyArray_TYPE(%(img2d)s); +int type_ker=PyArray_TYPE(%(filtersflipped)s); + +npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s}; +npy_intp dim_im_phys[2]={%(self_imshp1)s,%(self_imshp2)s}; +npy_intp dim_im_log[2]={%(self_imshp_logical_r)s,%(self_imshp_logical_c)s}; +npy_intp dim_ker_phys[2]={%(self_kshp0)s,%(self_kshp1)s}; +npy_intp dim_ker_log[2]={%(self_kshp_logical_r)s,%(self_kshp_logical_c)s}; + +PyArray_Dims img2d_shape; +npy_intp img2d_dim[4]={1,1,0,0}; +img2d_shape.ptr=img2d_dim; +img2d_shape.len=4; + +PyArray_Dims kerns_shape; +npy_intp kerns_dim[4]={1,1,0,0}; +kerns_shape.ptr=kerns_dim; +kerns_shape.len=4; +PyObject *img2d=NULL, *contig, *filtersflipped=NULL; + + +if(PyArray_NDIM(%(img2d)s)==2){ + img2d_dim[3]=PyArray_DIMS(%(img2d)s)[1]; + img2d_dim[2]=PyArray_DIMS(%(img2d)s)[0]; +}else if(PyArray_NDIM(%(img2d)s)==3){ + img2d_dim[3]=PyArray_DIMS(%(img2d)s)[2]; + img2d_dim[2]=PyArray_DIMS(%(img2d)s)[1]; + img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0]; +}else if(PyArray_NDIM(%(img2d)s)==4){ + img2d_dim[3]=PyArray_DIMS(%(img2d)s)[3]; + img2d_dim[2]=PyArray_DIMS(%(img2d)s)[2]; + img2d_dim[1]=PyArray_DIMS(%(img2d)s)[1]; + img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0]; +}else { + PyErr_SetString(PyExc_ValueError, "img don't have a good shape"); + %(fail)s; +} + +if(PyArray_NDIM(%(filtersflipped)s)==3){ + kerns_dim[3]=PyArray_DIMS(%(filtersflipped)s)[2]; + kerns_dim[2]=PyArray_DIMS(%(filtersflipped)s)[1]; + kerns_dim[0]=PyArray_DIMS(%(filtersflipped)s)[0]; +}else if(PyArray_NDIM(%(filtersflipped)s)==4){ + kerns_dim[3]=PyArray_DIMS(%(filtersflipped)s)[3]; + kerns_dim[2]=PyArray_DIMS(%(filtersflipped)s)[2]; + kerns_dim[1]=PyArray_DIMS(%(filtersflipped)s)[1]; + kerns_dim[0]=PyArray_DIMS(%(filtersflipped)s)[0]; +}else{ + std::stringstream temp; + temp << "nddim="<= dim_im_log[0]){ + // the current row of the kernel is off the image + }else{ + int k = max((int)(pos_n-dim_im_log[1])+1,0); + int max_k=min(pos_n+1,(int)dim_ker_log[1]); + const %(type)s * idx_in=&in[ind0_phys*dim_im_phys[1]]; + for (int ind1_log=pos_n-k; k= PyArray_DIMS(%(z)s)[0]) %(fail)s; + if (kernel_idx >= PyArray_DIMS(%(z)s)[1]) %(fail)s; + if (img_row >= PyArray_DIMS(%(z)s)[2]) %(fail)s; + if (img_col >= PyArray_DIMS(%(z)s)[3]) %(fail)s; + } + z_p[0] += kbuf[img_row * kbufstride + kernel_idx]; + } + } + } + } +} +free(kbuf); +} +Py_XDECREF(img2d); +""" + + +def gen_conv_code_unroll_batch_kern(d, unroll_bsize=1, unroll_ksize=1): + """ + c_code for ConvOp that unroll the batch size loop. 
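+    The batch loop is unrolled by a factor of ``unroll_bsize`` and the
+    kernel loop by ``unroll_ksize``; ConvOp.__init__ adjusts both factors
+    so that they divide the batch size and the number of kernels.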
+ + """ + assert unroll_bsize > 0 and unroll_ksize > 0 + if ( + "unroll_bsize" in d + or "unroll_ksize" in d + or "unroll_iter" in d + or "unroll_biter" in d + or "unroll_kiter" in d + ): + raise ValueError( + "We can't use this dictionary as we will overwrite some of its content" + ) + d = d.copy() + + d["unroll_bsize"] = unroll_bsize + d["unroll_ksize"] = unroll_ksize + + def my_dup(st, size): + s = "" + for i in range(size): + d["unroll_iter"] = i + s += st % d + return s + "\n" + + def my_dup2(st): + s = "" + iter = 0 + for i in range(unroll_bsize): + d["unroll_biter"] = i + for j in range(unroll_ksize): + d["unroll_kiter"] = j + d["unroll_iter"] = iter + iter += 1 + s += st % d + return s + "\n" + + ret = ( + """ +const int mode=%(mode)s; +int typenum=0, typenum_f=0; +PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL, *z_arr=NULL;; +const %(type)s fill_value = 0; + +int type_im=PyArray_TYPE(%(img2d)s); +int type_ker=PyArray_TYPE(%(filtersflipped)s); + +npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s}; +npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s}; +const npy_intp dim_ker0=%(self_kshp0)s; +const npy_intp dim_ker1=%(self_kshp1)s; + +PyArray_Dims img2d_shape; +npy_intp img2d_dim[4]={1,1,0,0}; +img2d_shape.ptr=img2d_dim; +img2d_shape.len=4; + +PyArray_Dims kerns_shape; +npy_intp kerns_dim[4]={1,1,0,0}; +kerns_shape.ptr=kerns_dim; +kerns_shape.len=4; +PyObject *img2d=NULL, *contig, *filtersflipped=NULL; + +if(PyArray_NDIM(%(img2d)s)==2){ + img2d_dim[3]=PyArray_DIMS(%(img2d)s)[1]; + img2d_dim[2]=PyArray_DIMS(%(img2d)s)[0]; +}else if(PyArray_NDIM(%(img2d)s)==3){ + img2d_dim[3]=PyArray_DIMS(%(img2d)s)[2]; + img2d_dim[2]=PyArray_DIMS(%(img2d)s)[1]; + img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0]; +}else if(PyArray_NDIM(%(img2d)s)==4){ + img2d_dim[3]=PyArray_DIMS(%(img2d)s)[3]; + img2d_dim[2]=PyArray_DIMS(%(img2d)s)[2]; + img2d_dim[1]=PyArray_DIMS(%(img2d)s)[1]; + img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0]; +}else { + std::stringstream temp; + temp << "nddim="<= dim_im[0]){ + if(fill_value!=0) + for (int k=0; k < dim_ker1; k++) { +""" + % d + ) + ret += my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;") + ret += ( + """ + } + }else{ + //do the part where kernel is to the right of the img + + int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0); + if(fill_value!=0){ + + for(k=0;k 1) +// We merge the 2 loop into one to make it easier to parallelize on both +// This is the equivalent of those 2 lines. 
+//for(int b=0;b< %(self_bsize)s;b++){ +// for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){ +for(int batch_kern_idx=0; + batch_kern_idx < %(self_bsize)s * %(self_nkern)s; + batch_kern_idx++){ + int b = batch_kern_idx / %(self_nkern)s; + int n_kern = batch_kern_idx %% %(self_nkern)s; + + %(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(z_arr,b,n_kern)); + for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0; + + for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){ + + const %(type)s * __restrict__ in=(%(type)s *)(PyArray_GETPTR2(img2d_arr,b,stack_size)); + const %(type)s * __restrict__ hvals=(%(type)s *)(PyArray_GETPTR2(filtersflipped_arr,n_kern,stack_size)); + + int new_m; + + for (int iter_m=0; iter_m < dim_zz[0]; iter_m++) { + // Reposition index into input image based on requested output size + int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image + if (mode == FULL) new_m = pos_m ; + else new_m = (pos_m+dim_ker0-1); + + for (int iter_n=0; iter_n < dim_zz[1]; iter_n++) { // loop over columns + int pos_n=iter_n*%(self_dy)s; + %(type)s sum=0; + %(type)s sum2=0; + %(type)s sum3=0; + %(type)s sum4=0; + int nb_sum=0; + // Sum over kernel, if index into image is out of bounds + // fill with the value + for (int j=0; j < dim_ker0; j++) { + int ind0 = (new_m-j); + + if(mode==FULL){ + const %(type)s * idx_hvals=&hvals[j*dim_ker1]; + if(ind0 < 0 || ind0 >= dim_im[0]){ + if(fill_value!=0) + for (int k=0; k < dim_ker1; k++) { + sum+= idx_hvals[k] * fill_value; + } + }else{ + //do the part where kernel is to the right of the img + int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0); + if(fill_value!=0){ + + for(k=0;kdim_ker1-1 + && iter_ndim_ker1-1 + && iter_n=0; k--,im_idx++) { + sum+=idx_hvals[k]*idx_in[im_idx]; + sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s]; + sum3+=idx_hvals[k]*idx_in[im_idx+2*%(self_dy)s]; + sum4+=idx_hvals[k]*idx_in[im_idx+3*%(self_dy)s]; + } + }else if(iter_n + 2*%(self_dy)s < dim_zz[1]){ + nb_sum=2; + for (int k=dim_ker1-1,im_idx=pos_n; k >=0; k--,im_idx++) { + sum+=idx_hvals[k]*idx_in[im_idx]; + sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s]; + } + }else{ + nb_sum=1; + for (int k=dim_ker1-1,im_idx=pos_n; k >=0; k--,im_idx++) { + sum+=idx_hvals[k]*idx_in[im_idx]; + } + } + }//else valid mode + }//for j + switch(nb_sum){ + case 4: out[iter_m*dim_zz[1]+iter_n+3] %(affectation)s sum4; + case 3: out[iter_m*dim_zz[1]+iter_n+2] %(affectation)s sum3; + case 2: out[iter_m*dim_zz[1]+iter_n+1] %(affectation)s sum2; + case 1: out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum; + } + iter_n+=nb_sum-1; + }//for iter_n + }//for iter_m + }//for stack_size +}//for b and n_kern + +Py_XDECREF(img2d); +Py_XDECREF(filtersflipped); +""" diff --git a/pytensor/tensor/conv/conv3d2d.py b/pytensor/tensor/conv/conv3d2d.py new file mode 100644 index 0000000000..00a653013e --- /dev/null +++ b/pytensor/tensor/conv/conv3d2d.py @@ -0,0 +1,329 @@ +import pytensor +from pytensor import tensor as at +from pytensor.gradient import DisconnectedType +from pytensor.graph.basic import Apply +from pytensor.graph.op import Op +from pytensor.graph.rewriting.basic import ( + WalkingGraphRewriter, + copy_stack_trace, + node_rewriter, +) + + +def get_diagonal_subtensor_view(x, i0, i1): + """ + Helper function for DiagonalSubtensor and IncDiagonalSubtensor. + + Notes + ----- + It returns a partial view of x, not a partial copy. + + """ + # We have to cast i0 and i0 to int because python + # do not support indexing with 0-dim, 'int*' ndarrays. 
+ i0 = int(i0) + i1 = int(i1) + if x.shape[i0] < x.shape[i1]: + raise NotImplementedError("is this allowed?") + idx = [slice(None)] * x.ndim + idx[i0] = slice(x.shape[i1] - 1, None, None) + xview = x.__getitem__(tuple(idx)) + strides = list(xview.strides) + if x.shape[i1] != 1: + strides[i1] -= strides[i0] + xview.strides = strides + return xview + + +class DiagonalSubtensor(Op): + """ + Return a form a nd diagonal subtensor. + + Parameters + ---------- + x + n-d tensor + i0 + Axis index in x + i1 + Axis index in x + + + Extended summary + ---------------- + ``x`` is some n-dimensional tensor, but this Op only deals with a + matrix-shaped slice, using axes i0 and i1. Without loss of + generality, suppose that ``i0`` picks out our ``row`` dimension, + and i1 the ``column`` dimension. + + So the relevant part of ``x`` is some matrix ``u``. Suppose it has 7 rows + and 4 columns:: + + [ 0 0 0 0 ] + [ 0 0 0 0 ] + [ 0 0 0 0 ] + [ 0 0 0 0 ] + [ 0 0 0 0 ] + [ 0 0 0 0 ] + + The view returned by this function is also a matrix. It's a thick, + diagonal ``stripe`` across u that discards the lower left triangle + and the upper right triangle: + + [ x 0 0 0 ] + [ x x 0 0 ] + [ x x x 0 ] + [ 0 x x x ] + [ 0 0 x x ] + [ 0 0 0 x ] + + In this case the return value would be this view of shape 3x4. The + returned view has the same number of dimensions as the input + ``x``, and the only difference is that the shape along dimension + ``i0`` has been reduced by ``shape[i1] - 1`` because of the + triangles that got chopped out. + + The NotImplementedError is meant to catch the case where shape[i0] + is too small for the stripe to reach across the matrix, in which + case it's not clear what this function should do. Maybe always + raise an error. I'd look back to the call site in the Conv3D to + see what's necessary at that point. + + """ + + __props__ = ("inplace",) + + def __str__(self): + if self.inplace: + return "%s{inplace}" % self.__class__.__name__ + return f"{self.__class__.__name__}" + + def __init__(self, inplace=False): + self.inplace = inplace + if inplace: + self.view_map = {0: [0]} + + def make_node(self, x, i0, i1): + _i0 = at.as_tensor_variable(i0) + _i1 = at.as_tensor_variable(i1) + # TODO: We could produce a more precise static shape output type + type_shape = (1 if shape == 1 else None for shape in x.type.shape) + out_type = at.TensorType(x.type.dtype, shape=type_shape) + return Apply(self, [x, _i0, _i1], [out_type()]) + + def perform(self, node, inputs, output_storage): + xview = get_diagonal_subtensor_view(*inputs) + if self.inplace: + output_storage[0][0] = xview + else: + output_storage[0][0] = xview.copy() + + def grad(self, inputs, g_outputs): + z = at.zeros_like(inputs[0]) + gx = inc_diagonal_subtensor(z, inputs[1], inputs[2], g_outputs[0]) + return [gx, DisconnectedType()(), DisconnectedType()()] + + def connection_pattern(self, node): + rval = [[True], [False], [False]] + return rval + + +diagonal_subtensor = DiagonalSubtensor(False) + + +class IncDiagonalSubtensor(Op): + """ + The gradient of DiagonalSubtensor. 
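+    It adds ``amt`` into the diagonal-band view of ``x`` selected by axes
+    ``i0`` and ``i1``, copying ``x`` first unless ``inplace=True``.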
+ + """ + + __props__ = ("inplace",) + + def __str__(self): + if self.inplace: + return "%s{inplace}" % self.__class__.__name__ + return f"{self.__class__.__name__}" + + def __init__(self, inplace=False): + self.inplace = inplace + if inplace: + self.destroy_map = {0: [0]} + + def make_node(self, x, i0, i1, amt): + _i0 = at.as_tensor_variable(i0) + _i1 = at.as_tensor_variable(i1) + return Apply(self, [x, _i0, _i1, amt], [x.type()]) + + def perform(self, node, inputs, output_storage): + x, i0, i1, amt = inputs + if not self.inplace: + x = x.copy() + xview = get_diagonal_subtensor_view(x, i0, i1) + xview += amt + output_storage[0][0] = x + + def grad(self, inputs, g_outputs): + x, i0, i1, amt = inputs + gy = g_outputs[0] + return [ + gy, + DisconnectedType()(), + DisconnectedType()(), + diagonal_subtensor(gy, i0, i1), + ] + + def connection_pattern(self, node): + rval = [[True], [False], [False], [True]] + return rval + + +inc_diagonal_subtensor = IncDiagonalSubtensor(False) + + +def conv3d( + signals, filters, signals_shape=None, filters_shape=None, border_mode="valid" +): + """ + Convolve spatio-temporal filters with a movie. + + It flips the filters. + + Parameters + ---------- + signals + Timeseries of images whose pixels have color channels. + Shape: [Ns, Ts, C, Hs, Ws]. + filters + Spatio-temporal filters. + Shape: [Nf, Tf, C, Hf, Wf]. + signals_shape + None or a tuple/list with the shape of signals. + filters_shape + None or a tuple/list with the shape of filters. + border_mode + One of 'valid', 'full' or 'half'. + + Notes + ----- + Another way to define signals: (batch, time, in channel, row, column) + Another way to define filters: (out channel,time,in channel, row, column) + + See Also + -------- + Someone made a script that shows how to swap the axes between + both 3d convolution implementations in PyTensor. 
See the last + `attachment `_ + + """ + + if isinstance(border_mode, str): + border_mode = (border_mode, border_mode, border_mode) + + if signals_shape is None: + _signals_shape_5d = signals.shape + else: + _signals_shape_5d = signals_shape + + if filters_shape is None: + _filters_shape_5d = filters.shape + else: + _filters_shape_5d = filters_shape + + Ns, Ts, C, Hs, Ws = _signals_shape_5d + Nf, Tf, C, Hf, Wf = _filters_shape_5d + + _signals_shape_4d = (Ns * Ts, C, Hs, Ws) + _filters_shape_4d = (Nf * Tf, C, Hf, Wf) + + if border_mode[1] != border_mode[2]: + raise NotImplementedError("height and width bordermodes must match") + conv2d_signal_shape = _signals_shape_4d + conv2d_filter_shape = _filters_shape_4d + if signals_shape is None: + conv2d_signal_shape = None + if filters_shape is None: + conv2d_filter_shape = None + + out_4d = pytensor.tensor.conv.conv2d( + signals.reshape(_signals_shape_4d), + filters.reshape(_filters_shape_4d), + input_shape=conv2d_signal_shape, + filter_shape=conv2d_filter_shape, + border_mode=border_mode[1], + ) # ignoring border_mode[2] + + # compute the intended output size + if border_mode[1] == "valid": + Hout = Hs - Hf + 1 + Wout = Ws - Wf + 1 + elif border_mode[1] == "full": + Hout = Hs + Hf - 1 + Wout = Ws + Wf - 1 + elif border_mode[1] == "half": + Hout = Hs - (Hf % 2) + 1 + Wout = Ws - (Wf % 2) + 1 + elif border_mode[1] == "same": + raise NotImplementedError() + else: + raise ValueError("invalid border mode", border_mode[1]) + + # reshape the temporary output to restore its original size + out_tmp = out_4d.reshape((Ns, Ts, Nf, Tf, Hout, Wout)) + + # now sum out along the Tf to get the output + # but we have to sum on a diagonal through the Tf and Ts submatrix. + if Tf == 1: + # for Tf==1, no sum along Tf, the Ts-axis of the output is unchanged! 
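+        # (the reshape below simply drops the singleton Tf axis:
+        #  (Ns, Ts, Nf, 1, Hout, Wout) -> (Ns, Ts, Nf, Hout, Wout))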
+ out_5d = out_tmp.reshape((Ns, Ts, Nf, Hout, Wout)) + else: + # for some types of convolution, pad out_tmp with zeros + if border_mode[0] == "valid": + Tpad = 0 + elif border_mode[0] == "full": + Tpad = Tf - 1 + elif border_mode[0] == "half": + Tpad = Tf // 2 + elif border_mode[0] == "same": + raise NotImplementedError() + else: + raise ValueError("invalid border mode", border_mode[0]) + + if Tpad == 0: + out_5d = diagonal_subtensor(out_tmp, 1, 3).sum(axis=3) + else: + # pad out_tmp with zeros before summing over the diagonal + out_tmp_padded = at.zeros( + dtype=out_tmp.dtype, shape=(Ns, Ts + 2 * Tpad, Nf, Tf, Hout, Wout) + ) + out_tmp_padded = pytensor.tensor.subtensor.set_subtensor( + out_tmp_padded[:, Tpad : (Ts + Tpad), :, :, :, :], out_tmp + ) + out_5d = diagonal_subtensor(out_tmp_padded, 1, 3).sum(axis=3) + + return out_5d + + +@node_rewriter([DiagonalSubtensor, IncDiagonalSubtensor]) +def local_inplace_DiagonalSubtensor(fgraph, node): + """Also work for IncDiagonalSubtensor.""" + if ( + isinstance(node.op, (DiagonalSubtensor, IncDiagonalSubtensor)) + and not node.op.inplace + ): + new_op = node.op.__class__(inplace=True) + new_node = new_op(*node.inputs) + copy_stack_trace(node.outputs[0], new_node) + return [new_node] + return False + + +pytensor.compile.optdb.register( + "local_inplace_DiagonalSubtensor", + WalkingGraphRewriter( + local_inplace_DiagonalSubtensor, + failure_callback=WalkingGraphRewriter.warn_inplace, + ), + "fast_run", + "inplace", + position=60, +) diff --git a/tests/tensor/conv/c_conv_corr_ref.py b/pytensor/tensor/conv/corr.py similarity index 100% rename from tests/tensor/conv/c_conv_corr_ref.py rename to pytensor/tensor/conv/corr.py diff --git a/tests/tensor/conv/c_conv3d_corr3d_ref.py b/pytensor/tensor/conv/corr3d.py similarity index 100% rename from tests/tensor/conv/c_conv3d_corr3d_ref.py rename to pytensor/tensor/conv/corr3d.py diff --git a/pytensor/tensor/conv/rewriting.py b/pytensor/tensor/conv/rewriting.py new file mode 100644 index 0000000000..b56efd8ea6 --- /dev/null +++ b/pytensor/tensor/conv/rewriting.py @@ -0,0 +1,546 @@ +""" +Optimizations addressing the ops in nnet root directory +""" + +import pytensor +from pytensor.compile import optdb +from pytensor.configdefaults import config +from pytensor.graph.rewriting.basic import ( + MetaNodeRewriterSkip, + copy_stack_trace, + in2out, + node_rewriter, +) +from pytensor.tensor.conv.abstract_conv import ( + AbstractConv2d, + AbstractConv2d_gradInputs, + AbstractConv2d_gradWeights, + AbstractConv3d, + AbstractConv3d_gradInputs, + AbstractConv3d_gradWeights, + get_conv_output_shape, +) + +# Cpu implementation +from pytensor.tensor.conv.conv import ConvOp, conv2d +from pytensor.tensor.conv.corr import CorrMM, CorrMM_gradInputs, CorrMM_gradWeights +from pytensor.tensor.conv.corr3d import ( + Corr3dMM, + Corr3dMMGradInputs, + Corr3dMMGradWeights, +) +from pytensor.tensor.type import TensorType + + +# Conv opts +@node_rewriter([AbstractConv2d]) +def local_abstractconv_gemm(fgraph, node): + # If config.blas__ldflags is empty, PyTensor will use + # a NumPy C implementation of [sd]gemm_. 
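+    # Skip this rewrite when no C++ compiler is available or for float16
+    # inputs: the GEMM-based CorrMM path is compiled C code and float16 is
+    # not supported on the CPU.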
+ if config.cxx == "" or node.inputs[0].dtype == "float16": + return + if not isinstance(node.op, AbstractConv2d): + return None + img, kern = node.inputs + if not isinstance(img.type, TensorType) or not isinstance(kern.type, TensorType): + return None + + # need to flip the kernel if necessary + if node.op.filter_flip: + flip = (slice(None),) * (kern.ndim - 2) + (slice(None, None, -1),) * 2 + kern = kern[flip] + rval = CorrMM( + border_mode=node.op.border_mode, + subsample=node.op.subsample, + filter_dilation=node.op.filter_dilation, + num_groups=node.op.num_groups, + unshared=node.op.unshared, + )(img, kern) + copy_stack_trace(node.outputs[0], rval) + + return [rval] + + +@node_rewriter([AbstractConv3d]) +def local_abstractconv3d_gemm(fgraph, node): + # If config.blas__ldflags is empty, PyTensor will use + # a NumPy C implementation of [sd]gemm_. + if config.cxx == "" or node.inputs[0].dtype == "float16": + return + if not isinstance(node.op, AbstractConv3d): + return None + img, kern = node.inputs + if not isinstance(img.type, TensorType) or not isinstance(kern.type, TensorType): + return None + + # need to flip the kernel if necessary + if node.op.filter_flip: + kern = kern[:, :, ::-1, ::-1, ::-1] + rval = Corr3dMM( + border_mode=node.op.border_mode, + subsample=node.op.subsample, + filter_dilation=node.op.filter_dilation, + num_groups=node.op.num_groups, + )(img, kern) + copy_stack_trace(node.outputs[0], rval) + + return [rval] + + +@node_rewriter([AbstractConv2d_gradWeights]) +def local_abstractconv_gradweight_gemm(fgraph, node): + # If config.blas__ldflags is empty, PyTensor will use + # a NumPy C implementation of [sd]gemm_. + if config.cxx == "" or node.inputs[0].dtype == "float16": + return + if not isinstance(node.op, AbstractConv2d_gradWeights): + return None + img, topgrad, shape = node.inputs + if not isinstance(img.type, TensorType) or not isinstance(topgrad.type, TensorType): + return None + + rval = CorrMM_gradWeights( + border_mode=node.op.border_mode, + subsample=node.op.subsample, + filter_dilation=node.op.filter_dilation, + num_groups=node.op.num_groups, + unshared=node.op.unshared, + )(img, topgrad, shape) + copy_stack_trace(node.outputs[0], rval) + + # need to flip the kernel if necessary + if node.op.filter_flip: + flip = (slice(None),) * (rval.ndim - 2) + (slice(None, None, -1),) * 2 + rval = rval[flip] + copy_stack_trace(node.outputs[0], rval) + + return [rval] + + +@node_rewriter([AbstractConv3d_gradWeights]) +def local_abstractconv3d_gradweight_gemm(fgraph, node): + # If config.blas__ldflags is empty, PyTensor will use + # a NumPy C implementation of [sd]gemm_. + if config.cxx == "" or node.inputs[0].dtype == "float16": + return + if not isinstance(node.op, AbstractConv3d_gradWeights): + return None + img, topgrad, shape = node.inputs + if not isinstance(img.type, TensorType) or not isinstance(topgrad.type, TensorType): + return None + + rval = Corr3dMMGradWeights( + border_mode=node.op.border_mode, + subsample=node.op.subsample, + filter_dilation=node.op.filter_dilation, + num_groups=node.op.num_groups, + )(img, topgrad, shape) + copy_stack_trace(node.outputs[0], rval) + + # need to flip the kernel if necessary + if node.op.filter_flip: + rval = rval[:, :, ::-1, ::-1, ::-1] + copy_stack_trace(node.outputs[0], rval) + + return [rval] + + +@node_rewriter([AbstractConv2d_gradInputs]) +def local_abstractconv_gradinputs_gemm(fgraph, node): + # If config.blas__ldflags is empty, PyTensor will use + # a NumPy C implementation of [sd]gemm_. 
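+    # Same guards as the forward rewrite; when ``filter_flip`` is set, the
+    # kernel is flipped here as well before being handed to CorrMM_gradInputs.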
+ if config.cxx == "" or node.inputs[0].dtype == "float16": + return + if not isinstance(node.op, AbstractConv2d_gradInputs): + return None + kern, topgrad, shape = node.inputs + if not isinstance(kern.type, TensorType) or not isinstance( + topgrad.type, TensorType + ): + return None + + # need to flip the kernel if necessary + if node.op.filter_flip: + flip = (slice(None),) * (kern.ndim - 2) + (slice(None, None, -1),) * 2 + kern = kern[flip] + rval = CorrMM_gradInputs( + border_mode=node.op.border_mode, + subsample=node.op.subsample, + filter_dilation=node.op.filter_dilation, + num_groups=node.op.num_groups, + unshared=node.op.unshared, + )(kern, topgrad, shape) + copy_stack_trace(node.outputs[0], rval) + + return [rval] + + +@node_rewriter([AbstractConv3d_gradInputs]) +def local_abstractconv3d_gradinputs_gemm(fgraph, node): + # If config.blas__ldflags is empty, PyTensor will use + # a NumPy C implementation of [sd]gemm_. + if config.cxx == "" or node.inputs[0].dtype == "float16": + return + if not isinstance(node.op, AbstractConv3d_gradInputs): + return None + kern, topgrad, shape = node.inputs + if not isinstance(kern.type, TensorType) or not isinstance( + topgrad.type, TensorType + ): + return None + + # need to flip the kernel if necessary + if node.op.filter_flip: + kern = kern[:, :, ::-1, ::-1, ::-1] + rval = Corr3dMMGradInputs( + border_mode=node.op.border_mode, + subsample=node.op.subsample, + filter_dilation=node.op.filter_dilation, + num_groups=node.op.num_groups, + )(kern, topgrad, shape) + copy_stack_trace(node.outputs[0], rval) + + return [rval] + + +@node_rewriter([AbstractConv2d]) +def local_conv2d_cpu(fgraph, node): + if not isinstance(node.op, AbstractConv2d) or node.inputs[0].dtype == "float16": + return None + + img, kern = node.inputs + if not isinstance(img.type, TensorType) or not isinstance(kern.type, TensorType): + return None + if node.op.border_mode not in ("full", "valid"): + return None + if not node.op.filter_flip: + # Not tested yet + return None + if node.op.num_groups > 1 or node.op.unshared: + return None + if node.op.filter_dilation != (1, 1): + return None + + rval = conv2d( + img, + kern, + node.op.imshp, + node.op.kshp, + border_mode=node.op.border_mode, + subsample=node.op.subsample, + ) + + copy_stack_trace(node.outputs[0], rval) + return [rval] + + +@node_rewriter([AbstractConv2d_gradWeights]) +def local_conv2d_gradweight_cpu(fgraph, node): + if ( + not isinstance(node.op, AbstractConv2d_gradWeights) + or node.inputs[0].dtype == "float16" + ): + return None + + img, topgrad, shape = node.inputs + + if not isinstance(img.type, TensorType) or not isinstance(topgrad.type, TensorType): + return None + if node.op.border_mode not in ("full", "valid"): + return None + if not node.op.filter_flip: + # Not tested yet + return + if node.op.num_groups > 1 or node.op.unshared: + return None + + if node.op.border_mode == "valid" and (node.op.subsample != (1, 1)): + return None + + dx, dy = node.op.subsample + if dx not in (1, 2) or dy not in (1, 2): + # Not implemented in the gradient of ConvOp + return None + + if node.op.imshp is None: + op_imshp = (None, None, None, None) + else: + op_imshp = node.op.imshp + + if node.op.kshp is None: + op_kshp = (None, None, None, None) + else: + op_kshp = node.op.kshp + + if None in op_imshp or None in op_kshp: + if (dx, dy) != (1, 1): + # We cannot infer the shapes + return None + + # Determine gradient on kernels + assert len(op_imshp) == 4 and len(op_kshp) == 4 + + outshp = get_conv_output_shape( + op_imshp, + op_kshp, + 
node.op.border_mode, + node.op.subsample, + node.op.filter_dilation, + )[2:] + fulloutshp = get_conv_output_shape(op_imshp, op_kshp, node.op.border_mode, (1, 1))[ + 2: + ] + + newimg = img.dimshuffle((1, 0, 2, 3)) + newtopgrad = topgrad.dimshuffle((1, 0, 2, 3)) + + if node.op.border_mode == "valid": + (img, filters) = (newimg, newtopgrad) + kshp_logical = fulloutshp + kshp_logical_top_aligned = False + imshp_logical = None + (bsize, nkern) = (op_imshp[1], op_kshp[0]) + imshp = (op_imshp[0], op_imshp[2], op_imshp[3]) + kshp = outshp + elif node.op.border_mode == "full": + (img, filters) = (newtopgrad, newimg) + kshp_logical = None + kshp_logical_top_aligned = True + imshp_logical = (op_imshp[0], fulloutshp[0], fulloutshp[1]) + (bsize, nkern) = (op_kshp[0], op_imshp[1]) + imshp = (op_imshp[0], outshp[0], outshp[1]) + kshp = op_imshp[2:] + else: + raise NotImplementedError("Only [full,valid] modes are currently supported.") + + # Flip the kernels + filters = filters[:, :, ::-1, ::-1] + + dw = ConvOp( + imshp, + kshp, + nkern, + bsize, + 1, + 1, + output_mode="valid", + unroll_batch=None, + unroll_kern=None, + unroll_patch=None, + imshp_logical=imshp_logical, + kshp_logical=kshp_logical, + kshp_logical_top_aligned=kshp_logical_top_aligned, + direction_hint="bprop weights", + ) + res = dw(img, filters) + copy_stack_trace(node.outputs[0], res) + + if node.op.border_mode == "valid": + res = res.dimshuffle((1, 0, 2, 3)) + res = res[:, :, ::-1, ::-1] + copy_stack_trace(node.outputs[0], res) + + return [res] + + +@node_rewriter([AbstractConv2d_gradInputs]) +def local_conv2d_gradinputs_cpu(fgraph, node): + if ( + not isinstance(node.op, AbstractConv2d_gradInputs) + or node.inputs[0].dtype == "float16" + ): + return None + + kern, topgrad, shape = node.inputs + + if not isinstance(kern.type, TensorType) or not isinstance( + topgrad.type, TensorType + ): + return None + if node.op.border_mode not in ("full", "valid"): + return None + if not node.op.filter_flip: + # Not tested yet + return None + if node.op.num_groups > 1 or node.op.unshared: + return None + + # Conv 3d implementation, needed when subsample > 2 + if node.op.border_mode == "valid" and node.op.subsample != (1, 1): + # The op don't support that anymore. 
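+        # (returning False here, like returning None, signals that this
+        # rewrite does not apply)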
+ return False + + # Conv2d Implementation + dx, dy = node.op.subsample + if dx not in (1, 2) or dy not in (1, 2): + # Not implemented in the gradient of ConvOp + return None + + if node.op.imshp is None: + op_imshp = (None, None, None, None) + else: + op_imshp = node.op.imshp + + if node.op.kshp is None: + op_kshp = (None, None, None, None) + else: + op_kshp = node.op.kshp + + if None in op_imshp or None in op_kshp: + if (dx, dy) != (1, 1): + return None + + mode = "valid" + if node.op.border_mode != "full": + mode = "full" + filters = kern.dimshuffle((1, 0, 2, 3)) + filters = filters[:, :, ::-1, ::-1] + + outshp = get_conv_output_shape( + op_imshp, + op_kshp, + node.op.border_mode, + node.op.subsample, + node.op.filter_dilation, + )[2:] + fulloutshp = get_conv_output_shape(op_imshp, op_kshp, node.op.border_mode, (1, 1))[ + 2: + ] + + nkern = op_imshp[1] + imshp = (op_kshp[0], outshp[0], outshp[1]) + imshp_logical = (op_kshp[0], fulloutshp[0], fulloutshp[1]) + din = ConvOp( + imshp, + op_kshp[2:], + nkern, + op_imshp[0], + 1, + 1, + output_mode=mode, + unroll_batch=None, + unroll_kern=None, + unroll_patch=None, + imshp_logical=imshp_logical, + kshp_logical=None, + version=-1, + direction_hint="bprop inputs", + ) + din = din(topgrad, filters) + copy_stack_trace(node.outputs[0], din) + return [din] + + +# Register Cpu Optimization +conv_groupopt = pytensor.graph.rewriting.db.LocalGroupDB() +conv_groupopt.__name__ = "conv_opts" + +# GEMM-based convolution +# It can be disabled by excluding 'conv_gemm'. +conv_groupopt.register( + "local_abstractconv_gemm", + local_abstractconv_gemm, + "conv_gemm", + "fast_compile", + "fast_run", + position=30, +) +conv_groupopt.register( + "local_abstractconv_gradweight_gemm", + local_abstractconv_gradweight_gemm, + "conv_gemm", + "fast_compile", + "fast_run", + position=30, +) +conv_groupopt.register( + "local_abstractconv_gradinputs_gemm", + local_abstractconv_gradinputs_gemm, + "conv_gemm", + "fast_compile", + "fast_run", + position=30, +) +conv_groupopt.register( + "local_abstractconv3d_gemm", + local_abstractconv3d_gemm, + "conv_gemm", + "fast_compile", + "fast_run", + position=30, +) +conv_groupopt.register( + "local_abstractconv3d_gradweight_gemm", + local_abstractconv3d_gradweight_gemm, + "conv_gemm", + "fast_compile", + "fast_run", + position=30, +) +conv_groupopt.register( + "local_abstractconv3d_gradinputs_gemm", + local_abstractconv3d_gradinputs_gemm, + "conv_gemm", + "fast_compile", + "fast_run", + position=30, +) + +# Legacy convolution +conv_groupopt.register( + "local_conv2d_cpu", local_conv2d_cpu, "fast_compile", "fast_run", position=40 +) +conv_groupopt.register( + "local_conv2d_gradweight_cpu", + local_conv2d_gradweight_cpu, + "fast_compile", + "fast_run", + position=40, +) +conv_groupopt.register( + "local_conv2d_gradinputs_cpu", + local_conv2d_gradinputs_cpu, + "fast_compile", + "fast_run", + position=40, +) + + +# Verify that no AbstractConv are present in the graph +@node_rewriter( + [ + AbstractConv2d, + AbstractConv2d_gradWeights, + AbstractConv2d_gradInputs, + AbstractConv3d, + AbstractConv3d_gradWeights, + AbstractConv3d_gradInputs, + ] +) +def local_abstractconv_check(fgraph, node): + if isinstance( + node.op, + ( + AbstractConv2d, + AbstractConv2d_gradWeights, + AbstractConv2d_gradInputs, + AbstractConv3d, + AbstractConv3d_gradWeights, + AbstractConv3d_gradInputs, + ), + ): + raise MetaNodeRewriterSkip( + f"{node.op.__class__.__name__} PyTensor rewriting failed: there is no implementation " + "available supporting the requested 
options. If on CPU, " + "do you have a BLAS library installed PyTensor can link against? " + "On the CPU we do not support float16." + ) + + +optdb.register( + "AbstractConvCheck", + in2out(local_abstractconv_check, name="AbstractConvCheck"), + "fast_compile", + "fast_run", + position=48.7, +) diff --git a/pytensor/tensor/signal/__init__.py b/pytensor/tensor/signal/__init__.py new file mode 100644 index 0000000000..ed39208de5 --- /dev/null +++ b/pytensor/tensor/signal/__init__.py @@ -0,0 +1,4 @@ +from pytensor.tensor.signal.conv import convolve + + +__all__ = ("convolve",) diff --git a/pytensor/tensor/signal/conv.py b/pytensor/tensor/signal/conv.py new file mode 100644 index 0000000000..62164c6dad --- /dev/null +++ b/pytensor/tensor/signal/conv.py @@ -0,0 +1,111 @@ +from typing import TYPE_CHECKING, Literal + +from numpy import convolve as numpy_convolve + +from pytensor.graph import Apply, Op +from pytensor.scalar.basic import upcast +from pytensor.tensor.basic import as_tensor_variable, join, zeros +from pytensor.tensor.blockwise import Blockwise +from pytensor.tensor.math import maximum, minimum +from pytensor.tensor.type import vector +from pytensor.tensor.variable import TensorVariable + + +if TYPE_CHECKING: + from pytensor.tensor import TensorLike + + +class Conv1d(Op): + __props__ = ("mode",) + gufunc_signature = "(n),(k)->(o)" + + def __init__(self, mode: Literal["full", "valid"] = "full"): + if mode not in ("full", "valid"): + raise ValueError(f"Invalid mode: {mode}") + self.mode = mode + + def make_node(self, data, kernel): + data = as_tensor_variable(data) + kernel = as_tensor_variable(kernel) + + assert data.ndim == 1 + assert kernel.ndim == 1 + + dtype = upcast(data.dtype, kernel.dtype) + + n = data.type.shape[0] + k = kernel.type.shape[0] + + if n is None or k is None: + out_shape = (None,) + elif self.mode == "full": + out_shape = (n + k - 1,) + else: # mode == "valid": + out_shape = (max(n, k) - min(n, k) + 1,) + + out = vector(dtype=dtype, shape=out_shape) + return Apply(self, [data, kernel], [out]) + + def perform(self, node, inputs, outputs): + data, kernel = inputs + # We use numpy_convolve as that's what scipy would use if method="direct" was passed. + # And mode != "same", which this Op doesn't cover anyway. + outputs[0][0] = numpy_convolve(data, kernel, mode=self.mode) + + def infer_shape(self, fgraph, node, shapes): + data_shape, kernel_shape = shapes + n = data_shape[0] + k = kernel_shape[0] + if self.mode == "full": + shape = n + k - 1 + else: # mode == "valid": + shape = maximum(n, k) - minimum(n, k) + 1 + return [[shape]] + + def L_op(self, inputs, outputs, output_grads): + data, kernel = inputs + [grad] = output_grads + + if self.mode == "full": + valid_conv = type(self)(mode="valid") + data_bar = valid_conv(grad, kernel[::-1]) + kernel_bar = valid_conv(grad, data[::-1]) + + else: # mode == "valid": + full_conv = type(self)(mode="full") + n = data.shape[0] + k = kernel.shape[0] + kmn = maximum(0, k - n) + nkm = maximum(0, n - k) + # We need mode="full" if k >= n else "valid" for data_bar (opposite for kernel_bar), but mode is not symbolic. + # Instead, we always use mode="full" and slice the result so it behaves like "valid" for the input that's shorter. 
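+            # Rough shape check (assuming n >= k): ``grad`` has length
+            # n - k + 1, so ``full_conv(grad, kernel[::-1])`` has length n and
+            # kmn == 0 keeps all of it, while ``full_conv(grad, data[::-1])``
+            # has length 2*n - k and trimming nkm == n - k entries from each
+            # side leaves the k entries of ``kernel_bar``; the k > n case is
+            # symmetric.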
+ data_bar = full_conv(grad, kernel[::-1]) + data_bar = data_bar[kmn : data_bar.shape[0] - kmn] + kernel_bar = full_conv(grad, data[::-1]) + kernel_bar = kernel_bar[nkm : kernel_bar.shape[0] - nkm] + + return [data_bar, kernel_bar] + + +def convolve( + data: "TensorLike", + kernel: "TensorLike", + mode: Literal["full", "valid", "same"] = "full", +) -> TensorVariable: + data = as_tensor_variable(data) + kernel = as_tensor_variable(kernel) + + if mode == "same": + # We implement "same" as "valid" with padded data. + data_batch_shape = tuple(data.shape)[:-1] + zeros_left = kernel.shape[0] // 2 + zeros_right = (kernel.shape[0] - 1) // 2 + data = join( + -1, + zeros((*data_batch_shape, zeros_left), dtype=kernel.dtype), + data, + zeros((*data_batch_shape, zeros_right), dtype=kernel.dtype), + ) + mode = "valid" + + return Blockwise(Conv1d(mode=mode))(data, kernel) diff --git a/tests/link/jax/signal/__init__.py b/tests/link/jax/signal/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/link/jax/signal/conv.py b/tests/link/jax/signal/conv.py new file mode 100644 index 0000000000..ad2e42e3ff --- /dev/null +++ b/tests/link/jax/signal/conv.py @@ -0,0 +1,18 @@ +import numpy as np +import pytest +from link.jax.test_basic import compare_jax_and_py + +from pytensor.tensor import matrix +from pytensor.tensor.signal import convolve + + +@pytest.mark.parametrize("mode", ["full", "valid", "same"]) +def test_conv(mode): + x = matrix("x") + y = matrix("y") + out = convolve(x[None], y[:, None], mode=mode) + + rng = np.random.default_rng() + test_x = rng.normal(size=(3, 5)) + test_y = rng.normal(size=(7, 11)) + compare_jax_and_py([x, y], out, [test_x, test_y]) diff --git a/tests/link/numba/signal/test_signal.py b/tests/link/numba/signal/test_signal.py new file mode 100644 index 0000000000..f9df39d157 --- /dev/null +++ b/tests/link/numba/signal/test_signal.py @@ -0,0 +1,22 @@ +import numpy as np +import pytest +from link.numba.test_basic import compare_numba_and_py + +from pytensor.tensor import matrix +from pytensor.tensor.signal import convolve + + +pytestmark = pytest.mark.filterwarnings("error") + + +@pytest.mark.parametrize("mode", ["full", "valid", "same"]) +def test_conv(mode): + x = matrix("x") + y = matrix("y") + out = convolve(x[None], y[:, None], mode=mode) + + rng = np.random.default_rng() + test_x = rng.normal(size=(3, 5)) + test_y = rng.normal(size=(7, 11)) + # Object mode is not supported for numba + compare_numba_and_py([x, y], out, [test_x, test_y], eval_obj_mode=False) diff --git a/tests/tensor/conv/speed_test_conv.py b/tests/tensor/conv/speed_test_conv.py new file mode 100644 index 0000000000..71805c5142 --- /dev/null +++ b/tests/tensor/conv/speed_test_conv.py @@ -0,0 +1,451 @@ +import time + +import numpy as np + +from pytensor import function +from pytensor.compile.mode import Mode +from pytensor.tensor.conv.conv import ConvOp +from pytensor.tensor.type import TensorType, dmatrix + + +def flip(kern, kshp): + "flip the kernel as scipy.convolv2d do it flipped." 
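+    # Effectively equivalent to ``kern[..., ::-1, ::-1]`` (reversing the two
+    # trailing spatial axes), written out with explicit loops.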
+ flip = np.zeros(kern.shape) + if len(kern.shape) == 2: + kern = kern.reshape(-1) + it = reversed(kern) + for i in range(kshp[0]): + for j in range(kshp[1]): + flip[i, j] = next(it) + elif len(kern.shape) == 3: + kern = kern.reshape(kern.shape[0], -1) + for k in range(kern.shape[0]): + it = reversed(kern[k, :]) + for i in range(kshp[0]): + for j in range(kshp[1]): + flip[k, i, j] = next(it) + elif len(kern.shape) == 4: + kern = kern.reshape(kern.shape[0], kern.shape[1], -1) + for k in range(kern.shape[0]): + for m in range(kern.shape[1]): + it = reversed(kern[k, m, :]) + for i in range(kshp[0]): + for j in range(kshp[1]): + flip[k, m, i, j] = next(it) + else: + raise NotImplementedError() + return flip + + +global_rng = np.random.default_rng(3423489) + +dmatrix4 = TensorType("float64", shape=(None, None, None, None)) + + +def exec_multilayer_conv_nnet_old( + conv_mode, + ss, + bsize, + imshp, + kshps, + nkerns, + unroll_batch=0, + unroll_kern=0, + img=None, + validate=True, + conv_op_py=False, + do_print=True, + repeat=1, + unroll_patch=False, + unroll_patch_size=False, + verbose=0, +): + if img is None: + img = dmatrix() + + # build actual input images + imgval = global_rng.random((bsize, imshp[0], imshp[1], imshp[2])) + + a = dmatrix() + kerns = [a for i in nkerns] + inputs4 = dmatrix4() + kerns4 = dmatrix4() + + # for each layer + ntot = 0 + tctot = 0 + tpytot = 0 + + for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(nkerns))): + if do_print: + print("************* layer %i ***************" % n_layer) + print(conv_mode, ss, n_layer, kshp, nkern) + + # actual values + w = global_rng.random(np.r_[nkern, imshp[0], kshp]) + w_flip = flip(w, kshp).reshape(w.shape) + + # manual implementation + # check first stage + padimg = imgval + if conv_mode == "full": + padimg_shp = np.array(imshp[1:]) + 2 * (np.array(kshp) - np.array([1, 1])) + padimg = np.zeros(np.r_[bsize, imshp[0], padimg_shp]) + padimg[ + :, :, kshp[0] - 1 : -kshp[0] + 1, kshp[1] - 1 : -kshp[1] + 1 + ] = imgval + + outshp = np.hstack( + (nkern, ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode)) + ) + + time1 = time.perf_counter() + outval = np.zeros(np.r_[bsize, outshp]) + if validate: + # causes an atexit problem + + try: + from scipy.signal.signaltools import _bvalfromboundary, _valfrommode + from scipy.signal.sigtools import _convolve2d + except ImportError: + from scipy.signal._signaltools import _bvalfromboundary, _valfrommode + from scipy.signal._sigtools import _convolve2d + + val = _valfrommode(conv_mode) + bval = _bvalfromboundary("fill") + for b in range(bsize): # loop over batches + for n in range(nkern): # loop over filters + for i in range(imshp[0]): # loop over input feature maps + outval[b, n, ...] 
+= _convolve2d( + imgval[b, i, ...], w_flip[n, i, ...], 1, val, bval, 0 + )[0 :: ss[0], 0 :: ss[1]] + ntot += time.perf_counter() - time1 + + # ConvOp + if unroll_patch and not unroll_patch_size: + conv_op = ConvOp( + dx=ss[0], + dy=ss[1], + output_mode=conv_mode, + unroll_patch=unroll_patch, + verbose=verbose, + )(inputs4, kerns4) + else: + conv_op = ConvOp( + imshp, + kshp, + nkern, + bsize, + ss[0], + ss[1], + conv_mode, + unroll_batch=unroll_batch, + unroll_kern=unroll_kern, + unroll_patch=unroll_patch, + verbose=verbose, + )(inputs4, kerns4) + # l1shp = np.hstack((nkern, + # ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode))) + propup2 = function([inputs4, kerns4], conv_op) + propup3 = function([inputs4, kerns4], conv_op, mode=Mode(linker="py")) + + time1 = time.perf_counter() + for i in range(repeat): + hidval2_ = propup2(imgval, w_flip) + hidval2 = hidval2_ # [:,:,0::ss[0],0::ss[1]] + tctot += time.perf_counter() - time1 + + if conv_op_py: + time1 = time.perf_counter() + for i in range(repeat): + hidval3_ = propup3(imgval, w_flip) + hidval3 = hidval3_ # [:,:,0::ss[0],0::ss[1]] + tpytot += time.perf_counter() - time1 + assert (np.abs(hidval2 - hidval3) < 1e-5).all() + else: + tpytot += 0 + + if validate: + temp = np.abs(outval - hidval2) + assert (temp < 1e-5).all() + if validate and conv_op_py: + temp = np.abs(outval - hidval3) + assert (temp < 1e-5).all() + + imshp = tuple(outshp) + imgval = outval.reshape(bsize, outshp[0], outshp[1], outshp[2]) + + return tctot, tpytot, ntot + + +def exec_multilayer_conv_nnet( + conv_mode, + ss, + bsize, + imshp, + kshps, + nkerns, + unroll_batch=0, + unroll_kern=0, + img=None, + do_print=True, + repeat=1, + unroll_patch=False, + unroll_patch_size=False, + verbose=0, +): + if img is None: + img = dmatrix() + + # build actual input images + imgval = global_rng.random((bsize, imshp[0], imshp[1], imshp[2])) + + a = dmatrix() + kerns = [a for i in nkerns] + inputs4 = dmatrix4() + kerns4 = dmatrix4() + + # for each layer + ntot = 0 + tctot = 0 + tpytot = 0 + + for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(nkerns))): + if do_print: + print("************* layer %i ***************" % n_layer) + print(conv_mode, ss, n_layer, kshp, nkern) + + # actual values + w = global_rng.random(np.r_[nkern, imshp[0], kshp]) + w_flip = flip(w, kshp).reshape(w.shape) + + outshp = np.hstack( + (nkern, ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode)) + ) + + time1 = time.perf_counter() + # outval = np.zeros(np.r_[bsize, outshp]) + + # ConvOp + if unroll_patch and not unroll_patch_size: + conv_op = ConvOp( + dx=ss[0], + dy=ss[1], + output_mode=conv_mode, + unroll_patch=unroll_patch, + verbose=verbose, + )(inputs4, kerns4) + else: + conv_op = ConvOp( + imshp, + kshp, + nkern, + bsize, + ss[0], + ss[1], + conv_mode, + unroll_batch=unroll_batch, + unroll_kern=unroll_kern, + unroll_patch=unroll_patch, + verbose=verbose, + )(inputs4, kerns4) + # l1shp = np.hstack((nkern, + # ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode))) + propup2 = function([inputs4, kerns4], conv_op) + + time1 = time.perf_counter() + for i in range(repeat): + propup2(imgval, w_flip) + tctot += time.perf_counter() - time1 + + imshp = tuple(outshp) + # imgval = outval.reshape(bsize, outshp[0], outshp[1], outshp[2]) + + return tctot, tpytot, ntot + + +def speed_multilayer_conv(): + # calculate the speed up of different combination of unroll + # put the parameter to the same you will try. + # validate = False # we don't validate the result to have it much faster! 
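+    # The sweep below times every (unroll_batch, unroll_kern) combination for
+    # both 'valid' and 'full' modes, then compares against the un-unrolled
+    # version and the unroll_patch variants.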
+ repeat = 3 + verbose = 1 + unroll_batch = [1, 2, 3, 4, 5, 6, 10] # 15, 30, 60 always much slower + unroll_kern = [1, 2, 3, 4, 5, 6, 10] # 15, 30, 60 always much slower + # unroll_batch = [1,4,5] + # unroll_kern = [1,4,5] + # unroll_batch = [1,4] + # unroll_kern = [1,4] + # unroll_patch = [True, False] + bsize = 60 # batch size + imshp_start = (1, 48, 48) # un square shape to test more corner case. + kshps = ([11, 12],) # un square shape to test more corner case. + nkerns = [60] # per output pixel + ssizes = [ + (1, 1), + ] # (1,1)]#(2,2) bugged + convmodes = ["valid", "full"] + # do_convolve2 = False + a = dmatrix() + kerns = [a for i in nkerns] + + assert len(kshps) == len(nkerns) == len(kerns) + timing = np.zeros( + (len(unroll_batch), len(unroll_kern), 3, len(convmodes) * len(ssizes)) + ) + t_b_k = [] + # calculate the timing with unrolling + + print("time unroll batch kern") + best = [] + worst = [] + t_ = [] + for unroll_b, n_b in zip(unroll_batch, range(len(unroll_batch))): + for unroll_k, n_k in zip(unroll_kern, range(len(unroll_kern))): + t_b_k.append(str(unroll_b) + "/" + str(unroll_k)) + if not t_: + tctot, tpytot, ntot = [], [], [] + for conv_mode, n_mode in zip(convmodes, range(len(convmodes))): + for ss, n_ss in zip(ssizes, range(len(ssizes))): + # tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, validate=validate, verbose=verbose,do_print=False) + tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet( + conv_mode, + ss, + bsize, + imshp_start, + kshps, + nkerns, + unroll_batch=unroll_b, + unroll_kern=unroll_k, + verbose=verbose, + do_print=False, + repeat=repeat, + ) + tctot += [tctot_] + tpytot += [tpytot_] + ntot += [ntot_] + if unroll_b == 4 and unroll_k == 4: + # print "unroll 4/4",tctot + best = tctot + if unroll_b == 1 and unroll_k == 1: + # print "unroll 1/1",tctot + worst = tctot + timing[n_b, n_k] = [ + tctot, + tpytot, + ntot, + ] # [sum(tctot), sum(tpytot), sum(ntot)] + if not t_: + t = timing[:, :, 0, :] # We select only the c timing. 
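+        # timing has shape (len(unroll_batch), len(unroll_kern), 3,
+        # len(convmodes) * len(ssizes)); axis 2 stores (tctot, tpytot, ntot),
+        # so index 0 picks the compiled C timings.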
+ else: + t = t_ + t = np.asarray(t) + # calculate the old timing + print("time old version") + tctot, tpytot, ntot = [], [], [] + tctot_ = [] + if not tctot_: + for conv_mode, n_mode in zip(convmodes, range(len(convmodes))): + for ss, n_ss in zip(ssizes, range(len(ssizes))): + # tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate, verbose=verbose,do_print=False) + tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet( + conv_mode, + ss, + bsize, + imshp_start, + kshps, + nkerns, + unroll_batch=0, + unroll_kern=0, + verbose=verbose, + do_print=False, + repeat=repeat, + ) + tctot += [tctot_] + tpytot += [tpytot_] + ntot += [ntot_] + else: + tctot = np.asarray(tctot_) + print(f"old code timing {sum(tctot):.3f}s", tctot) + best = np.asarray(best) + worst = np.asarray(worst) + print("timing for unrolled version") + print("unroll_batch/unroll_kern valid_mode full_mode") + for n_b in range(len(unroll_batch)): + for n_k in range(len(unroll_kern)): + print((unroll_batch[n_b], unroll_kern[n_k]) + tuple(t[n_b, n_k]), ",") + # t_detail = t + t = t.sum(axis=2) + print( + f"max {t.max():.3f}s", + "max param(batch unloop size/kernel unloop size)", + t_b_k[t.argmax()], + ) + print( + f"min {t.min():.3f}s", + "min param(batch unloop size/kernel unloop size)", + t_b_k[t.argmin()], + ) + print( + f"speedup vs (1/1){t.max() / t.min():.3f}x, vs old {sum(tctot) / t.min():.3f}x" + ) + print(worst / best, tctot / best) + + # calculate the timing of unroll_patch + print("time unroll_patch") + tctot_patch = [] + tctot_patch_size = [] + for conv_mode, n_mode in zip(convmodes, range(len(convmodes))): + for ss, n_ss in zip(ssizes, range(len(ssizes))): + # tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False) + tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet( + conv_mode, + ss, + bsize, + imshp_start, + kshps, + nkerns, + unroll_batch=0, + unroll_kern=0, + unroll_patch=True, + verbose=verbose, + do_print=False, + repeat=repeat, + ) + tctot_patch += [tctot_] + # tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False,unroll_patch_size=True) + tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet( + conv_mode, + ss, + bsize, + imshp_start, + kshps, + nkerns, + unroll_batch=0, + unroll_kern=0, + unroll_patch=True, + verbose=verbose, + do_print=False, + unroll_patch_size=True, + repeat=repeat, + ) + tctot_patch_size += [tctot_] + + t_patch = sum(tctot_patch) + print("unroll_patch without shape time", tctot_patch) + print( + f"speedup vs (1/1){t.max() / t_patch:.3f}x, vs old {sum(tctot) / t_patch:.3f}x" + ) + print(best / tctot_patch, worst / tctot_patch) + t_patch_size = sum(tctot_patch_size) + print("unroll_patch with shape time", tctot_patch_size) + print( + "speedup vs (1/1)%.3fx, vs old %.3fx" + % (t.max() / t_patch_size, sum(tctot) / t_patch_size) + ) + print(best / tctot_patch_size, worst / tctot_patch_size) + return + + +if __name__ == "__main__": + speed_multilayer_conv() diff --git a/tests/tensor/conv/test_abstract_conv.py b/tests/tensor/conv/test_abstract_conv.py index 23ba23e1e9..a3e8dbeca9 100644 --- a/tests/tensor/conv/test_abstract_conv.py +++ b/tests/tensor/conv/test_abstract_conv.py @@ -5,8 +5,8 @@ import 
pytensor.tensor as pt from pytensor.compile.mode import Mode from pytensor.configdefaults import config -from pytensor.graph.rewriting.basic import check_stack_trace -from pytensor.tensor.conv import abstract_conv +from pytensor.tensor.conv import abstract_conv as conv +from pytensor.tensor.conv import conv2d_transpose, corr, corr3d from pytensor.tensor.conv.abstract_conv import ( AbstractConv2d, AbstractConv2d_gradInputs, @@ -18,13 +18,18 @@ bilinear_upsampling, causal_conv1d, check_conv_gradinputs_shape, - conv2d_transpose, get_conv_gradinputs_shape, get_conv_gradweights_shape, get_conv_output_shape, separable_conv2d, separable_conv3d, ) +from pytensor.tensor.conv.corr import CorrMM, CorrMM_gradInputs, CorrMM_gradWeights +from pytensor.tensor.conv.corr3d import ( + Corr3dMM, + Corr3dMMGradInputs, + Corr3dMMGradWeights, +) from pytensor.tensor.type import ( TensorType, ftensor4, @@ -35,7 +40,6 @@ tensor5, ) from tests import unittest_tools as utt -from tests.tensor.conv import c_conv3d_corr3d_ref, c_conv_corr_ref def conv2d_corr( @@ -48,9 +52,7 @@ def conv2d_corr( ): if conv_mode == "conv": filters = filters[:, :, ::-1, ::-1] - return c_conv_corr_ref.CorrMM(border_mode, subsample, filter_dilation)( - inputs, filters - ) + return corr.CorrMM(border_mode, subsample, filter_dilation)(inputs, filters) def conv2d_corr_gw( @@ -62,7 +64,7 @@ def conv2d_corr_gw( conv_mode="conv", filter_dilation=(1, 1), ): - rval = c_conv_corr_ref.CorrMM_gradWeights(border_mode, subsample, filter_dilation)( + rval = corr.CorrMM_gradWeights(border_mode, subsample, filter_dilation)( inputs, topgrad, filters_shape[2:] ) if conv_mode == "conv": @@ -81,7 +83,7 @@ def conv2d_corr_gi( ): if conv_mode == "conv": filters = filters[:, :, ::-1, ::-1] - return c_conv_corr_ref.CorrMM_gradInputs(border_mode, subsample, filter_dilation)( + return corr.CorrMM_gradInputs(border_mode, subsample, filter_dilation)( filters, topgrad, inputs_shape[2:] ) @@ -96,9 +98,7 @@ def conv3d_corr( ): if conv_mode == "conv": filters = filters[:, :, ::-1, ::-1, ::-1] - return c_conv3d_corr3d_ref.Corr3dMM(border_mode, subsample, filter_dilation)( - inputs, filters - ) + return corr3d.Corr3dMM(border_mode, subsample, filter_dilation)(inputs, filters) def conv3d_corr_gw( @@ -110,9 +110,9 @@ def conv3d_corr_gw( conv_mode="conv", filter_dilation=(1, 1, 1), ): - rval = c_conv3d_corr3d_ref.Corr3dMMGradWeights( - border_mode, subsample, filter_dilation - )(inputs, topgrad, filters_shape[2:]) + rval = corr3d.Corr3dMMGradWeights(border_mode, subsample, filter_dilation)( + inputs, topgrad, filters_shape[2:] + ) if conv_mode == "conv": rval = rval[:, :, ::-1, ::-1, ::-1] return rval @@ -129,9 +129,9 @@ def conv3d_corr_gi( ): if conv_mode == "conv": filters = filters[:, :, ::-1, ::-1, ::-1] - return c_conv3d_corr3d_ref.Corr3dMMGradInputs( - border_mode, subsample, filter_dilation - )(filters, topgrad, inputs_shape[2:]) + return corr3d.Corr3dMMGradInputs(border_mode, subsample, filter_dilation)( + filters, topgrad, inputs_shape[2:] + ) class TestGetConvOutShape: @@ -337,7 +337,7 @@ def test_shape_check_conv2d(self): input = tensor4() filters = tensor4() - out = abstract_conv.abstract_conv2d( + out = conv.abstract_conv2d( input, filters, input_shape=(3, 5, 7, 11), filter_shape=(7, 5, 3, 3) ) f = pytensor.function([input, filters], out) @@ -360,7 +360,7 @@ def test_shape_check_conv3d(self): input = tensor5() filters = tensor5() - out = abstract_conv.conv3d( + out = conv.conv3d( input, filters, input_shape=(3, 5, 7, 11, 13), filter_shape=(7, 5, 3, 3, 3) ) f = 
pytensor.function([input, filters], out) @@ -382,7 +382,7 @@ def test_shape_check_conv2d_grad_wrt_inputs(self): output_grad = tensor4() filters = tensor4() - out = abstract_conv.conv2d_grad_wrt_inputs( + out = conv.conv2d_grad_wrt_inputs( output_grad, filters, input_shape=(None, None, 7, 11), @@ -402,7 +402,7 @@ def test_shape_check_conv3d_grad_wrt_inputs(self): output_grad = tensor5() filters = tensor5() - out = abstract_conv.conv3d_grad_wrt_inputs( + out = conv.conv3d_grad_wrt_inputs( output_grad, filters, input_shape=(None, None, 7, 11, 13), @@ -421,7 +421,7 @@ def test_shape_check_conv2d_grad_wrt_weights(self): input = tensor4() output_grad = tensor4() - out = abstract_conv.conv2d_grad_wrt_weights( + out = conv.conv2d_grad_wrt_weights( input, output_grad, filter_shape=(None, None, 3, 3), @@ -441,7 +441,7 @@ def test_shape_check_conv3d_grad_wrt_weights(self): input = tensor5() output_grad = tensor5() - out = abstract_conv.conv3d_grad_wrt_weights( + out = conv.conv3d_grad_wrt_weights( input, output_grad, filter_shape=(None, None, 3, 3, 3), @@ -555,10 +555,10 @@ def run_fwd( f_ref = pytensor.function([], c_ref, mode="FAST_RUN") f = pytensor.function([], c, mode=mode) - if target_op is not None: - assert any(isinstance(n.op, target_op) for n in f.maker.fgraph.toposort()) - if check_trace: - assert check_stack_trace(f, ops_to_check=target_op) + # if target_op is not None: + # assert any(isinstance(n.op, target_op) for n in f.maker.fgraph.toposort()) + # if check_trace: + # assert check_stack_trace(f, ops_to_check=target_op) res_ref = np.array(f_ref()) res = np.array(f()) @@ -640,10 +640,10 @@ def run_gradweight( f = pytensor.function([], c, mode=mode) f_ref = pytensor.function([], c_ref, mode="FAST_RUN") - if target_op is not None: - assert any(isinstance(n.op, target_op) for n in f.maker.fgraph.toposort()) - if check_trace: - assert check_stack_trace(f, ops_to_check=target_op) + # if target_op is not None: + # assert any(isinstance(n.op, target_op) for n in f.maker.fgraph.toposort()) + # if check_trace: + # assert check_stack_trace(f, ops_to_check=target_op) res_ref = np.array(f_ref()) res = np.array(f()) @@ -725,10 +725,10 @@ def run_gradinput( ) f_ref = pytensor.function([], c_ref, mode="FAST_RUN") - if target_op is not None: - assert any(isinstance(n.op, target_op) for n in f.maker.fgraph.toposort()) - if check_trace: - assert check_stack_trace(f, ops_to_check=target_op) + # if target_op is not None: + # assert any(isinstance(n.op, target_op) for n in f.maker.fgraph.toposort()) + # if check_trace: + # assert check_stack_trace(f, ops_to_check=target_op) res = np.array(f()) @@ -897,8 +897,8 @@ def run_fwd( self, inputs_shape, filters_shape, - conv_fn=abstract_conv.abstract_conv2d, - conv_op=abstract_conv.AbstractConv2d, + conv_fn=conv.abstract_conv2d, + conv_op=conv.AbstractConv2d, ref=conv2d_corr, **kwargs, ): @@ -916,7 +916,7 @@ def run_gradweight( inputs_shape, filters_shape, output_shape, - gradWeights_fn=abstract_conv.AbstractConv2d_gradWeights, + gradWeights_fn=conv.AbstractConv2d_gradWeights, ref=conv2d_corr_gw, **kwargs, ): @@ -934,7 +934,7 @@ def run_gradinput( inputs_shape, filters_shape, output_shape, - gradInputs_fn=abstract_conv.AbstractConv2d_gradInputs, + gradInputs_fn=conv.AbstractConv2d_gradInputs, ref=conv2d_corr_gi, **kwargs, ): @@ -948,6 +948,96 @@ def run_gradinput( ) +@pytest.mark.skipif( + not config.cxx or config.mode == "FAST_COMPILE", + reason="Need blas to test conv2d", +) +class TestCorrConv2d(BaseTestConv2d): + @classmethod + def setup_class(cls): + # This 
tests can run even when config.blas__ldflags is empty. + super().setup_class() + + def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)): + o = self.get_output_shape(i, f, s, b, fd) + self.run_fwd( + inputs_shape=i, + filters_shape=f, + subsample=s, + verify_grad=True, + provide_shape=provide_shape, + border_mode=b, + filter_flip=flip, + target_op=CorrMM, + check_trace=True, + filter_dilation=fd, + ) + self.run_gradweight( + inputs_shape=i, + filters_shape=f, + output_shape=o, + subsample=s, + verify_grad=True, + provide_shape=provide_shape, + border_mode=b, + filter_flip=flip, + target_op=CorrMM_gradWeights, + check_trace=True, + filter_dilation=fd, + ) + self.run_gradinput( + inputs_shape=i, + filters_shape=f, + output_shape=o, + subsample=s, + verify_grad=True, + provide_shape=provide_shape, + border_mode=b, + filter_flip=flip, + target_op=CorrMM_gradInputs, + check_trace=True, + filter_dilation=fd, + ) + + def run_test_case_gi( + self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False + ): + if not expect_error: + self.run_gradinput( + inputs_shape=i, + filters_shape=f, + output_shape=o, + subsample=s, + verify_grad=True, + provide_shape=provide_shape, + border_mode=b, + filter_flip=flip, + target_op=CorrMM_gradInputs, + check_trace=True, + filter_dilation=fd, + ) + else: + with pytest.raises(ValueError): + self.run_gradinput( + inputs_shape=i, + filters_shape=f, + output_shape=o, + subsample=s, + verify_grad=False, + provide_shape=provide_shape, + border_mode=b, + filter_flip=flip, + target_op=CorrMM_gradInputs, + ref=None, + check_trace=True, + filter_dilation=fd, + ) + + @pytest.mark.slow + def test_all(self): + super().test_all() + + @pytest.mark.skipif( config.cxx == "", reason="SciPy and cxx needed", @@ -1167,8 +1257,8 @@ def run_fwd( self, inputs_shape, filters_shape, - conv_fn=abstract_conv.conv3d, - conv_op=abstract_conv.AbstractConv3d, + conv_fn=conv.conv3d, + conv_op=conv.AbstractConv3d, ref=conv3d_corr, **kwargs, ): @@ -1186,7 +1276,7 @@ def run_gradweight( inputs_shape, filters_shape, output_shape, - gradWeights_fn=abstract_conv.AbstractConv3d_gradWeights, + gradWeights_fn=conv.AbstractConv3d_gradWeights, ref=conv3d_corr_gw, **kwargs, ): @@ -1204,7 +1294,7 @@ def run_gradinput( inputs_shape, filters_shape, output_shape, - gradInputs_fn=abstract_conv.AbstractConv3d_gradInputs, + gradInputs_fn=conv.AbstractConv3d_gradInputs, ref=conv3d_corr_gi, **kwargs, ): @@ -1218,6 +1308,94 @@ def run_gradinput( ) +@pytest.mark.skipif( + not config.cxx or config.mode == "FAST_COMPILE", + reason="Need blas to test conv3d", +) +class TestCorrConv3d(BaseTestConv3d): + @classmethod + def setup_class(cls): + # This tests can run even when config.blas__ldflags is empty. + super().setup_class() + + def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)): + o = self.get_output_shape(i, f, s, b, fd) + # This test can run even when config.blas__ldflags is empty. 
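+        # Exercise the forward pass and both gradients, with Corr3dMM,
+        # Corr3dMMGradWeights and Corr3dMMGradInputs as the expected target ops.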
+ self.run_fwd( + inputs_shape=i, + filters_shape=f, + subsample=s, + verify_grad=True, + provide_shape=provide_shape, + border_mode=b, + filter_flip=flip, + target_op=Corr3dMM, + check_trace=True, + filter_dilation=fd, + ) + self.run_gradweight( + inputs_shape=i, + filters_shape=f, + output_shape=o, + subsample=s, + verify_grad=True, + provide_shape=provide_shape, + border_mode=b, + filter_flip=flip, + target_op=Corr3dMMGradWeights, + check_trace=True, + filter_dilation=fd, + ) + self.run_gradinput( + inputs_shape=i, + filters_shape=f, + output_shape=o, + subsample=s, + verify_grad=True, + provide_shape=provide_shape, + border_mode=b, + filter_flip=flip, + target_op=Corr3dMMGradInputs, + check_trace=True, + filter_dilation=fd, + ) + + def run_test_case_gi( + self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False + ): + # This test can run even when config.blas__ldflags is empty. + if not expect_error: + self.run_gradinput( + inputs_shape=i, + filters_shape=f, + output_shape=o, + subsample=s, + verify_grad=True, + provide_shape=provide_shape, + border_mode=b, + filter_flip=flip, + target_op=Corr3dMMGradInputs, + check_trace=True, + filter_dilation=fd, + ) + else: + with pytest.raises(ValueError): + self.run_gradinput( + inputs_shape=i, + filters_shape=f, + output_shape=o, + subsample=s, + verify_grad=False, + provide_shape=provide_shape, + border_mode=b, + filter_flip=flip, + target_op=Corr3dMMGradInputs, + ref=None, + check_trace=True, + filter_dilation=fd, + ) + + def test_constant_shapes(): # Check that the `imshp` and `kshp` parameters of the AbstractConv Ops # are rejected if not constant or None @@ -1278,7 +1456,7 @@ def test_grad_types(self): out_shape = lvector() - output = abstract_conv.abstract_conv2d(input, filters) + output = conv.abstract_conv2d(input, filters) grad_input, grad_filters = pytensor.grad(output.sum(), wrt=(input, filters)) assert grad_input.type == input.type, ( grad_input, @@ -1293,9 +1471,7 @@ def test_grad_types(self): filters.type, ) - grad_filters = abstract_conv.AbstractConv2d_gradWeights()( - input, topgrad, out_shape - ) + grad_filters = conv.AbstractConv2d_gradWeights()(input, topgrad, out_shape) grad_input, grad_topgrad = pytensor.grad( grad_filters.sum(), wrt=(input, topgrad) ) @@ -1313,9 +1489,7 @@ def test_grad_types(self): topgrad.type, ) - grad_input = abstract_conv.AbstractConv2d_gradInputs()( - filters, topgrad, out_shape - ) + grad_input = conv.AbstractConv2d_gradInputs()(filters, topgrad, out_shape) grad_filters, grad_topgrad = pytensor.grad( grad_input.sum(), wrt=(filters, topgrad) ) @@ -1342,7 +1516,7 @@ def test_constant_input(self): out_shape = lvector() # Check the forward Op - output = abstract_conv.abstract_conv2d(constant_tensor, filters) + output = conv.abstract_conv2d(constant_tensor, filters) grad_filters = pytensor.grad(output.sum(), wrt=filters) assert filters.type.is_super(grad_filters.type), ( grad_filters, @@ -1351,7 +1525,7 @@ def test_constant_input(self): filters.type, ) - output = abstract_conv.abstract_conv2d(input, constant_tensor) + output = conv.abstract_conv2d(input, constant_tensor) grad_input = pytensor.grad(output.sum(), wrt=input) assert input.type.is_super(grad_input.type), ( grad_input, @@ -1361,7 +1535,7 @@ def test_constant_input(self): ) # Check grad wrt weights - grad_filters = abstract_conv.AbstractConv2d_gradWeights()( + grad_filters = conv.AbstractConv2d_gradWeights()( constant_tensor, topgrad, out_shape ) grad_topgrad = pytensor.grad(grad_filters.sum(), wrt=topgrad) @@ -1372,7 +1546,7 
@@ def test_constant_input(self): topgrad.type, ) - grad_filters = abstract_conv.AbstractConv2d_gradWeights()( + grad_filters = conv.AbstractConv2d_gradWeights()( input, constant_tensor, out_shape ) grad_input = pytensor.grad(grad_filters.sum(), wrt=input) @@ -1384,7 +1558,7 @@ def test_constant_input(self): ) # Check grad wrt inputs - grad_input = abstract_conv.AbstractConv2d_gradInputs()( + grad_input = conv.AbstractConv2d_gradInputs()( constant_tensor, topgrad, out_shape ) grad_topgrad = pytensor.grad(grad_input.sum(), wrt=topgrad) @@ -1395,7 +1569,7 @@ def test_constant_input(self): topgrad.type, ) - grad_input = abstract_conv.AbstractConv2d_gradInputs()( + grad_input = conv.AbstractConv2d_gradInputs()( filters, constant_tensor, out_shape ) grad_filters = pytensor.grad(grad_input.sum(), wrt=filters) @@ -1775,16 +1949,18 @@ def test_conv2d_grad_wrt_inputs(self): filter_val = self.random_stream.random(fltr_shape).astype( config.floatX ) - out_grad_shape = abstract_conv.get_conv_output_shape( - image_shape=in_shape, - kernel_shape=fltr_shape, - border_mode=bm, - subsample=ss, + out_grad_shape = ( + pytensor.tensor.conv.abstract_conv.get_conv_output_shape( + image_shape=in_shape, + kernel_shape=fltr_shape, + border_mode=bm, + subsample=ss, + ) ) out_grad_val = self.random_stream.random(out_grad_shape).astype( config.floatX ) - conv_out = abstract_conv.conv2d( + conv_out = pytensor.tensor.conv.conv2d( self.x, filters=self.w, border_mode=bm, @@ -1802,14 +1978,16 @@ def test_conv2d_grad_wrt_inputs(self): [self.x, self.w, self.output_grad], conv_grad ) - conv_wrt_i_out = abstract_conv.conv2d_grad_wrt_inputs( - output_grad=self.output_grad_wrt, - filters=self.w, - border_mode=bm, - subsample=ss, - input_shape=in_shape, - filter_shape=fltr_shape, - filter_flip=ff, + conv_wrt_i_out = ( + pytensor.tensor.conv.abstract_conv.conv2d_grad_wrt_inputs( + output_grad=self.output_grad_wrt, + filters=self.w, + border_mode=bm, + subsample=ss, + input_shape=in_shape, + filter_shape=fltr_shape, + filter_flip=ff, + ) ) f_new = pytensor.function( [self.w, self.output_grad_wrt], conv_wrt_i_out @@ -1824,7 +2002,7 @@ def test_conv2d_grad_wrt_inputs(self): def test_conv2d_grad_wrt_weights(self): # Compares calculated abstract grads wrt weights with the fwd grads # This method checks the outputs of `conv2_grad_wrt_weights` against - # the outputs of `pytensor.tensor.conv` forward grads to make sure the + # the outputs of `pytensor.tensor.conv.conv` forward grads to make sure the # results are the same. 
for in_shape, fltr_shape in zip( @@ -1839,16 +2017,18 @@ def test_conv2d_grad_wrt_weights(self): filter_val = self.random_stream.random(fltr_shape).astype( config.floatX ) - out_grad_shape = abstract_conv.get_conv_output_shape( - image_shape=in_shape, - kernel_shape=fltr_shape, - border_mode=bm, - subsample=ss, + out_grad_shape = ( + pytensor.tensor.conv.abstract_conv.get_conv_output_shape( + image_shape=in_shape, + kernel_shape=fltr_shape, + border_mode=bm, + subsample=ss, + ) ) out_grad_val = self.random_stream.random(out_grad_shape).astype( config.floatX ) - conv_out = abstract_conv.conv2d( + conv_out = pytensor.tensor.conv.conv2d( self.x, filters=self.w, border_mode=bm, @@ -1866,14 +2046,16 @@ def test_conv2d_grad_wrt_weights(self): [self.x, self.w, self.output_grad], conv_grad ) - conv_wrt_w_out = abstract_conv.conv2d_grad_wrt_weights( - self.x, - output_grad=self.output_grad_wrt, - border_mode=bm, - subsample=ss, - input_shape=in_shape, - filter_shape=fltr_shape, - filter_flip=ff, + conv_wrt_w_out = ( + pytensor.tensor.conv.abstract_conv.conv2d_grad_wrt_weights( + self.x, + output_grad=self.output_grad_wrt, + border_mode=bm, + subsample=ss, + input_shape=in_shape, + filter_shape=fltr_shape, + filter_flip=ff, + ) ) f_new = pytensor.function( [self.x, self.output_grad_wrt], conv_wrt_w_out @@ -1889,12 +2071,12 @@ def test_conv2d_grad_wrt_weights(self): reason="SciPy and cxx needed", ) class TestGroupedConvNoOptim: - conv = abstract_conv.AbstractConv2d - conv_gradw = abstract_conv.AbstractConv2d_gradWeights - conv_gradi = abstract_conv.AbstractConv2d_gradInputs - conv_op = abstract_conv.AbstractConv2d - conv_gradw_op = abstract_conv.AbstractConv2d_gradWeights - conv_gradi_op = abstract_conv.AbstractConv2d_gradInputs + conv = pytensor.tensor.conv.abstract_conv.AbstractConv2d + conv_gradw = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradWeights + conv_gradi = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradInputs + conv_op = pytensor.tensor.conv.abstract_conv.AbstractConv2d + conv_gradw_op = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradWeights + conv_gradi_op = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradInputs mode = Mode(optimizer=None) is_dnn = False @@ -2101,12 +2283,12 @@ def conv_gradinputs(filters_val, output_val): reason="SciPy and cxx needed", ) class TestGroupedConv3dNoOptim(TestGroupedConvNoOptim): - conv = abstract_conv.AbstractConv3d - conv_gradw = abstract_conv.AbstractConv3d_gradWeights - conv_gradi = abstract_conv.AbstractConv3d_gradInputs - conv_op = abstract_conv.AbstractConv3d - conv_gradw_op = abstract_conv.AbstractConv3d_gradWeights - conv_gradi_op = abstract_conv.AbstractConv3d_gradInputs + conv = pytensor.tensor.conv.abstract_conv.AbstractConv3d + conv_gradw = pytensor.tensor.conv.abstract_conv.AbstractConv3d_gradWeights + conv_gradi = pytensor.tensor.conv.abstract_conv.AbstractConv3d_gradInputs + conv_op = pytensor.tensor.conv.abstract_conv.AbstractConv3d + conv_gradw_op = pytensor.tensor.conv.abstract_conv.AbstractConv3d_gradWeights + conv_gradi_op = pytensor.tensor.conv.abstract_conv.AbstractConv3d_gradInputs mode = Mode(optimizer=None) def setup_method(self): @@ -2340,12 +2522,12 @@ def test_interface3d(self): reason="SciPy and cxx needed", ) class TestUnsharedConv: - conv2d = abstract_conv.AbstractConv2d - conv2d_gradw = abstract_conv.AbstractConv2d_gradWeights - conv2d_gradi = abstract_conv.AbstractConv2d_gradInputs - conv2d_op = abstract_conv.AbstractConv2d - conv2d_gradw_op = abstract_conv.AbstractConv2d_gradWeights - 
conv2d_gradi_op = abstract_conv.AbstractConv2d_gradInputs + conv2d = pytensor.tensor.conv.abstract_conv.AbstractConv2d + conv2d_gradw = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradWeights + conv2d_gradi = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradInputs + conv2d_op = pytensor.tensor.conv.abstract_conv.AbstractConv2d + conv2d_gradw_op = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradWeights + conv2d_gradi_op = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradInputs mode = Mode(optimizer="None") @@ -2571,12 +2753,12 @@ def conv_gradinputs(filters_val, output_val): class TestAsymmetricPadding: - conv2d = abstract_conv.AbstractConv2d - conv2d_gradw = abstract_conv.AbstractConv2d_gradWeights - conv2d_gradi = abstract_conv.AbstractConv2d_gradInputs - conv2d_op = abstract_conv.AbstractConv2d - conv2d_gradw_op = abstract_conv.AbstractConv2d_gradWeights - conv2d_gradi_op = abstract_conv.AbstractConv2d_gradInputs + conv2d = pytensor.tensor.conv.abstract_conv.AbstractConv2d + conv2d_gradw = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradWeights + conv2d_gradi = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradInputs + conv2d_op = pytensor.tensor.conv.abstract_conv.AbstractConv2d + conv2d_gradw_op = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradWeights + conv2d_gradi_op = pytensor.tensor.conv.abstract_conv.AbstractConv2d_gradInputs mode = Mode(optimizer="None") diff --git a/tests/tensor/conv/test_conv.py b/tests/tensor/conv/test_conv.py new file mode 100644 index 0000000000..a03ca00c50 --- /dev/null +++ b/tests/tensor/conv/test_conv.py @@ -0,0 +1,783 @@ +import time + +import numpy as np +import pytest + +import pytensor +import pytensor.tensor as at +from pytensor.compile.mode import Mode +from pytensor.tensor.conv import conv, conv2d +from pytensor.tensor.exceptions import NotScalarConstantError +from pytensor.tensor.math import _allclose, exp +from pytensor.tensor.type import dmatrix, dtensor3, dtensor4, dvector, scalar, tensor4 +from tests import unittest_tools as utt + + +@pytest.mark.skipif( + pytensor.config.cxx == "", + reason="conv2d tests need SciPy or a c++ compiler", +) +class TestConv2D(utt.InferShapeTester): + # This class contains tests for the legacy 2d convolution, + # but will also be inherited from for other implementations + mode = None + dtype = pytensor.config.floatX + # This will be set to the appropriate function in the inherited classes. + # The call to `staticmethod` is necessary to prevent Python from passing + # `self` as the first argument. + conv2d = staticmethod(conv2d) + + def setup_method(self): + self.input = tensor4("input", dtype=self.dtype) + self.input.name = "default_V" + self.filters = tensor4("filters", dtype=self.dtype) + self.filters.name = "default_filters" + super().setup_method() + + def validate( + self, + image_shape, + filter_shape, + border_mode="valid", + subsample=(1, 1), + N_image_shape=None, + N_filter_shape=None, + input=None, + filters=None, + unroll_batch=None, + unroll_kern=None, + unroll_patch=None, + verify_grad=True, + should_raise=False, + ): + """ + :param image_shape: The constant shape info passed to conv2d. + :param filter_shape: The constant shape info passed to conv2d. 
+ + :param N_image_shape: None(default to image_shape) or tuple of + 4 elements with the shape of the input image + + :param N_filter_shape: None(default to filter_shape) or tuple + of 4 elements with the shape of the + input filter + + """ + if N_image_shape is None: + N_image_shape = [ + at.get_scalar_constant_value(at.as_tensor_variable(x)) + for x in image_shape + ] + if N_filter_shape is None: + N_filter_shape = [ + at.get_scalar_constant_value(at.as_tensor_variable(x)) + for x in filter_shape + ] + + if input is None: + input = self.input + if not filters: + filters = self.filters + + # PYTENSOR IMPLEMENTATION + + # we create a symbolic function so that verify_grad can work + def sym_conv2d(input, filters): + # define pytensor graph and function + input.name = "input" + filters.name = "filters" + with pytest.warns(DeprecationWarning): + rval = conv.conv2d( + input, + filters, + image_shape, + filter_shape, + border_mode, + subsample, + unroll_batch=unroll_batch, + unroll_kern=unroll_kern, + unroll_patch=unroll_patch, + ) + rval.name = "conv_output" + return rval + + output = sym_conv2d(input, filters) + output.name = f"conv2d({input.name},{filters.name})" + pytensor_conv = pytensor.function([input, filters], output, mode=self.mode) + + # initialize input and compute result + image_data = np.random.random(N_image_shape).astype(self.dtype) + filter_data = np.random.random(N_filter_shape).astype(self.dtype) + try: + pytensor_output = pytensor_conv(image_data, filter_data) + except ValueError: + if not should_raise: + raise + return + else: + if should_raise: + raise Exception("ConvOp should have generated an error") + + # REFERENCE IMPLEMENTATION + s = 1.0 + orig_image_data = image_data + if border_mode != "full": + s = -1.0 + out_shape2d = ( + np.array(N_image_shape[-2:]) + s * np.array(N_filter_shape[-2:]) - s + ) + out_shape2d = np.ceil(out_shape2d / np.array(subsample)) + # avoid numpy deprecation + out_shape2d = out_shape2d.astype("int32") + out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape2d) + ref_output = np.zeros(out_shape) + + # loop over output feature maps + ref_output.fill(0) + if border_mode == "full": + image_data2 = np.zeros( + ( + N_image_shape[0], + N_image_shape[1], + N_image_shape[2] + 2 * N_filter_shape[2] - 2, + N_image_shape[3] + 2 * N_filter_shape[3] - 2, + ) + ) + image_data2[ + :, + :, + N_filter_shape[2] - 1 : N_filter_shape[2] - 1 + N_image_shape[2], + N_filter_shape[3] - 1 : N_filter_shape[3] - 1 + N_image_shape[3], + ] = image_data + image_data = image_data2 + N_image_shape = image_data.shape + for bb in range(N_image_shape[0]): + for nn in range(N_filter_shape[0]): + for im0 in range(N_image_shape[1]): + filter2d = filter_data[nn, im0, :, :] + image2d = image_data[bb, im0, :, :] + for row in range(ref_output.shape[2]): + irow = row * subsample[0] # image row + for col in range(ref_output.shape[3]): + icol = col * subsample[1] # image col + ref_output[bb, nn, row, col] += ( + image2d[ + irow : irow + N_filter_shape[2], + icol : icol + N_filter_shape[3], + ] + * filter2d[::-1, ::-1] + ).sum() + + assert _allclose(pytensor_output, ref_output) + + # TEST GRADIENT + if verify_grad: + utt.verify_grad(sym_conv2d, [orig_image_data, filter_data]) + + def test_basic1(self): + # Tests that basic convolutions work for odd and even + # dimensions of image and filter shapes, as well as rectangular + # images and filters. 
+ + self.validate((2, 2, 3, 3), (2, 2, 2, 2), "valid", verify_grad=False) + + def test_basic(self): + # Tests that basic convolutions work for odd and even + # dimensions of image and filter shapes, as well as rectangular + # images and filters. + + self.validate((3, 2, 8, 8), (4, 2, 5, 5), "valid", verify_grad=False) + self.validate((3, 2, 7, 5), (5, 2, 2, 3), "valid") + self.validate((3, 2, 7, 5), (5, 2, 3, 2), "valid", verify_grad=False) + self.validate((3, 2, 8, 8), (4, 2, 5, 5), "full", verify_grad=False) + self.validate((3, 2, 7, 5), (5, 2, 2, 3), "full") + # test filter same size as input + + def test_uint_image_shape_datatype(self): + # Tests for uint datatype in image_shape. + + self.validate((2, 2, 3, np.uint8(3)), (3, 2, 3, 3), "valid", verify_grad=False) + self.validate((np.uint16(2), 2, 3, 3), (3, 2, 3, 3), "valid", verify_grad=False) + self.validate((2, np.uint32(2), 3, 3), (3, 2, 3, 3), "valid", verify_grad=False) + + def test_uint_filter_shape_datatype(self): + # Tests for uint datatype in filter_shape + + self.validate((3, 2, 3, 3), (2, 2, 3, np.uint8(3)), "valid", verify_grad=False) + self.validate((3, 2, 3, 3), (np.uint16(2), 2, 3, 3), "valid", verify_grad=False) + self.validate((3, 2, 3, 3), (2, np.uint32(2), 3, 3), "valid", verify_grad=False) + + def test_img_kernel_same_shape(self): + self.validate((3, 2, 3, 3), (4, 2, 3, 3), "full") + self.validate((3, 2, 3, 3), (4, 2, 3, 3), "valid") + + def test_unroll_patch_true(self): + # Test basic convs with True. + + self.validate((3, 2, 7, 5), (5, 2, 2, 3), "valid", unroll_patch=True) + self.validate((3, 2, 7, 5), (5, 2, 2, 3), "full", unroll_patch=True) + self.validate( + (3, 2, 3, 3), (4, 2, 3, 3), "valid", unroll_patch=True, verify_grad=False + ) + + def test_unroll_patch_false(self): + # Test basic convs with False. + + self.validate((3, 2, 7, 5), (5, 2, 2, 3), "valid", unroll_patch=False) + self.validate((3, 2, 7, 5), (5, 2, 2, 3), "full", unroll_patch=False) + self.validate( + (3, 2, 3, 3), (4, 2, 3, 3), "valid", unroll_patch=False, verify_grad=False + ) + + def test_unroll_patch_true_fail(self): + # Test basic convs with True. + + self.validate( + (3, 2, 7, 5), + (5, 2, 2, 3), + "valid", + unroll_patch=True, + N_image_shape=(1, 3, 3, 3), + N_filter_shape=(6, 3, 2, 2), + should_raise=True, + ) + self.validate( + (3, 2, 7, 5), + (5, 2, 2, 3), + "full", + unroll_patch=True, + N_image_shape=(1, 3, 3, 3), + N_filter_shape=(6, 3, 2, 2), + should_raise=True, + ) + self.validate( + (3, 2, 3, 3), + (4, 2, 3, 3), + "valid", + unroll_patch=True, + N_image_shape=(1, 3, 3, 3), + N_filter_shape=(6, 3, 2, 2), + should_raise=True, + ) + + def test_unroll_special(self): + # (unroll_kern, unroll_batch) in (0,1),(1,0) is special case. + + self.validate((6, 2, 3, 3), (3, 2, 2, 2), "valid", unroll_batch=1) + + def test_unroll_batch(self): + # Test mini-batch unrolling for various legal values. + + # mini-batch of size 6 is multiple of 2 and 3. Should work. + self.validate( + (6, 2, 3, 3), (3, 2, 2, 2), "valid", unroll_batch=2, verify_grad=False + ) + self.validate( + (6, 2, 3, 3), (3, 2, 2, 2), "valid", unroll_batch=3, verify_grad=False + ) + + def test_unroll_kern(self): + # Test kernel unrolling for various legal values. + + # 6 filters is a multiple of 2 and 3. Should work. 
+ self.validate( + (2, 3, 3, 3), (6, 3, 2, 2), "valid", unroll_kern=2, verify_grad=False + ) + self.validate( + (2, 3, 3, 3), (6, 3, 2, 2), "valid", unroll_kern=3, verify_grad=False + ) + + def test_unroll_batch_kern(self): + # Test mini-batch unrolling with kernel unrolling for various + # legal values. + + # mini-batch of size 6 is multiple of 2 and 3. Should work. + self.validate( + (6, 2, 3, 3), + (3, 2, 2, 2), + "valid", + unroll_batch=2, + unroll_kern=3, + verify_grad=False, + ) + self.validate( + (6, 2, 3, 3), + (3, 2, 2, 2), + "valid", + unroll_batch=3, + unroll_kern=3, + verify_grad=False, + ) + # 6 filters is a multiple of 2 and 3. Should work. + self.validate( + (2, 3, 3, 3), + (6, 3, 2, 2), + "valid", + unroll_batch=2, + unroll_kern=2, + verify_grad=False, + ) + self.validate( + (2, 3, 3, 3), + (6, 3, 2, 2), + "valid", + unroll_batch=2, + unroll_kern=3, + verify_grad=False, + ) + + def test_unroll_batch_kern_fail(self): + # Test mini-batch unrolling with kernel unrolling for various + # legal values, but pass bad input. All those test must + # generate errors + + # mini-batch of size 6 is multiple of 2 and 3. Should work. + self.validate( + (6, 2, 3, 3), + (3, 2, 2, 2), + "valid", + unroll_batch=2, + unroll_kern=3, + N_image_shape=(7, 2, 3, 3), + N_filter_shape=(3, 2, 2, 2), + should_raise=True, + ) + self.validate( + (6, 2, 3, 3), + (3, 2, 2, 2), + "valid", + unroll_batch=3, + unroll_kern=3, + N_image_shape=(6, 2, 3, 3), + N_filter_shape=(4, 2, 2, 2), + should_raise=True, + ) + self.validate( + (2, 3, 3, 3), + (6, 3, 2, 2), + "valid", + unroll_batch=2, + unroll_kern=2, + N_image_shape=(1, 3, 3, 3), + N_filter_shape=(6, 3, 2, 2), + should_raise=True, + ) + self.validate( + (2, 3, 3, 3), + (6, 3, 2, 2), + "valid", + unroll_batch=2, + unroll_kern=3, + N_image_shape=(2, 3, 3, 3), + N_filter_shape=(5, 3, 2, 2), + should_raise=True, + ) + + def test_subsample(self): + # Tests convolution where subsampling != (1,1) + self.validate((3, 2, 7, 5), (5, 2, 2, 3), "full", subsample=(2, 2)) + + # Fails as of 2012-07-11 + with pytest.raises(NotImplementedError): + self.validate((1, 1, 6, 6), (1, 1, 3, 3), "full", subsample=(3, 3)) + + # Fails as of 2017-08-10 + with pytest.raises(NotImplementedError): + self.validate((3, 2, 7, 5), (5, 2, 2, 3), "valid", subsample=(2, 2)) + with pytest.raises(NotImplementedError): + self.validate((3, 2, 7, 5), (5, 2, 2, 3), "valid", subsample=(2, 1)) + with pytest.raises(NotImplementedError): + self.validate((1, 1, 6, 6), (1, 1, 3, 3), "valid", subsample=(3, 3)) + + def test_shape_Constant_tensor(self): + # Tests convolution where the {image,filter}_shape is a Constant tensor. 
+ + as_t = at.as_tensor_variable + self.validate((as_t(3), as_t(2), as_t(7), as_t(5)), (5, 2, 2, 3), "valid") + self.validate(as_t([3, 2, 7, 5]), (5, 2, 2, 3), "valid") + self.validate(as_t((3, 2, 7, 5)), (5, 2, 2, 3), "valid") + self.validate((3, 2, 7, 5), (as_t(5), as_t(2), as_t(2), as_t(3)), "valid") + self.validate((3, 2, 7, 5), as_t([5, 2, 2, 3]), "valid") + self.validate((3, 2, 7, 5), as_t((5, 2, 2, 3)), "valid") + self.validate(as_t([3, 2, 7, 5]), as_t([5, 2, 2, 3]), "full") + + def test_invalid_filter_shape(self): + # Tests scenario where filter_shape[1] != input_shape[1] + + with pytest.raises(AssertionError): + self.validate((3, 2, 8, 8), (4, 3, 5, 5), "valid") + + def test_invalid_input_shape(self): + # Tests that when the shape given at build time is not the same as + # run time we raise an error + + for unroll_batch in [None, 1, 3]: + for unroll_kern in [None, 2, 4]: + for unroll_patch in [None, True, False]: + for mode in ["valid", "full"]: + with pytest.raises(ValueError): + self.validate( + (3, 2, 8, 8), + (4, 2, 5, 5), + mode, + N_image_shape=(2, 2, 8, 8), + unroll_batch=unroll_batch, + unroll_kern=unroll_kern, + unroll_patch=unroll_patch, + ) + with pytest.raises(ValueError): + self.validate( + (3, 2, 8, 8), + (4, 2, 5, 5), + mode, + N_image_shape=(3, 1, 8, 8), + unroll_batch=unroll_batch, + unroll_kern=unroll_kern, + unroll_patch=unroll_patch, + ) + with pytest.raises(ValueError): + self.validate( + (3, 2, 8, 8), + (4, 2, 5, 5), + mode, + N_image_shape=(3, 2, 7, 8), + unroll_batch=unroll_batch, + unroll_kern=unroll_kern, + unroll_patch=unroll_patch, + ) + with pytest.raises(ValueError): + self.validate( + (3, 2, 8, 8), + (4, 2, 5, 5), + mode, + N_image_shape=(3, 2, 8, 7), + unroll_batch=unroll_batch, + unroll_kern=unroll_kern, + unroll_patch=unroll_patch, + ) + + with pytest.raises(ValueError): + self.validate( + (3, 2, 8, 8), + (4, 2, 5, 5), + mode, + N_filter_shape=(3, 2, 5, 5), + unroll_batch=unroll_batch, + unroll_kern=unroll_kern, + unroll_patch=unroll_patch, + ) + with pytest.raises(ValueError): + self.validate( + (3, 2, 8, 8), + (4, 2, 5, 5), + mode, + N_filter_shape=(4, 1, 5, 5), + unroll_batch=unroll_batch, + unroll_kern=unroll_kern, + unroll_patch=unroll_patch, + ) + with pytest.raises(ValueError): + self.validate( + (3, 2, 8, 8), + (4, 2, 5, 5), + mode, + N_filter_shape=(4, 2, 6, 5), + unroll_batch=unroll_batch, + unroll_kern=unroll_kern, + unroll_patch=unroll_patch, + ) + with pytest.raises(ValueError): + self.validate( + (3, 2, 8, 8), + (4, 2, 5, 5), + mode, + N_filter_shape=(4, 2, 5, 6), + unroll_batch=unroll_batch, + unroll_kern=unroll_kern, + unroll_patch=unroll_patch, + ) + + def test_missing_info(self): + # Test convolutions for various pieces of missing info. 
+ + self.validate( + None, None, N_image_shape=(3, 2, 8, 8), N_filter_shape=(4, 2, 5, 5) + ) + self.validate( + (3, 2, None, None), + None, + N_image_shape=(3, 2, 8, 8), + N_filter_shape=(4, 2, 5, 5), + ) + self.validate( + (None, 2, None, None), + (None, 2, 5, 5), + N_image_shape=(3, 2, 8, 8), + N_filter_shape=(4, 2, 5, 5), + ) + self.validate( + (3, 2, 8, 8), + (4, 2, None, 5), + N_image_shape=(3, 2, 8, 8), + N_filter_shape=(4, 2, 5, 5), + ) + self.validate( + (3, 2, 8, 8), + (4, 2, 5, None), + N_image_shape=(3, 2, 8, 8), + N_filter_shape=(4, 2, 5, 5), + ) + + def test_wrong_info(self): + # Test convolutions when we don't give a constant as shape information + + i = pytensor.scalar.basic.int32() + with pytest.raises(NotScalarConstantError): + self.validate( + (3, 2, 8, i), + (4, 2, 5, 5), + N_image_shape=(3, 2, 8, 8), + N_filter_shape=(4, 2, 5, 5), + ) + with pytest.raises(NotScalarConstantError): + self.validate( + (3, 2, 8, 8), + (4, 2, 5, i), + N_image_shape=(3, 2, 8, 8), + N_filter_shape=(4, 2, 5, 5), + ) + + def test_full_mode(self): + # Tests basic convolution in full mode and case where filter + # is larger than the input image. + + self.validate((3, 2, 5, 5), (4, 2, 8, 8), "full") + + def f(): + self.validate((3, 2, 5, 5), (4, 2, 8, 8), "valid") + + with pytest.raises(Exception): + f() + + def test_wrong_input(self): + # Make sure errors are raised when image and kernel are not 4D tensors + + with pytest.raises(Exception): + self.validate((3, 2, 8, 8), (4, 2, 5, 5), "valid", input=dmatrix()) + with pytest.raises(Exception): + self.validate((3, 2, 8, 8), (4, 2, 5, 5), "valid", filters=dvector()) + with pytest.raises(Exception): + self.validate((3, 2, 8, 8), (4, 2, 5, 5), "valid", input=dtensor3()) + + def test_gcc_crash(self): + # gcc 4.3.0 20080428 (Red Hat 4.3.0-8) + # + # crashed in this following case. I changed the c code to don't hit + # gcc bug. 
So it should not crash anymore + + self.validate((1, 10, 213, 129), (46, 10, 212, 1), "valid", verify_grad=False) + + def speed(self): + n_calls = 20000 + print("n_calls", n_calls) + for border_mode in ["valid", "full"]: + print() + print(border_mode) + for openmp in [False, True]: + print("OpenMP", openmp) + image_shapes = [ + (1, 5, 6, 6), + (10, 5, 6, 6) + # (10, 10, 16, 16), + # (10, 10, 32, 32)] + ] + print("image_shape", image_shapes) + for image_shape in image_shapes: + filter_shapes = [(1, 5, 4, 4), (2, 5, 4, 4), (5, 5, 4, 4)] + print("filter_shapes", filter_shapes) + for filter_shape in filter_shapes: + input = pytensor.shared(np.random.random(image_shape)) + filters = pytensor.shared(np.random.random(filter_shape)) + + with pytest.warns(DeprecationWarning): + output = conv.conv2d( + input, + filters, + image_shape, + filter_shape, + border_mode, + unroll_patch=True, + openmp=openmp, + ) + mode = Mode( + linker=pytensor.link.vm.VMLinker( + allow_gc=False, use_cloop=True + ) + ) + pytensor_conv = pytensor.function([], output, mode=mode) + t1 = time.perf_counter() + pytensor_conv.vm(n_calls=n_calls) + t2 = time.perf_counter() + print(t2 - t1, end=" ") + print() + + def test_infer_shape(self): + # Note: infer_shape is incomplete and thus input and filter shapes + # must be provided explicitly + + rng = np.random.default_rng(280284) + + def rand(*shape): + r = np.asarray(rng.random(shape), dtype="float64") + return r * 2 - 1 + + adtens = dtensor4() + bdtens = dtensor4() + aivec_val = [4, 5, 6, 3] + bivec_val = [7, 5, 3, 2] + adtens_val = rand(*aivec_val) + bdtens_val = rand(*bivec_val) + with pytest.warns(DeprecationWarning): + self._compile_and_check( + [adtens, bdtens], + [ + conv.conv2d( + adtens, bdtens, aivec_val, bivec_val, border_mode="valid" + ) + ], + [adtens_val, bdtens_val], + conv.ConvOp, + excluding=["conv_gemm"], + ) + + with pytest.warns(DeprecationWarning): + self._compile_and_check( + [adtens, bdtens], + [conv.conv2d(adtens, bdtens, aivec_val, bivec_val, border_mode="full")], + [adtens_val, bdtens_val], + conv.ConvOp, + excluding=["conv_gemm"], + ) + + aivec_val = [6, 2, 8, 3] + bivec_val = [4, 2, 5, 3] + adtens_val = rand(*aivec_val) + bdtens_val = rand(*bivec_val) + with pytest.warns(DeprecationWarning): + self._compile_and_check( + [adtens, bdtens], + [ + conv.conv2d( + adtens, bdtens, aivec_val, bivec_val, border_mode="valid" + ) + ], + [adtens_val, bdtens_val], + conv.ConvOp, + excluding=["conv_gemm"], + ) + + with pytest.warns(DeprecationWarning): + self._compile_and_check( + [adtens, bdtens], + [conv.conv2d(adtens, bdtens, aivec_val, bivec_val, border_mode="full")], + [adtens_val, bdtens_val], + conv.ConvOp, + excluding=["conv_gemm"], + ) + + aivec_val = [3, 6, 7, 5] + bivec_val = [5, 6, 3, 2] + adtens_val = rand(*aivec_val) + bdtens_val = rand(*bivec_val) + with pytest.warns(DeprecationWarning): + self._compile_and_check( + [adtens, bdtens], + [ + conv.conv2d( + adtens, bdtens, aivec_val, bivec_val, border_mode="valid" + ) + ], + [adtens_val, bdtens_val], + conv.ConvOp, + excluding=["conv_gemm"], + ) + + with pytest.warns(DeprecationWarning): + self._compile_and_check( + [adtens, bdtens], + [conv.conv2d(adtens, bdtens, aivec_val, bivec_val, border_mode="full")], + [adtens_val, bdtens_val], + conv.ConvOp, + excluding=["conv_gemm"], + ) + + aivec_val = [3, 6, 7, 5] + bivec_val = [5, 6, 2, 3] + adtens_val = rand(*aivec_val) + bdtens_val = rand(*bivec_val) + with pytest.warns(DeprecationWarning): + self._compile_and_check( + [adtens, bdtens], + [ + conv.conv2d( + 
adtens, bdtens, aivec_val, bivec_val, border_mode="valid" + ) + ], + [adtens_val, bdtens_val], + conv.ConvOp, + excluding=["conv_gemm"], + ) + + with pytest.warns(DeprecationWarning): + self._compile_and_check( + [adtens, bdtens], + [conv.conv2d(adtens, bdtens, aivec_val, bivec_val, border_mode="full")], + [adtens_val, bdtens_val], + conv.ConvOp, + excluding=["conv_gemm"], + ) + + aivec_val = [5, 2, 4, 3] + bivec_val = [6, 2, 4, 3] + adtens_val = rand(*aivec_val) + bdtens_val = rand(*bivec_val) + with pytest.warns(DeprecationWarning): + self._compile_and_check( + [adtens, bdtens], + [ + conv.conv2d( + adtens, bdtens, aivec_val, bivec_val, border_mode="valid" + ) + ], + [adtens_val, bdtens_val], + conv.ConvOp, + excluding=["conv_gemm"], + ) + + with pytest.warns(DeprecationWarning): + self._compile_and_check( + [adtens, bdtens], + [conv.conv2d(adtens, bdtens, aivec_val, bivec_val, border_mode="full")], + [adtens_val, bdtens_val], + conv.ConvOp, + excluding=["conv_gemm"], + ) + + +# Test that broadcasting of gradients works correctly when using the +# nnet.conv2d() interface. This was reported in #3763, and uses the example +# code from that ticket. +def test_broadcast_grad(): + x1 = tensor4("x") + sigma = scalar("sigma") + window_radius = 3 + + filter_1d = at.arange(-window_radius, window_radius + 1) + filter_1d = filter_1d.astype(pytensor.config.floatX) + filter_1d = exp(-0.5 * filter_1d**2 / sigma**2) + filter_1d = filter_1d / filter_1d.sum() + + filter_W = filter_1d.dimshuffle(["x", "x", 0, "x"]) + + y = conv2d(x1, filter_W, border_mode="full", filter_shape=[1, 1, None, None]) + # TODO FIXME: Make this a real test and `assert` something + pytensor.grad(y.sum(), sigma) diff --git a/tests/tensor/conv/test_conv3d2d.py b/tests/tensor/conv/test_conv3d2d.py new file mode 100644 index 0000000000..43b02f6328 --- /dev/null +++ b/tests/tensor/conv/test_conv3d2d.py @@ -0,0 +1,237 @@ +import numpy as np +import pytest + +import pytensor + + +try: + from scipy import ndimage +except ImportError: + ndimage = None + +import tests.unittest_tools as utt +from pytensor.compile.sharedvalue import shared +from pytensor.graph.rewriting.basic import check_stack_trace +from pytensor.tensor.conv.conv3d2d import ( + DiagonalSubtensor, + IncDiagonalSubtensor, + conv3d, + get_diagonal_subtensor_view, +) + + +def test_get_diagonal_subtensor_view(wrap=lambda a: a): + x = np.arange(20).reshape(5, 4).astype("float32") + x = wrap(x) + xv01 = get_diagonal_subtensor_view(x, 0, 1) + + # test that it works in 2d + assert np.array_equal(np.asarray(xv01), [[12, 9, 6, 3], [16, 13, 10, 7]]) + + x = np.arange(24).reshape(4, 3, 2) + xv01 = get_diagonal_subtensor_view(x, 0, 1) + xv02 = get_diagonal_subtensor_view(x, 0, 2) + xv12 = get_diagonal_subtensor_view(x, 1, 2) + + # print 'x', x + # print 'xv01', xv01 + # print 'xv02', xv02 + assert np.array_equal( + np.asarray(xv01), [[[12, 13], [8, 9], [4, 5]], [[18, 19], [14, 15], [10, 11]]] + ) + + assert np.array_equal( + np.asarray(xv02), + [ + [[6, 1], [8, 3], [10, 5]], + [[12, 7], [14, 9], [16, 11]], + [[18, 13], [20, 15], [22, 17]], + ], + ) + + # diagonal views of each leading matrix is the same + # as the slices out of the diagonal view of the entire 3d tensor + for xi, xvi in zip(x, xv12): + assert np.array_equal(xvi, get_diagonal_subtensor_view(xi, 0, 1)) + + +def pyconv3d(signals, filters, border_mode="valid"): + Ns, Ts, C, Hs, Ws = signals.shape + Nf, Tf, C, Hf, Wf = filters.shape + + # if border_mode is not 'valid', the signals need zero-padding + if border_mode == "full": 
+ Tpad = Tf - 1 + Hpad = Hf - 1 + Wpad = Wf - 1 + elif border_mode == "half": + Tpad = Tf // 2 + Hpad = Hf // 2 + Wpad = Wf // 2 + else: + Tpad = 0 + Hpad = 0 + Wpad = 0 + + if Tpad > 0 or Hpad > 0 or Wpad > 0: + # zero-pad signals + signals_padded = np.zeros( + (Ns, Ts + 2 * Tpad, C, Hs + 2 * Hpad, Ws + 2 * Wpad), "float32" + ) + signals_padded[ + :, Tpad : (Ts + Tpad), :, Hpad : (Hs + Hpad), Wpad : (Ws + Wpad) + ] = signals + Ns, Ts, C, Hs, Ws = signals_padded.shape + signals = signals_padded + + Tf2 = Tf // 2 + Hf2 = Hf // 2 + Wf2 = Wf // 2 + + rval = np.zeros((Ns, Ts - Tf + 1, Nf, Hs - Hf + 1, Ws - Wf + 1)) + for ns in range(Ns): + for nf in range(Nf): + for c in range(C): + s_i = signals[ns, :, c, :, :] + f_i = filters[nf, :, c, :, :] + r_i = rval[ns, :, nf, :, :] + o_i = ndimage.convolve(s_i, f_i, mode="constant", cval=1) + o_i_sh0 = o_i.shape[0] + # print s_i.shape, f_i.shape, r_i.shape, o_i.shape + r_i += o_i[Tf2 : o_i_sh0 - Tf2, Hf2:-Hf2, Wf2:-Wf2] + return rval + + +def check_diagonal_subtensor_view_traces(fn): + assert check_stack_trace(fn, ops_to_check=(DiagonalSubtensor, IncDiagonalSubtensor)) + + +@pytest.mark.skipif( + ndimage is None or not pytensor.config.cxx, + reason="conv3d2d tests need SciPy and a c++ compiler", +) +@pytest.mark.parametrize("border_mode", ("valid", "full", "half")) +def test_conv3d(border_mode): + if pytensor.config.mode == "FAST_COMPILE": + mode = pytensor.compile.mode.get_mode("FAST_RUN") + else: + mode = pytensor.compile.mode.get_default_mode() + + Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32 + Nf, Tf, C, Hf, Wf = 32, 5, 3, 5, 5 + + signals = ( + np.arange(Ns * Ts * C * Hs * Ws).reshape(Ns, Ts, C, Hs, Ws).astype("float32") + ) + filters = ( + np.arange(Nf * Tf * C * Hf * Wf).reshape(Nf, Tf, C, Hf, Wf).astype("float32") + ) + + # t0 = time.perf_counter() + pyres = pyconv3d(signals, filters, border_mode) + # print(time.perf_counter() - t0) + + s_signals = shared(signals) + s_filters = shared(filters) + s_output = shared(signals * 0) + + out = conv3d( + s_signals, + s_filters, + signals_shape=signals.shape, + filters_shape=filters.shape, + border_mode=border_mode, + ) + + newconv3d = pytensor.function([], [], updates={s_output: out}, mode=mode) + + check_diagonal_subtensor_view_traces(newconv3d) + # t0 = time.perf_counter() + newconv3d() + # print(time.perf_counter() - t0) + utt.assert_allclose(pyres, s_output.get_value(borrow=True)) + gsignals, gfilters = pytensor.grad(out.sum(), [s_signals, s_filters]) + gnewconv3d = pytensor.function( + [], + [], + updates=[(s_filters, gfilters), (s_signals, gsignals)], + mode=mode, + name="grad", + ) + check_diagonal_subtensor_view_traces(gnewconv3d) + + # t0 = time.perf_counter() + gnewconv3d() + # print("grad", time.perf_counter() - t0) + + Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5 + Nf, Tf, C, Hf, Wf = 4, 2, 3, 2, 2 + + rng = np.random.default_rng(280284) + + signals = rng.random((Ns, Ts, C, Hs, Ws)).astype("float32") + filters = rng.random((Nf, Tf, C, Hf, Wf)).astype("float32") + utt.verify_grad( + lambda s, f: conv3d(s, f, border_mode=border_mode), + [signals, filters], + eps=1e-1, + mode=mode, + ) + + # Additional Test that covers the case of patched implementation for filter with Tf=1 + Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32 + Nf, Tf, C, Hf, Wf = 32, 1, 3, 5, 5 + + signals = ( + np.arange(Ns * Ts * C * Hs * Ws).reshape(Ns, Ts, C, Hs, Ws).astype("float32") + ) + filters = ( + np.arange(Nf * Tf * C * Hf * Wf).reshape(Nf, Tf, C, Hf, Wf).astype("float32") + ) + + # t0 = time.perf_counter() + pyres = pyconv3d(signals, filters, 
border_mode) + # print(time.perf_counter() - t0) + + s_signals = shared(signals) + s_filters = shared(filters) + s_output = shared(signals * 0) + + out = conv3d( + s_signals, + s_filters, + signals_shape=signals.shape, + filters_shape=filters.shape, + border_mode=border_mode, + ) + + newconv3d = pytensor.function([], [], updates={s_output: out}, mode=mode) + + # t0 = time.perf_counter() + newconv3d() + # print(time.perf_counter() - t0) + utt.assert_allclose(pyres, s_output.get_value(borrow=True)) + gsignals, gfilters = pytensor.grad(out.sum(), [s_signals, s_filters]) + gnewconv3d = pytensor.function( + [], + [], + updates=[(s_filters, gfilters), (s_signals, gsignals)], + mode=mode, + name="grad", + ) + + # t0 = time.perf_counter() + gnewconv3d() + # print("grad", time.perf_counter() - t0) + + Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5 + Nf, Tf, C, Hf, Wf = 4, 1, 3, 2, 2 + + signals = rng.random((Ns, Ts, C, Hs, Ws)).astype("float32") + filters = rng.random((Nf, Tf, C, Hf, Wf)).astype("float32") + utt.verify_grad( + lambda s, f: conv3d(s, f, border_mode=border_mode), + [signals, filters], + eps=1e-1, + mode=mode, + ) diff --git a/tests/tensor/signal/__init__.py b/tests/tensor/signal/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/tensor/signal/test_conv.py b/tests/tensor/signal/test_conv.py new file mode 100644 index 0000000000..c2d1475ca0 --- /dev/null +++ b/tests/tensor/signal/test_conv.py @@ -0,0 +1,30 @@ +from functools import partial + +import numpy as np +import pytest +from scipy.signal import convolve as scipy_convolve + +from pytensor import function +from pytensor.tensor import vector +from pytensor.tensor.signal.conv import convolve +from tests import unittest_tools as utt + + +@pytest.mark.parametrize("kernel_shape", [3, 5, 8], ids=lambda x: f"kernel_shape={x}") +@pytest.mark.parametrize("data_shape", [3, 5, 8], ids=lambda x: f"data_shape={x}") +@pytest.mark.parametrize("mode", ["full", "valid", "same"]) +def test_convolve(mode, data_shape, kernel_shape): + data = vector("data") + kernel = vector("kernel") + op = partial(convolve, mode=mode) + + rng = np.random.default_rng() + data_val = rng.normal(size=data_shape) + kernel_val = rng.normal(size=kernel_shape) + + fn = function([data, kernel], op(data, kernel)) + np.testing.assert_allclose( + fn(data_val, kernel_val), + scipy_convolve(data_val, kernel_val, mode=mode), + ) + utt.verify_grad(op=lambda x: op(x, kernel_val), pt=[data_val])
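
For reference, here is a minimal usage sketch of the 1-d ``convolve`` exercised by the new ``tests/tensor/signal/test_conv.py`` above. It assumes only the interface shown in that test (``pytensor.tensor.signal.conv.convolve(data, kernel, mode=...)`` with ``mode`` one of ``"full"``, ``"valid"`` or ``"same"``) and, like the test, checks the compiled graph against SciPy's reference implementation; it is a sketch, not a definitive API description::

    import numpy as np
    from scipy.signal import convolve as scipy_convolve

    import pytensor
    from pytensor.tensor import vector
    from pytensor.tensor.signal.conv import convolve

    # Symbolic 1-d inputs; dtype is pinned to float64 so the example does not
    # depend on the configured floatX.
    data = vector("data", dtype="float64")
    kernel = vector("kernel", dtype="float64")

    # "same" keeps the output length equal to the data length;
    # "full" and "valid" follow the usual convolution conventions.
    out = convolve(data, kernel, mode="same")
    fn = pytensor.function([data, kernel], out)

    rng = np.random.default_rng(0)
    data_val = rng.normal(size=8)
    kernel_val = rng.normal(size=3)

    # The compiled graph should agree with SciPy's reference result,
    # mirroring the assertion made in the parametrized test above.
    np.testing.assert_allclose(
        fn(data_val, kernel_val),
        scipy_convolve(data_val, kernel_val, mode="same"),
    )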