From 286822a7ee168a34aaeb79b64dc5860fa0554070 Mon Sep 17 00:00:00 2001 From: CoprDistGit Date: Mon, 15 May 2023 05:05:38 +0000 Subject: automatic import of python-cntkx --- .gitignore | 1 + python-cntkx.spec | 3075 +++++++++++++++++++++++++++++++++++++++++++++++++++++ sources | 1 + 3 files changed, 3077 insertions(+) create mode 100644 python-cntkx.spec create mode 100644 sources diff --git a/.gitignore b/.gitignore index e69de29..e561edc 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1 @@ +/cntkx-0.1.54.tar.gz diff --git a/python-cntkx.spec b/python-cntkx.spec new file mode 100644 index 0000000..bc17527 --- /dev/null +++ b/python-cntkx.spec @@ -0,0 +1,3075 @@ +%global _empty_manifest_terminate_build 0 +Name: python-cntkx +Version: 0.1.54 +Release: 1 +Summary: Extension library of Microsoft Cognitive Toolkit +License: MIT License +URL: https://github.com/delzac/cntkx +Source0: https://mirrors.nju.edu.cn/pypi/web/packages/73/99/51046237804d7a9c2af64f8804c816552a6ae83476fb4247ef4b6621ab2a/cntkx-0.1.54.tar.gz +BuildArch: noarch + + +%description +# CNTKx +CNTKx is a deep learning library that builds on and extends Microsoft Cognitive Toolkit [CNTK](https://github.com/Microsoft/CNTK). +Despite the last planned release of cntk 2.7, cntkx will continue to be in active development, more models and pre-built components coming soon! + +Feel free to open an issue for any request or a PR to contribute :) + +## Installation +cntkx is written in pure python and cntk is a dependency to it. Please get a working installation of cntk first. Then: + + pip install cntkx + +cntkx only works with `python>=3.6` + + +## Available Components +| ops | Description | +| --- | ---| +| `floor_division` | element-wise floor_division | +| `remainder` | element-wise remainder of division | +| `scalar` | cast tensor to scalar (1,) | +| `cumsum` | Cumulative summation along axis | +| `upsample` | Upsample by k factor (for image) | +| `centre_crop` | Crop centre of image | +| `swish` | Activation | +| `mish` | Activation | +| `hardmax` | Activation | +| `erf` | Error function | +| `gelu` | Gaussian Error Linear Unit function | +| `gelu_fast` | fast approximation of Gaussian Error Linear Unit function | +| `sequence.pad` | Pad at start or end of sequence axis | +| `sequence.pad_to` | Pad a sequence to have the same length as another sequence | +| `sequence.length` | length of sequence | +| `sequence.position` | position of every sequence element | +| `sequence.stride` | strides across sequential axis | +| `sequence.join` | joins two sequence along their sequential axis | +| `sequence.window` | creates sliding window along the sequence axis | +| `sequence.window_causal` | creates causal sliding window along the sequence axis | +| `sequence.reverse` | reverses the items along the dynamic sequence axis | +| `sequence.reduce_mean` | calculates the mean along the dynamic sequence axis | +| `sequence.pad_ctc_labels` | padded ctc labels to be the same sequence length as the network output | +| `sequence.reduce_concat_pool` | drop-in replace for sequence.last | +| `random.sample` | Samples an unnormalised log probability distribution | +| `random.sample_with_bias` | Samples an unnormalised log probability distribution over-weighted to more probable classes | +| `random.sample_top_k` | Samples from the top_k of an unnormalised log probability distribution | +| `batchmatmul` | Batch Matrix Multiplication on a static batch axis, similar to tf.matmul | + +| Layers | Description | +| --- | ---| +| `QRNN` | Quasi-Recurrent Neural 
Network | +| `Recurrence` | With option to apply `VariationalDroppout` | +| `PyramidalBiRecurrence` | Pyramidal bi-directional recurrence | +| `VariationalDropout` | Single binary dropout mask for entire sequence | +| `SinusoidalPositionalEmbedding` | Non-learnable positional embedding (no max sequence length) | +| `PositionalEmbedding` | Learnable Positional Embedding (used in BERT) | +| `BertEmbeddings` | BERT Embeddings (word + token_type + positional) | +| `BertPooler` | Pooler used in BERT | +| `SpatialPyramidPooling` | Fixed pooled representation regardless of image input size | +| `GatedLinearUnit` | Gated Convolutional Neural Network | +| `ScaledDotProductAttention` | Attention used in BERT and Transformer (aka 'attention is all you need') | +| `MultiHeadAttention` | Attention used in BERT and Transformer (aka 'attention is all you need') | +| `GaussianWindowAttention` | Windowed attention instead of conventional attention where everything is attended at the same time | +| `SequentialDense` | Applies Dense to a window of sequence item along sequence axis | +| `SequentialMaxPooling` | Max pool across sequential axis and static axes | +| `SequentialAveragePooling` | Average pool across sequential axis and static axes | +| `SequentialConcatPooling` | Concat Average and Mean pool across sequential axis and static axes | +| `vFSMN` | Vectorised Feedforward Sequential Memory Networks | +| `cFSMN` | Compact Feedforward Sequential Memory Networks | +| `BiRecurrence` | BiRecurrence recurrent layer with weight tying option to half parameter requirement | +| `GlobalConcatPooling` | Global spatial concat pooling of ave and mean | +|`FilterResponseNormalization`| Drop in replacement for batch norm with superior performance | +|`Boom`| More parametrically efficient alternative to Position-Wise FeedForward layer found in Transformer | +|`GaussianAttentionSeqImage`| Memory efficient attention that used 2d gaussian filters for images | +| `SequenceDropout` | Dropout entire sequence elements | +| `SEBlock` | Squeeze and Excitation block | +| `SequenceSEBlock` | Squeeze and Excitation block for variable width image sequence | +| `SIREN` | Sinusoidal Representation Network | +| `LinearAttention` | Linearised form of dot-product attention with linear memory complexity | +| `LinearAttentionModel` | Wrapper to `LinearAttention` | + + +| Blocks | Description | +| --- | ---| +| `WeightDroppedLSTM` | A form of regularised LSTM | +| `IndyLSTM` | A parameter efficient form of LSTM | +| `IndRNN` | a RNN with long memory and can be stacked deeply | + +| Loss | Description | +| --- | ---| +| `gaussian_mdn_loss` | loss function when using Mixture density network | +| `focal_loss_with_softmax` | A kind of cross entropy that handles extreme class imbalance | +| `cross_entropy_with_softmax` | Added `label smoothing regularisation` in cross entropy with softmax | +| `generalised_robust_barron_loss` | generalised robust loss | + +| Models | Description | +| --- | ---| +| `VGG` | Image Classification | +| `UNET` | Semantic Segmentation | +| `Transformer` | Language Modelling | +| `MDN` | Mixture Density Networks | + + +| Pre-trained models | Description | +| --- | ---| +| `Bert` | Bidirectional Encoder Representations from Transformers | +| [fwd_wt103.hdf5](https://1drv.ms/u/s!AjJ4XyC3prp8mItNxiawGK4gD8iMhA?e=wh7PLB) | The weight parameters of the fastai's pytorch model. 
To be used to initialise `PretrainedWikitext103LanguageModel` | +| [fwd_wt103.cntk](https://1drv.ms/u/s!AjJ4XyC3prp8mItPBdfmDYr9QP7J4w?e=k1BXlW) | The converted cntk model of fastai's pytorch model. To be used with `C.load_model` | +| [fwd_wt103.onnx](https://1drv.ms/u/s!AjJ4XyC3prp8mItO70T_q8HOPwa6aQ?e=h2Fiv5) | The converted ONNX model of fastai's pytorch model. | + + +| Learners | Description | +| --- | ---| +| `CyclicalLearningRate` | a method to eliminate the need to find best value and schedule for learning rate | +| `RAdam` | a variant of `Adam` that doesn't require any warmup | + +| Misc | Description | +| --- | ---| +| `CTCEncoder` | Helper class to convert data into a format acceptable for cntk's ctc implementation | + + +## C# CNTK Tutorials +This library is implemented in pure cntk python API. For help in cntk c#, you can refer to the two repository +[deep-learning-with-csharp-and-cntk](https://github.com/anastasios-stamoulis/deep-learning-with-csharp-and-cntk) +and [DeepBelief_Course4_Examples](https://github.com/AllanYiin/DeepBelief_Course4_Examples). + + +## F# CNTK +For the F# wrapper of CNTK please visit [FsCNTK](https://github.com/fwaris/FsCNTK), +it also contains some example implementations like seq2seq, autoencoder, LSTM, GAN. + + +## News +***2021-04-04*** +### Added new kernel method into `LinearAttention` +Previous `LinearAttention` uses implemntation from [Transformers are RNNs:Fast Autoregressive Transformers with Linear Attention](https://arxiv.org/abs/2006.16236) that uses `elu`. +`elu` as kernel may not perform well in certain task (e.g. ASR) as documented in the paper. + +Hence, new kernel trick uses `softmax` from [Efficient Attention: Attention with Linear Complexities](https://arxiv.org/abs/1812.01243). + + + +***2020-09-06*** +### Added `sequence.pad_ctc_labels` +Pads the ctc label sequence to the same sequence length as the network output. +This should be used when the final sequence length of the network output cannot be determined +beforehand during the pre-processing of the ctc_labels. Thus, the padding is done during training runtime +instead of during the data pipeline processing. + +The padding token would be the last sequence element of `ctc_labels`. `ctc_labels` should be +a one hot encoded vector sequence. The padding token will have the value of 1 in its one-hot encoded vector. + + +***2020-06-20*** +### Added `LinearAttention` and `LinearAttentionModel` +Added cntk implementation of `LinearAttention` and `LinearAttentionModel`. +Standard dot-product attention found in `Transformer` is quadratic in time and gpu memory complexity +with respect to sequence length. The `C.layers.AttentionModel` is also quadratic. + +However, `LinearAttention` is a linearised form of dot-product attention. It is linear in time and gpu memory complexity +with respect to sequence length. This is especially significant since `cntk` doesn't have any checkpointing function +compared to other frameworks like `tensorflow` and `pytorch`, where it saves gpu memory at the expense of some +computation overhead. Now, with `LinearAttention` training of `Transformer` can be done on `cntk` since the + gpu memory requirement is drastically reduced. + +`LinearAttentionModel` is wrapper for `LinearAttention` in the style of `C.layers.AttentionModel`. + +For more details refer to [Transformers are RNNs:Fast Autoregressive Transformers with Linear Attention](https://arxiv.org/abs/2006.16236) by Katharopoulos et al. 
+ +Example: + + a = C.sequence.input_variable(24) + b = LinearAttention(hidden_dim=32, model_dim=24)(a, a, a) + + assert b.shape == (32, ) + + +***2020-06-20*** +### Added `sequence.pad_to` +`sequence.pad_to` pads a shorter sequence to the same sequence length as another longer sequence. +This is especially useful in CTC training where both sequences need to have the same sequence length + +Example: + + ax1 = C.Axis.new_unique_dynamic_axis('ax1') + ax2 = C.Axis.new_unique_dynamic_axis('ax2') + a = C.sequence.input_variable(3, sequence_axis=ax1) + b = C.sequence.input_variable(6, sequence_axis=ax2) + + c = Cx.sequence.pad_to(a, b) # pad to same sequence length + + ctc_token = C.Constant(np.array([0, 0, 1])) + d = C.element_select(C.equal(c, 0), ctc_token, c) # swap padded zeros with ctc token + + +***2020-06-19*** +### Added `SIREN` +`SIREN` leverages periodic activation functions for implicit neural representations and demonstrate +that these networks are ideally suited for representing complex natural signals and their derivatives. + +The results are incredible. It is highly recommended that you check out their [project page](https://vsitzmann.github.io/siren/). + +More details can be found in their paper [Implicit Neural Representations with PeriodicActivation Functions](https://arxiv.org/abs/2006.09661) + + + +***2020-05-01*** +### Added `SEBlock` and `SequenceSEBlock` +`SEBlock` and `SequenceSEBlock` are squeeze-excitation blocks that adaptively recalibrates channel-wise feature +responses by explicitly modelling interdependencies between channels. It was show that these blocks can be +stacked together to form SENet architectures that generalise extremely effectively across different datasets. + +Its the winner for ILSVRC2017 classification submission. More details can be found in the +paper [Squeeze-and-Excitation Networks](https://arxiv.org/abs/1709.01507) by Jie hu et al. + + +***2020-04-20*** +### Added `SequenceDropout` +`SequenceDropout` dropouts entire sequence elements along the dynamic sequence axis. + + + +***2020-04-15*** +### Added `GaussianAttentionSeqImage` +`GaussianAttentionSeqImage` is a 2d spatial gaussian attention. +To use, the encoded image should be formulated as a cntk sequence (example below). This can be useful when +you are constraint by gpu memory as 2d gaussian attention is more memory efficient than standard attention. + +This is from the deepmind paper, DRAW: A Recurrent Neural Network for Image Generation by Gregor et al +More details can be found in the following https://arxiv.org/abs/1502.04623 + +Example: + + n = 5 + num_channels = 3 + image_height = 64 + expected_image_width = 1000 + image_seq = C.sequence.input_variable((num_channels, image_height)) # rgb image with variable width and fixed height + decoder_hidden_state = ... # from decoder somewhere in the network + attended_image = Cx.layers.GaussianAttentionSeqImage(n, image_height, expected_image_width)(image_seq, decoder_hidden_state) + + assert attended_image.shape == (num_channels, n, n) + + +***2020-03-29.*** +#### Added `generalised_robust_barron_loss` and `sample_with_bias` +`generalised_robust_barron_loss` is a generalisation for generalization of the Cauchy/Lorentzian, +Geman-McClure, Welsch/Leclerc, generalized Charbonnier, Charbonnier/pseudo-Huber/L1-L2, and +L2 loss functions. + +Can be used as a drop-in replacement in any regression task that you have. + +For more details, please refer to [A General and Adaptive Robust Loss Function](https://arxiv.org/abs/1701.03077), Jonathan T. 
Barron, +or this [video](https://www.youtube.com/watch?v=BmNKbnF69eY). +It is the Best Paper Award Finalist in CVPR 2019. + + +Implemented `sample_with_bias` to sample more likely classes as a replacement +for `sample_top_k` which cannot be used inside a `UnfoldFrom`. `sample_with_bias` reduces sampling variance especially when you have a long tail. + + + +***2020-02-11.*** +#### Added `Boom` +Boom layer from SHA-RNN by S. Merity creator of QRNN. Alternative to PositionwiseFeedForward. Serves the same function as +PositionwiseFeedForward found in transformer. + +Boom layer minimizes computation and removes an entire matrix of parameters compared to traditional down-projection layers. + +For more detail please read: [Single Headed Attention RNN: Stop Thinking With Your Head](https://arxiv.org/abs/1911.11423) by Stephen Merity. + + +***2019-12-03.*** +#### Added `FilterResponseNormalization` and `ThresholdedLinearUnit` +Added cntk implementation of `FilterResponseNormalization`. Filter Response Normalization (FRN) layer +is a novel combination of a normalization and an activation function, +that can be used as a drop-in replacement for other normalizations and activations. + +The method operates on each activation map of each batch sample +independently, eliminating the dependency on other batch samples or channels of the same sample. +The method outperforms BN and all alternatives in a variety of settings for all batch sizes. +FRN layer performs ≈0.7−1.0% better on top-1 validation accuracy than BN with large mini-batch sizes on +Imagenet classification on InceptionV3 and ResnetV2-50 architectures. Further, it performs >1% better +than GN on the same problem in the small mini-batch size regime. For object detection problem on COCO dataset, +FRN layer outperforms all other methods by at least 0.3−0.5% in all batch size regimes. + +Please refer to the paper [Filter Response Normalization Layer: Eliminating Batch Dependence +in the Training of Deep Neural Networks](https://arxiv.org/abs/1911.09737v1) for more details . + + + +***2019-11-04.*** +#### Added `ops.floor_division`, `ops.remainder`, `sequence.window_causal` and `SequentialDense` +Add in new operations stated above. `sequence.window` now has an additional argument that lets you control striding. +`sequence.window_causal` creates causal window that doesn't leak future information into the past (preserve causality). +`SequentialDense` convenience layer added to apply dense to a window of sequence item, +much like `SequentialConvolution` but with better memory performance. + + + +***2019-11-04.*** +#### Added `SequentialConcatPooling`, `Cx.Sequence.reduce_concat_pool` and `GlobalConcatPooling` +`Cx.Sequence.reduce_concat_pool` concatenates the last item in the sequence axis with the summarisation +of the sequence represented by `reduce_max` and `reduce_mean` of the sequence axis. Anytime `C.sequence.last` is used, +this can be a drop-in replacement. + +Example: + + n = 32 + a = C.sequence.input_variable(n) + b = Cx.sequence.reduce_concat_pool(a) + + assert b.shape == (n * 3, ) + + +`SequentialConcatPooling` does spatial concat pooling over the sequential axis. +Concat pooling is the concatenation of both average pooling and max pooling. In any situation where max or ave +pooling is appropriate, concat pooling can be used as a drop-in replacement and achieve improvements in performance. 
+ +Example: + + a = C.sequence.input_variable((3, 10)) + b = SequentialConcatPooling(filter_shape=(2, 2), strides=2)(a) + + assert b.shape == (6, 10) + + n = [np.random.random((3, 10)).astype(np.float32), + np.random.random((3, 10)).astype(np.float32), + np.random.random((3, 10)).astype(np.float32), + np.random.random((3, 10)).astype(np.float32), ] + + print(b.eval({a: n})) + + +`GlobalConcatPooling` is the standard spatial concat pooling of both max pool and ave pool. + + +***2019-10-15.*** +#### Added `mish` activation function and `RAdam` learner. +`mish` is an activation function is introduced in [Mish: A Self Regularized Non-Monotonic Neural Activation Function](https://arxiv.org/abs/1908.08681v2) +by Diganta Misra. Experiments show that `mish` tends to work better than both ReLU and Swish along with other standard +activation functions in many deep networks across challenging datasets. For instance, +in Squeeze Excite Net-18 for CIFAR 100 classification, the network with Mish had an increase in +Top-1 test accuracy by 0.494% and 1.671% as compared to the same network with Swish and ReLU respectively. +The similarity to Swish along with providing a boost in performance and its simplicity in implementation +makes it easier for researchers and developers to use Mish in their Neural Network Models. + +This activation function is adopted in `fast ai`. + +Rectified Adam or `RAdam` is a `Adam` optimiser variant that doesn't require a warmup schedule, which `Adam` tends +to need to maintain stability. In this cntk implementation, we added a `RAdam` like optimiser based +on the work of [On the adequacy of untuned warmup for adaptive optimization](https://arxiv.org/abs/1910.04209) by Jerry Ma and Denis Yarats. + +`RAdam` is adopted in `fast ai` too. + + + +***2019-09-30.*** +#### Added `BiRecurrence` with weight tying +Made improvement to weight tying of BiRecurrence by have one parameter tensor token for every state +in the step function per direction (forward and backward). This will allow forward and backward token +to have more representational flexibility. Previously, all states use the same forward or backward token. + + +***2019-09-06.*** +#### Added `BiRecurrence` with weight tying +Add a wrapper to create a bidirectional recurrent layer using `BiRecurrence`. Included in the implementation +is an option to half the number of parameters required by bidirectional recurrent layer. This is done +by only using one recurrent unit to do both forward and backward computation instead of the usual two. +A forward and backward token is used to initialise the hidden state so that the recurrent unit can tell +the directionality. + +More details can be found in the paper [Efficient Bidirectional Neural Machine Translation](https://arxiv.org/abs/1908.09329) + +Example: + + a = C.sequence.input_variable(10) + b = BiRecurrence(LSTM(100), weight_tie=True)(a) + + assert b.shape == (200, ) + + +***2019-08-29.*** +#### Added `PretrainedWikitext103LanguageModel` +CNTK implementation of Fast AI's Universal Language ModelFine-tuning (ULMFiT) English model has been added. +This language model was trained on Wikitext-103 and can be used as a base model for any downstream language task. + +It is also much more efficient to run compare to BERT and other Transformer language models. 
+ +For more details, you can refer to the original paper [here](https://arxiv.org/abs/1801.06146) + +Example: + + vocab_size = 238462 + converted_hdf5_model_file_path = 'PATH/fwd_wt103.hdf5' # this is not the original pytorch model + lm = PretrainedWikitext103LanguageModel(converted_hdf5_model_file_path) + + a = C.sequence.input_variable(vocab_size) + prediction = lm(a) # next-word-prediction + features = prediction.features # features of tokens + + assert prediction.shape == (vocab_size, ) + assert features.shape == (400, ) + +| Model | Description | +| --- | ---| +| [fwd_wt103.hdf5](https://1drv.ms/u/s!AjJ4XyC3prp8mItNxiawGK4gD8iMhA?e=wh7PLB) | The weight parameters of the fastai's pytorch model. To be used to initialise `PretrainedWikitext103LanguageModel` | +| [fwd_wt103.cntk](https://1drv.ms/u/s!AjJ4XyC3prp8mItPBdfmDYr9QP7J4w?e=k1BXlW) | The converted cntk model of fastai's pytorch model. To be used with `C.load_model` | +| [fwd_wt103.onnx](https://1drv.ms/u/s!AjJ4XyC3prp8mItO70T_q8HOPwa6aQ?e=h2Fiv5) | The converted ONNX model of fastai's pytorch model. | +| [itos_wt103.pkl](http://files.fast.ai/models/wt103/) | Tokens used in pretrained model | + + +***2019-08-08.*** +#### Added `cntkx.ops.gelu` and `cntkx.ops.gelu_fast` +Added two cntk implementation of `gelu` activation function. `gelu` activation is used in `BERT` +and OpenAI's `GPT` and `GPT-2`. + +Gaussian Error Linear Unit (GELU), a high-performing neuralnetwork activation function. +The GELU nonlinearity is the expected transforma-tion of a stochastic regularizer which randomly +applies the identity or zero mapto a neuron’s input. The GELU nonlinearity weights inputs by their +magnitude,rather than gates inputs by their sign as in ReLUs. + +For more detail please refer to [Gaussian Error Linear Units (GELU)](https://arxiv.org/abs/1606.08415) +by Hendrycks and Gimpel. + + +***2019-07-04.*** +#### Added `cntkx.ops.sequence.reduce_mean` +Calculates the mean along the dynamic sequential axis in CNTK. + + +***2019-06-26.*** +#### Added `cntkx.ops.sequence.reverse` +Allows the sequence items along the sequence axis to be reversed. This is useful when you want to create a +Bi-directional Auto-Regressive layer because using UnfoldFrom does not work with Recurrence(go_backwards=True) and + will result in a ValueError. + + +Example: + + import cntk as C + import cntkx as Cx + from cntk.layers import Recurrence, UnfoldFrom, LSTM + + hidden_dim = 50 + start_token = C.Constant(0, shape=(hidden_dim,)) + a = C.sequence.input_variable(1, name='seq1') + + b = UnfoldFrom(Recurrence(LSTM(hidden_dim), go_backwards=True))(start_token, a) + + n = [np.random.random((10, hidden_dim)).astype(np.float32),] + + # This raise 'ValueError: It is not allowed to have multiple different stepping directions in the same loop' + b.eval({b.arguments[0]: n}) + + +The workaround would be: + + import cntk as C + import cntkx as Cx + from cntk.layers import Recurrence, UnfoldFrom, LSTM + + hidden_dim = 50 + start_token = C.Constant(0, shape=(hidden_dim,)) + a = C.sequence.input_variable(1, name='seq1') + a_reversed = Cx.sequence.reverse(a) + + b = UnfoldFrom(Recurrence(LSTM(hidden_dim)))(start_token, a_reversed) # remove go_backwards=True + + n = [np.random.random((10, hidden_dim)).astype(np.float32),] + b.eval({b.arguments[0]: n}) # this will run just fine + + + +***2019-06-21.*** +#### Added `cntkx.ops.random.sample_top_k` +CNTK implementation of that allows sampling of the top_k unnormalised log-probabilities distribution. 
+This is useful text (or sequence) generation where it is known that greedy decoding will cause +text degeneration. + +For more details on this, please refer to [A curious case of neural text degeneration](https://arxiv.org/abs/1904.09751) + + +Example: + + import cntk as C + import cntkx as Cx + + a = C.input_variable(5) + b = Cx.random.sample_top_k(a, k=3, num_classes=5) + + n = np.array([[1, 2, 3, 4, 5],] * 1000) + + results = b.eval({a: n}) + assert np.sum(results[:, :2]) == 0 + assert np.sum(results[:, 2:]) == 1000 + +***2019-05-04.*** +#### Added `cntkx.layers.vFSMN` and `cntkx.layers.cFSMN` +CNTK implementation of Bidirectional vectorised Feedforward Sequential Memory Network (vFSMN) +and Compact Feedforward Sequential Memory Network (cFSMN). + +FSMN is a standard fully-connected feedforward neural network equipped +with some learnable memory blocks in its hidden layers. The memory blocks +use a tapped-delay line structure to encode the long context information into +a fixed-size representation as short-term memory mechanism. + +The authors claim that FSMNs can be learned much more reliably and faster than +RNNs or LSTMs due to the inherent non-recurrent model structure while significantly +outperforming RNNs in language and speech modeling. + +For more details please refer to [Feedforward Sequential Memory Networks: A +New Structure to Learn Long-term Dependency](https://arxiv.org/abs/1512.08301) + +cFSMN is a compact version of FSMN that can result in a reduction of up +to 60% in model size and speed up the learning by more than 7 times while +still significantly outperforming the popular bi-direction LSTMs for both +frame-level cross-entropy (CE) criterion based training and MMI based sequence training. + +For more details please refer to "Compact Feedforward Sequential Memory Networks for +Large VocabularyContinuous Speech Recognition" by Zhang, et al. + +Example: + + import cntk as C + from cntkx.layers import vFSMN, cFSMN + + # unidirectional vFSMN (only past conext used) + a = C.sequence.input_variable(10) + b = vFSMN(100, C.relu, num_past_context=3, num_future_context=0)(a) + + assert b.shape == (100,) + + # bidirectional vFSMN (enable both past and future context) + a = C.sequence.input_variable(10) + b = vFSMN(120, C.relu, num_past_context=3, num_future_context=3)(a) + + assert b.shape == (120,) + + # bidirectional cFSMN (enable both past and future context) + a = C.sequence.input_variable(100) + b = cFSMN(120, 50, C.relu, num_past_context=3, num_future_context=3)(a) + + assert b.shape == (120,) + + +***2019-04-19.*** +#### Added `cntkx.misc.CTCEncoder` +CNTK's CTC implementation requires that data be formatted in a particular way that's typically in acoustic +modeling but unusual in other applications. So class provides an easy way to convert data between +what users typically expect and what cntk demands. 
+ +Example: + labels = ['a', 'b', 'c'] + encoder = CTCEncoder(labels) + + labels_tensor = C.sequence.input_variable(len(encoder.classes_)) # number of classes = 4 + input_tensor = C.sequence.input_variable(100) + + labels_graph = cntk.labels_to_graph(labels_tensor) + network_out = model(input_tensor) + + fb = C.forward_backward(labels_graph, network_out, blankTokenId=encoder.blankTokenId) + + ground_truth = ['a', 'b', 'b', 'b', 'c'] + seq_length = 10 # must be the same length as the sequence length in network_out + + fb.eval({input_tensor: [...], + labels_tensor: [encoder.transform(ground_truth, seq_length=seq_length)]}) + + + +***2019-04-14.*** +#### Added `Label Smoothing Regularization`, `seqeuence.window` and `PyramidalBiRecurrence` +Added `Label Smoothing Regularization` in `cross_entropy_with_softmax`. +Added `sequence.window` that creates non-overlapping window along the sequence axis thereby reducing the +sequence length and increasing the dimension by the same factor. + +Implemented a convenience layer used in acoustic modelling known as `PyramidalBiRecurrence`. Used to create +pyramidal bi-directional LSTM (BLSTM) found in "Listen, attend and spell" by Chan et al. (https://arxiv.org/abs/1508.01211) +Typically used to down sample the sequence length to make memory and runtime manageable. + + +***2019-04-08.*** +#### Added `cntkx.ops.sequence.join` +Added a new `op` called `join` where two sequence tensors can be joined along with sequence axis forming a longer sequence. + + +***2019-04-08.*** +#### Added `cntkx.layers.SequentialAveragePooling` +Add average pooling layer that works with sequential axis. Current cntk `AveragePooling` doesn't pool across sequence elements. + +Example on `cntkx.layers.SequentialAveragePooling` + + # rgb image of height 25 and variable width + a = C.sequence.input_variable((3, 25)) + + # Convolute across image with (3, 3) kernel with stride (1, 1) + b = C.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, stride=(1, 1), pad=True)(a) + + assert b.shape == (16, 25) + + # max pool (2,2) in height and width with stride (2,2) in height and width, no padding + c = SequentialAveragePooling(filter_shape=(2, 2), strides=(2, 2), pad=False)(b) + + assert c.shape == (16, 12) + + +***2019-04-07.*** +#### Added `cntkx.sequence.stride` and `cntkx.ops.scalar` +`Cx.sequence.stride` enables striding across the sequential axis, selecting every integer items along the sequence. +`Cx.scalar` converts tensor into scalar of shape `(1,)` + + + +***2019-04-06.*** +#### Added `IndyLSTM` and `IndRNN` +CNTK implementation of [Independently Recurrent Long Short-term Memory cells: IndyLSTMs](https://arxiv.org/abs/1903.08023) +by Gonnet and Deselaers, and [Independently Recurrent Neural Network (IndRNN): Building A Longer andDeeper RNN](https://arxiv.org/abs/1803.04831) +by Li, et al. + +Both `IndyLSTM` and `IndRNN` have hidden-to-hidden weights that are diagonal matrix instead of the usual full matrix. +Thus neurons in each layer are independent from each other, and the cross-channel information is +obtained through stacking multiple layers. + +`IndRNN` allows for the use of `C.relu` activation thus allowing multiple `IndRNN` layers to be stacked together deeply. + +`IndyLSTM` has parameters linear to the number of nodes in the linear, as opposed to standard LSTM that is quadratic +making `IndyLSTM` potentially faster and smaller as a model. + +Authors of both `IndRNN` and `IndyLSTM` have claimed performance as good as or even better than regular LSTMs. 
+ +Example: + + import cntk as C + from cntkx.layers import IndyLSTM, IndRNN, Recurrence + + a = C.sequence.input_variable(10) + b = Recurrence(IndRNN(20))(a) + c = Recurrence(IndyLSTM(20))(a) + + assert b.shape == c.shape == (20,) + + + +***2019-03-24.*** +#### Added `cntkx.layers.SequentialMaxPooling` +Add max pooling layer that works with sequential axis. Current cntk `MaxPooling` doesn't pool across sequence elements. + +Example on `cntkx.layers.SequentialMaxPooling` + + # rgb image of height 25 and variable width + a = C.sequence.input_variable((3, 25)) + + # Convolute across image with (3, 3) kernel with stride (1, 1) + b = C.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, stride=(1, 1), pad=True)(a) + + assert b.shape == (16, 25) + + # max pool (2,2) in height and width with stride (2,2) in height and width, no padding + c = SequentialMaxPooling(filter_shape=(2, 2), strides=(2, 2), pad=False)(b) + + assert c.shape == (16, 12) + + +***2019-03-18.*** +#### Added `cntkx.learners.CyclicalLearningRate` +Cyclical learning rate (CLR) is an implementation to that practically eliminates the need to +experimentally find the best values and schedule for the global learning rates. + +Instead of monotonically decreasing the learning rate, this method lets the learning +rate cyclically vary between reasonable boundary values. Training with +cyclical learning rates instead of fixed values achieves improved classification +accuracy without a need to tune and often in fewer iterations. + +More details can be found in [Cyclical Learning Rates for Training Neural Networks](https://arxiv.org/abs/1506.01186) +by Leslie N. Smith + +This CLR implementation can be used with the cntk training loop by adding only ***two lines of code***: + + model = C.layers.Dense(10)(C.input_variable(10)) + sgd_momentum = C.momentum_sgd(model.parameters, 0.1, 0.9) + clr = CyclicalLeaningRate(sgd_momentum, minibatch_size=32) # first line of code + + for epoch in range(10): + for batch in range(100): + trainer.train_minibatch(...) + clr.batch_step() # second line of code (to be called after every training update) + + + + +***2019-03-12.*** +#### Added `cntkx.ops.batchmatmul` +Added Batch Matrix Multiplication. This implementation is similar +to [tensorflow.matmul](https://www.tensorflow.org/api_docs/python/tf/linalg/matmul). + +Example: + + a = C.sequence.input_variable((3, 4, 5)) # batch matrix + b = C.sequence.input_variable((3, 5, 6)) # batch matrix + c = Cx.batchmatmul(a, b) + assert c.shape == (3, 4, 6) # 3 is treated as a batch axis + + + +***2019-03-10.*** +#### Added `PretrainedBertEncoder` and `PretrainedBertModel` +BERT, the state-of-the-art language model is now available as a CNTK pretrained model. + +Currently, it is only tested to work with `BERT-Base, Uncased` (uncased_L-12_H-768_A-12) and can be +downloaded from [Google AI](https://github.com/google-research/bert) + +When you have downloaded `BERT-Base, Uncased`, there should be 5 files inside. You will need to `.zip` +three of those files into a tensorflow checkpoint file before you can load it into `cntkx`. + +Those three files are: `bert_model.ckpt.data-00000-of-00001`, `bert_model.ckpt.index`, `bert_model.ckpt.meta`. +Then rename the extension of `.zip` into `.ckpt` and you are good to go. 
+ +Example below + + text_tensor = C.sequence.input_variable(30522) + token_type_tensor = C.sequence.input_variable(2) + filepath_to_tf_bert_model = "YOUR_FILE_DIRECTORY/bert_model.ckpt" + + model = Cx.layers.PreTrainedBertModel(filepath_to_tf_bert_model, num_heads=12, dropout_rate=0.1) + b = model(text_tensor, token_type_tensor) + + assert b.shape == (768,) + +For more details about BERT, you can find the original paper [here](https://arxiv.org/abs/1810.04805), +and some useful resources [here](https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270) +and [here](http://jalammar.github.io/illustrated-bert/). + +Note: +It goes without saying also that to use these pre-trained models you will need to have tensorflow installed +since we are convert them from tensorflow models. + + +***2019-03-06.*** +#### Added `PositionalEmbedding`, `BertEmbeddings` and `PretrainedBertEmbeddings` +CNTK implementation of `PositionalEmbedding`, `BertEmbeddings` and tf-to-cntk `PreTrainedBertEmbeddings`. +BERT is a state-of-the-art language model from Google AI, more details can be found in +[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). + +Google AI's pre-trained BERT tensorflow model can be downloaded [here](https://github.com/google-research/bert). +Tensorflow would need to be installed in your environment if you intend to use `PreTrainedBertEmbeddings`, which +takes a tensorflow model and convert it cntk. + +Example for `PositionalEmbedding` + + a = C.sequence.input_variable(12) + b = PositionalEmbedding(max_seq_length, hidden_dim)(a) + + assert b.shape == (hidden_dim, ) + +Example for `BertEmbeddings` + + text_tensor = C.sequence.input_variable(100) + token_type_tensor = C.sequence.input_variable(2) + b = BertEmbeddings(max_seq_length, hidden_dim, 0.1)(text_tensor, token_type_tensor) + + assert b.shape == (hidden_dim, ) + +Example for `PreTrainedBertEmbeddings` + + text_tensor = C.sequence.input_variable(30522) + token_type_tensor = C.sequence.input_variable(2) + filepath_to_tf_bert_model = "YOURFILEPATH" + embeddings = PreTrainedBertEmbeddings(filepath_to_tf_bert_model, 0.1, False) + b = embeddings(text_tensor, token_type_tensor) + + assert b.shape == (768, ) + + +***2019-03-02.*** +#### Added `VariationalDropout` and `WeightDroppedLSTM` +CNTK implementation of `Variational Dropout` found in +[A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](https://arxiv.org/abs/1512.05287) +and `Weight Dropped LSTM` proposed in a salesforce research paper +[Regularizing and Optimizing LSTM Language Models](https://arxiv.org/abs/1708.02182). + +`Weight Dropped LSTM` is a regularised LSTM that uses DropConnect on hidden-to-hidden weights as a form of recurrent +regularisation. It also include application of variational dropout on the inputs and outputs of the recurrent units +for further regularisation. + +`Variational Drpoout` is a regularisation that uses same dropout mask at each time step +(i.e. across the dynamic sequence axis) as opposed to the naive application of `C.layers.Dropout` to a sequence +which will result in a different dropout mask for every tensor along the sequence axis. 
+ + + import cntkx as Cx + from cntkx.layers import Recurrence, WeightDroppedLSTM + import cntk as C + + dropconnect_rate = 0.2 + variationaldrop_rate = 0.1 + + seq = C.sequence.input_variable(56) + b = Recurrence(WeightDroppedLSTM(20, dropconnect_rate), + variational_dropout_rate_input=variationaldrop_rate, + variational_dropout_rate_output=variationaldrop_rate)(seq) + + assert b.shape == (100, ) + + seq_dropped = VariationalDropout(0.1)(seq) + + assert seq_dropped.shape == seq.shape + + +***2019-02-02.*** +#### Added Gated Linear Unit / Gated CNN +CNTK implementation of Gated Linear Unit (Gated CNN) founnd in Facebook AI Research Lab's paper: +[Language Modeling with Gated Convolutional Networks](https://arxiv.org/abs/1612.08083). +This paper applies a convolutional approach to language modelling with a novel Gated-CNN model. + + import cntkx as Cx + import cntk as C + + seq = C.sequence.input_variable(56) + hidden = Cx.layers.GatedLinearUnit(window=2, hidden_dim=100)(seq) + + assert hidden.shape == (100, ) + + +***2019-01-21.*** +#### Added `Focal Loss` for multi-class and binary classification +CNTK implementation of `Focal Loss` enables the training of highly accurate dense object detectors in the +presence of vast numbers of easy background examples or dataset with extreme class imbalance (e.g. 1:1000). + +`Focal Loss` focuses training on a sparse set of hard examples and prevents the vast number of easy negatives from +overwhelm-ing the model during training. + +For more details please refer to [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) + + import cntkx as Cx + + Cx.focal_loss_with_softmax([[0, 0, 0.8, 0.2]], [[0, 0, 1, 0]]).eval() + array([[0.31306446]], dtype=float32) + + + +***2019-01-18.*** +#### Added Gaussian Window Attention Model +Gaussian window attention model was first introduced by Alex Graves in +"Generating sequences with recurrent neural networks". + +It uses a mixture of gaussian windows to attend to +portions of the sequence as oppose to the widely used attention model introduced in +"Neural machine translation by jointly learning to align and translate" by Bahdanau, et al. that attends +to the entire sequence all at once. + +Gaussian window attention is also directional in its attention on the context sequence. When modeling +strongly ordered sequences, gaussian window attention will be a natural choice due to this inductive bias. + + import cntk as C + import cntkx as Cx + + seq1 = C.Axis.new_unique_dynamic_axis('seq1') + seq2 = C.Axis.new_unique_dynamic_axis('seq2') + + encoded = C.sequence.input_variable(30, sequence_axis=seq1) + query = C.sequence.input_variable(28, sequence_axis=seq2) + + a = Cx.layers.GaussianWindowAttention(10)(encoded, query) + + assert a.shape == (30, ) + +"Generating sequences with recurrent neural networks" can be found [here](https://arxiv.org/abs/1308.0850). +"Neural machine translation by jointly learning to align and translate" can be found [here](https://arxiv.org/abs/1409.0473). + +***2019-01-16.*** +#### Added Spatial Pyramid Pooling layer +Spatial pyramid pooling layer is a pooling layer than returns a fixed length representation regardless of the +image size/scale. It is frequently used for multi-size image training. It reported SOTA classification results using +a single full-image representation without fine-tuning. For more details on the paper +"Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition" by K. He, X. Zhang, S. Ren, J. 
Sun, +link [here](https://arxiv.org/abs/1406.4729). + + import cntk as C + import cntkx as Cx + + n = np.random.random((3, 3, 32, 32)).astype(np.float32) + a = C.input_variable((3, 32, 32)) + b = Cx.layers.SpatialPyramidPooling((1, 2, 4))(a) + + assert b.shape == (3 * (4 * 4 + 2 * 2 + 1),) # representation not dependent on image size + + +***2019-01-15.*** +#### Added Sinusoidal Positional Embedding and `cntkx.ops.erf` +Added sinusoidal positional embedding used in [Transformer](https://arxiv.org/abs/1706.03762). For an accessible +explanation of transformer, you may look up [here](http://jalammar.github.io/illustrated-transformer/). + + import cntk as C + import cntkx as Cx + + a = C.sequence.input_variable(10) + b = SinusoidalPositionalEmbedding()(a) + + assert b.shape == (10, ) + +Added `cntkx.ops.erf` error function. + +***2019-01-12.*** +#### Added Vision models: VGG16, VGG19 and UNET +VGG is for image classification and UNET is for semantic segmentation. VGG is implemented for completeness +sake and should not be used for any serious classification task. + + +Paper on VGG can be found [here](https://arxiv.org/abs/1409.1556) titled "Very Deep Convolutional Networks +for Large-Scale Image Recognition" + +Paper for UNET can be found [here](https://arxiv.org/abs/1505.04597) titled "U-Net: Convolutional Networks +for Biomedical Image Segmentation" + +VGG example: + + import cntk as C + import cntkx as Cx + + a = C.input_variable((3, 64, 64)) + b = Cx.layers.VGG19(100)(a) + + assert b.shape == (100,) + +UNET example: + + import cntk as C + import cntkx as Cx + + a = C.input_variable((3, 512, 512)) + b = Cx.layers.UNET(num_classes=10, base_num_filters=64, pad=True)(a) + + assert b.shape == (10, 512, 512) + +Convenience functions such as `cntkx.ops.upsample` and `centre_crop` have also been added. +`cntkx.ops.upsample` upsamples an image twice on each spatial dim. `centre_crop` crops a smaller image from +a bigger one in the centre given a reference smaller image. + + +#### Added Transformer attention model and associated components +The Transformer was first introduced in the [paper](https://arxiv.org/abs/1706.03762) 'Attention is all you need'. +The architecture is based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. +More recently, [BERT](https://arxiv.org/abs/1810.04805) which broke almost all SOTA language task is also based on +transformer and self-attention. + + import cntk as C + import cntkx as Cx + + a = C.sequence.input_variable(512) + b = C.sequence.input_variable(512) + + transformer = Cx.layers.Transformer() # using default settings + decoded = transformer(a, b) + + assert decoded.shape == (512, ) + + +***2018-12-08.*** +#### Added QRNN: Quasi-Recurrent Neural Network (QRNN) and `cntkx.ops.cumsum` +The QRNN provides similar accuracy to the LSTM but can be betwen 2 and 17 times faster than the +highly optimized NVIDIA cuDNN LSTM implementation depending on the use case. + +More details please refer to the original paper [here](https://arxiv.org/abs/1611.01576). + + import cntk as C + import cntkx as Cx + + input_seq = C.sequence.input_variable(input_dim) + prediction_seq = Cx.layers.QRNN(hidden_dim=50)(input_seq) + + + +***2018-12-07.*** +#### New sequence ops: `cntkx.ops.sequence.pad` and `cntkx.ops.sequence.length` +Added two new sequence ops. `cntkx.ops.sequence.pad` allows padding on the sequence axis and +`cntkx.ops.sequence.length` calculates the length of the sequence. 
+ +***2018-12-05.*** +#### Mixture Density Network +Mixture density networks are neural networks that can in principle represent arbitrary conditional +probability distributions in the same way that a conventional neural network can represent arbitrary functions. +MDN are very useful when you need to map an input to several correct targets (aka. one-to-many problem). + +Updated with Gaussian Mixture Density Network ops and loss function. Ops will allow you to extract mdn coefficients and sample from the network. + +More details on mdn can be found in this [paper](https://publications.aston.ac.uk/373/1/NCRG_94_004.pdf) by Christopher Bishop. + + import cntk as C + import cntkx as Cx + + input_tensor = C.input_variable(1, name="input_tensor") + target_tensor = C.input_variable(1, name="target_tensor") + + # model + inner = Dense(50, activation=C.relu)(input_tensor) + inner = Dense(50, activation=C.relu)(inner) + prediction_tensor = Dense((ndim + 2) * nmix, activation=None)(inner) + + sampled = Cx.sample_gaussian_mdn(prediction_tensor, nmix, ndim) # sampling node + loss = Cx.gaussian_mdn_loss(prediction_tensor, target_tensor, nmix=nmix, ndim=ndim) # loss function + + + +%package -n python3-cntkx +Summary: Extension library of Microsoft Cognitive Toolkit +Provides: python-cntkx +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-pip +%description -n python3-cntkx +# CNTKx +CNTKx is a deep learning library that builds on and extends Microsoft Cognitive Toolkit [CNTK](https://github.com/Microsoft/CNTK). +Despite the last planned release of cntk 2.7, cntkx will continue to be in active development, more models and pre-built components coming soon! + +Feel free to open an issue for any request or a PR to contribute :) + +## Installation +cntkx is written in pure python and cntk is a dependency to it. Please get a working installation of cntk first. 
Then: + + pip install cntkx + +cntkx only works with `python>=3.6` + + +## Available Components +| ops | Description | +| --- | ---| +| `floor_division` | element-wise floor_division | +| `remainder` | element-wise remainder of division | +| `scalar` | cast tensor to scalar (1,) | +| `cumsum` | Cumulative summation along axis | +| `upsample` | Upsample by k factor (for image) | +| `centre_crop` | Crop centre of image | +| `swish` | Activation | +| `mish` | Activation | +| `hardmax` | Activation | +| `erf` | Error function | +| `gelu` | Gaussian Error Linear Unit function | +| `gelu_fast` | fast approximation of Gaussian Error Linear Unit function | +| `sequence.pad` | Pad at start or end of sequence axis | +| `sequence.pad_to` | Pad a sequence to have the same length as another sequence | +| `sequence.length` | length of sequence | +| `sequence.position` | position of every sequence element | +| `sequence.stride` | strides across sequential axis | +| `sequence.join` | joins two sequence along their sequential axis | +| `sequence.window` | creates sliding window along the sequence axis | +| `sequence.window_causal` | creates causal sliding window along the sequence axis | +| `sequence.reverse` | reverses the items along the dynamic sequence axis | +| `sequence.reduce_mean` | calculates the mean along the dynamic sequence axis | +| `sequence.pad_ctc_labels` | padded ctc labels to be the same sequence length as the network output | +| `sequence.reduce_concat_pool` | drop-in replace for sequence.last | +| `random.sample` | Samples an unnormalised log probability distribution | +| `random.sample_with_bias` | Samples an unnormalised log probability distribution over-weighted to more probable classes | +| `random.sample_top_k` | Samples from the top_k of an unnormalised log probability distribution | +| `batchmatmul` | Batch Matrix Multiplication on a static batch axis, similar to tf.matmul | + +| Layers | Description | +| --- | ---| +| `QRNN` | Quasi-Recurrent Neural Network | +| `Recurrence` | With option to apply `VariationalDroppout` | +| `PyramidalBiRecurrence` | Pyramidal bi-directional recurrence | +| `VariationalDropout` | Single binary dropout mask for entire sequence | +| `SinusoidalPositionalEmbedding` | Non-learnable positional embedding (no max sequence length) | +| `PositionalEmbedding` | Learnable Positional Embedding (used in BERT) | +| `BertEmbeddings` | BERT Embeddings (word + token_type + positional) | +| `BertPooler` | Pooler used in BERT | +| `SpatialPyramidPooling` | Fixed pooled representation regardless of image input size | +| `GatedLinearUnit` | Gated Convolutional Neural Network | +| `ScaledDotProductAttention` | Attention used in BERT and Transformer (aka 'attention is all you need') | +| `MultiHeadAttention` | Attention used in BERT and Transformer (aka 'attention is all you need') | +| `GaussianWindowAttention` | Windowed attention instead of conventional attention where everything is attended at the same time | +| `SequentialDense` | Applies Dense to a window of sequence item along sequence axis | +| `SequentialMaxPooling` | Max pool across sequential axis and static axes | +| `SequentialAveragePooling` | Average pool across sequential axis and static axes | +| `SequentialConcatPooling` | Concat Average and Mean pool across sequential axis and static axes | +| `vFSMN` | Vectorised Feedforward Sequential Memory Networks | +| `cFSMN` | Compact Feedforward Sequential Memory Networks | +| `BiRecurrence` | BiRecurrence recurrent layer with weight tying option to half 
parameter requirement | +| `GlobalConcatPooling` | Global spatial concat pooling of ave and mean | +|`FilterResponseNormalization`| Drop in replacement for batch norm with superior performance | +|`Boom`| More parametrically efficient alternative to Position-Wise FeedForward layer found in Transformer | +|`GaussianAttentionSeqImage`| Memory efficient attention that used 2d gaussian filters for images | +| `SequenceDropout` | Dropout entire sequence elements | +| `SEBlock` | Squeeze and Excitation block | +| `SequenceSEBlock` | Squeeze and Excitation block for variable width image sequence | +| `SIREN` | Sinusoidal Representation Network | +| `LinearAttention` | Linearised form of dot-product attention with linear memory complexity | +| `LinearAttentionModel` | Wrapper to `LinearAttention` | + + +| Blocks | Description | +| --- | ---| +| `WeightDroppedLSTM` | A form of regularised LSTM | +| `IndyLSTM` | A parameter efficient form of LSTM | +| `IndRNN` | a RNN with long memory and can be stacked deeply | + +| Loss | Description | +| --- | ---| +| `gaussian_mdn_loss` | loss function when using Mixture density network | +| `focal_loss_with_softmax` | A kind of cross entropy that handles extreme class imbalance | +| `cross_entropy_with_softmax` | Added `label smoothing regularisation` in cross entropy with softmax | +| `generalised_robust_barron_loss` | generalised robust loss | + +| Models | Description | +| --- | ---| +| `VGG` | Image Classification | +| `UNET` | Semantic Segmentation | +| `Transformer` | Language Modelling | +| `MDN` | Mixture Density Networks | + + +| Pre-trained models | Description | +| --- | ---| +| `Bert` | Bidirectional Encoder Representations from Transformers | +| [fwd_wt103.hdf5](https://1drv.ms/u/s!AjJ4XyC3prp8mItNxiawGK4gD8iMhA?e=wh7PLB) | The weight parameters of the fastai's pytorch model. To be used to initialise `PretrainedWikitext103LanguageModel` | +| [fwd_wt103.cntk](https://1drv.ms/u/s!AjJ4XyC3prp8mItPBdfmDYr9QP7J4w?e=k1BXlW) | The converted cntk model of fastai's pytorch model. To be used with `C.load_model` | +| [fwd_wt103.onnx](https://1drv.ms/u/s!AjJ4XyC3prp8mItO70T_q8HOPwa6aQ?e=h2Fiv5) | The converted ONNX model of fastai's pytorch model. | + + +| Learners | Description | +| --- | ---| +| `CyclicalLearningRate` | a method to eliminate the need to find best value and schedule for learning rate | +| `RAdam` | a variant of `Adam` that doesn't require any warmup | + +| Misc | Description | +| --- | ---| +| `CTCEncoder` | Helper class to convert data into a format acceptable for cntk's ctc implementation | + + +## C# CNTK Tutorials +This library is implemented in pure cntk python API. For help in cntk c#, you can refer to the two repository +[deep-learning-with-csharp-and-cntk](https://github.com/anastasios-stamoulis/deep-learning-with-csharp-and-cntk) +and [DeepBelief_Course4_Examples](https://github.com/AllanYiin/DeepBelief_Course4_Examples). + + +## F# CNTK +For the F# wrapper of CNTK please visit [FsCNTK](https://github.com/fwaris/FsCNTK), +it also contains some example implementations like seq2seq, autoencoder, LSTM, GAN. + + +## News +***2021-04-04*** +### Added new kernel method into `LinearAttention` +Previous `LinearAttention` uses implemntation from [Transformers are RNNs:Fast Autoregressive Transformers with Linear Attention](https://arxiv.org/abs/2006.16236) that uses `elu`. +`elu` as kernel may not perform well in certain task (e.g. ASR) as documented in the paper. 
+ +Hence, new kernel trick uses `softmax` from [Efficient Attention: Attention with Linear Complexities](https://arxiv.org/abs/1812.01243). + + + +***2020-09-06*** +### Added `sequence.pad_ctc_labels` +Pads the ctc label sequence to the same sequence length as the network output. +This should be used when the final sequence length of the network output cannot be determined +beforehand during the pre-processing of the ctc_labels. Thus, the padding is done during training runtime +instead of during the data pipeline processing. + +The padding token would be the last sequence element of `ctc_labels`. `ctc_labels` should be +a one hot encoded vector sequence. The padding token will have the value of 1 in its one-hot encoded vector. + + +***2020-06-20*** +### Added `LinearAttention` and `LinearAttentionModel` +Added cntk implementation of `LinearAttention` and `LinearAttentionModel`. +Standard dot-product attention found in `Transformer` is quadratic in time and gpu memory complexity +with respect to sequence length. The `C.layers.AttentionModel` is also quadratic. + +However, `LinearAttention` is a linearised form of dot-product attention. It is linear in time and gpu memory complexity +with respect to sequence length. This is especially significant since `cntk` doesn't have any checkpointing function +compared to other frameworks like `tensorflow` and `pytorch`, where it saves gpu memory at the expense of some +computation overhead. Now, with `LinearAttention` training of `Transformer` can be done on `cntk` since the + gpu memory requirement is drastically reduced. + +`LinearAttentionModel` is wrapper for `LinearAttention` in the style of `C.layers.AttentionModel`. + +For more details refer to [Transformers are RNNs:Fast Autoregressive Transformers with Linear Attention](https://arxiv.org/abs/2006.16236) by Katharopoulos et al. + +Example: + + a = C.sequence.input_variable(24) + b = LinearAttention(hidden_dim=32, model_dim=24)(a, a, a) + + assert b.shape == (32, ) + + +***2020-06-20*** +### Added `sequence.pad_to` +`sequence.pad_to` pads a shorter sequence to the same sequence length as another longer sequence. +This is especially useful in CTC training where both sequences need to have the same sequence length + +Example: + + ax1 = C.Axis.new_unique_dynamic_axis('ax1') + ax2 = C.Axis.new_unique_dynamic_axis('ax2') + a = C.sequence.input_variable(3, sequence_axis=ax1) + b = C.sequence.input_variable(6, sequence_axis=ax2) + + c = Cx.sequence.pad_to(a, b) # pad to same sequence length + + ctc_token = C.Constant(np.array([0, 0, 1])) + d = C.element_select(C.equal(c, 0), ctc_token, c) # swap padded zeros with ctc token + + +***2020-06-19*** +### Added `SIREN` +`SIREN` leverages periodic activation functions for implicit neural representations and demonstrate +that these networks are ideally suited for representing complex natural signals and their derivatives. + +The results are incredible. It is highly recommended that you check out their [project page](https://vsitzmann.github.io/siren/). + +More details can be found in their paper [Implicit Neural Representations with PeriodicActivation Functions](https://arxiv.org/abs/2006.09661) + + + +***2020-05-01*** +### Added `SEBlock` and `SequenceSEBlock` +`SEBlock` and `SequenceSEBlock` are squeeze-excitation blocks that adaptively recalibrates channel-wise feature +responses by explicitly modelling interdependencies between channels. 
It was show that these blocks can be +stacked together to form SENet architectures that generalise extremely effectively across different datasets. + +Its the winner for ILSVRC2017 classification submission. More details can be found in the +paper [Squeeze-and-Excitation Networks](https://arxiv.org/abs/1709.01507) by Jie hu et al. + + +***2020-04-20*** +### Added `SequenceDropout` +`SequenceDropout` dropouts entire sequence elements along the dynamic sequence axis. + + + +***2020-04-15*** +### Added `GaussianAttentionSeqImage` +`GaussianAttentionSeqImage` is a 2d spatial gaussian attention. +To use, the encoded image should be formulated as a cntk sequence (example below). This can be useful when +you are constraint by gpu memory as 2d gaussian attention is more memory efficient than standard attention. + +This is from the deepmind paper, DRAW: A Recurrent Neural Network for Image Generation by Gregor et al +More details can be found in the following https://arxiv.org/abs/1502.04623 + +Example: + + n = 5 + num_channels = 3 + image_height = 64 + expected_image_width = 1000 + image_seq = C.sequence.input_variable((num_channels, image_height)) # rgb image with variable width and fixed height + decoder_hidden_state = ... # from decoder somewhere in the network + attended_image = Cx.layers.GaussianAttentionSeqImage(n, image_height, expected_image_width)(image_seq, decoder_hidden_state) + + assert attended_image.shape == (num_channels, n, n) + + +***2020-03-29.*** +#### Added `generalised_robust_barron_loss` and `sample_with_bias` +`generalised_robust_barron_loss` is a generalisation for generalization of the Cauchy/Lorentzian, +Geman-McClure, Welsch/Leclerc, generalized Charbonnier, Charbonnier/pseudo-Huber/L1-L2, and +L2 loss functions. + +Can be used as a drop-in replacement in any regression task that you have. + +For more details, please refer to [A General and Adaptive Robust Loss Function](https://arxiv.org/abs/1701.03077), Jonathan T. Barron, +or this [video](https://www.youtube.com/watch?v=BmNKbnF69eY). +It is the Best Paper Award Finalist in CVPR 2019. + + +Implemented `sample_with_bias` to sample more likely classes as a replacement +for `sample_top_k` which cannot be used inside a `UnfoldFrom`. `sample_with_bias` reduces sampling variance especially when you have a long tail. + + + +***2020-02-11.*** +#### Added `Boom` +Boom layer from SHA-RNN by S. Merity creator of QRNN. Alternative to PositionwiseFeedForward. Serves the same function as +PositionwiseFeedForward found in transformer. + +Boom layer minimizes computation and removes an entire matrix of parameters compared to traditional down-projection layers. + +For more detail please read: [Single Headed Attention RNN: Stop Thinking With Your Head](https://arxiv.org/abs/1911.11423) by Stephen Merity. + + +***2019-12-03.*** +#### Added `FilterResponseNormalization` and `ThresholdedLinearUnit` +Added cntk implementation of `FilterResponseNormalization`. Filter Response Normalization (FRN) layer +is a novel combination of a normalization and an activation function, +that can be used as a drop-in replacement for other normalizations and activations. + +The method operates on each activation map of each batch sample +independently, eliminating the dependency on other batch samples or channels of the same sample. +The method outperforms BN and all alternatives in a variety of settings for all batch sizes. 

***2019-11-04.***
#### Added `ops.floor_division`, `ops.remainder`, `sequence.window_causal` and `SequentialDense`
Added the new operations stated above. `sequence.window` now has an additional argument that lets you control striding.
`sequence.window_causal` creates a causal window that doesn't leak future information into the past (preserving causality).
`SequentialDense` is a convenience layer added to apply a dense layer to a window of sequence items,
much like `SequentialConvolution` but with better memory performance.



***2019-11-04.***
#### Added `SequentialConcatPooling`, `Cx.Sequence.reduce_concat_pool` and `GlobalConcatPooling`
`Cx.Sequence.reduce_concat_pool` concatenates the last item in the sequence axis with a summarisation
of the sequence represented by the `reduce_max` and `reduce_mean` of the sequence axis. Anytime `C.sequence.last` is used,
this can be a drop-in replacement.

Example:

    n = 32
    a = C.sequence.input_variable(n)
    b = Cx.sequence.reduce_concat_pool(a)

    assert b.shape == (n * 3, )


`SequentialConcatPooling` does spatial concat pooling over the sequential axis.
Concat pooling is the concatenation of both average pooling and max pooling. In any situation where max or average
pooling is appropriate, concat pooling can be used as a drop-in replacement and achieve improvements in performance.

Example:

    a = C.sequence.input_variable((3, 10))
    b = SequentialConcatPooling(filter_shape=(2, 2), strides=2)(a)

    assert b.shape == (6, 10)

    n = [np.random.random((3, 10)).astype(np.float32),
         np.random.random((3, 10)).astype(np.float32),
         np.random.random((3, 10)).astype(np.float32),
         np.random.random((3, 10)).astype(np.float32), ]

    print(b.eval({a: n}))


`GlobalConcatPooling` is the standard spatial concat pooling of both max pool and average pool.


***2019-10-15.***
#### Added `mish` activation function and `RAdam` learner.
`mish` is an activation function introduced in [Mish: A Self Regularized Non-Monotonic Neural Activation Function](https://arxiv.org/abs/1908.08681v2)
by Diganta Misra. Experiments show that `mish` tends to work better than both ReLU and Swish along with other standard
activation functions in many deep networks across challenging datasets. For instance,
in Squeeze Excite Net-18 for CIFAR 100 classification, the network with Mish had an increase in
Top-1 test accuracy by 0.494% and 1.671% as compared to the same network with Swish and ReLU respectively.
The similarity to Swish, along with the boost in performance and its simplicity of implementation,
makes it easier for researchers and developers to use Mish in their neural network models.

This activation function is adopted in `fast ai`.

Rectified Adam or `RAdam` is an `Adam` optimiser variant that doesn't require a warmup schedule, which `Adam` tends
to need to maintain stability. In this cntk implementation, we added a `RAdam`-like optimiser based
on the work of [On the adequacy of untuned warmup for adaptive optimization](https://arxiv.org/abs/1910.04209) by Jerry Ma and Denis Yarats.

`RAdam` is adopted in `fast ai` too.
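
A minimal sketch of both additions; `Cx.mish` is applied element-wise like the other ops above, while the exact `Cx.learners.RAdam` signature below is an assumption written to mirror `C.adam`:

    import cntk as C
    import cntkx as Cx

    a = C.input_variable(10)
    b = Cx.mish(a)  # element-wise activation, shape is preserved
    assert b.shape == (10,)

    model = C.layers.Dense(10)(C.input_variable(10))

    # hypothetical signature, assumed to mirror C.adam
    radam = Cx.learners.RAdam(model.parameters, lr=0.001, momentum=0.9)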

***2019-09-30.***
#### Improved `BiRecurrence` weight tying
Made an improvement to the weight tying of `BiRecurrence` by having one parameter tensor token for every state
in the step function per direction (forward and backward). This allows the forward and backward tokens
to have more representational flexibility. Previously, all states used the same forward or backward token.


***2019-09-06.***
#### Added `BiRecurrence` with weight tying
Added a wrapper to create a bidirectional recurrent layer using `BiRecurrence`. Included in the implementation
is an option to halve the number of parameters required by a bidirectional recurrent layer. This is done
by only using one recurrent unit to do both the forward and backward computation instead of the usual two.
A forward and a backward token are used to initialise the hidden state so that the recurrent unit can tell
the directionality.

More details can be found in the paper [Efficient Bidirectional Neural Machine Translation](https://arxiv.org/abs/1908.09329)

Example:

    a = C.sequence.input_variable(10)
    b = BiRecurrence(LSTM(100), weight_tie=True)(a)

    assert b.shape == (200, )


***2019-08-29.***
#### Added `PretrainedWikitext103LanguageModel`
A CNTK implementation of Fast AI's Universal Language Model Fine-tuning (ULMFiT) English model has been added.
This language model was trained on Wikitext-103 and can be used as a base model for any downstream language task.

It is also much more efficient to run compared to BERT and other Transformer language models.

For more details, you can refer to the original paper [here](https://arxiv.org/abs/1801.06146)

Example:

    vocab_size = 238462
    converted_hdf5_model_file_path = 'PATH/fwd_wt103.hdf5'  # this is not the original pytorch model
    lm = PretrainedWikitext103LanguageModel(converted_hdf5_model_file_path)

    a = C.sequence.input_variable(vocab_size)
    prediction = lm(a)  # next-word-prediction
    features = prediction.features  # features of tokens

    assert prediction.shape == (vocab_size, )
    assert features.shape == (400, )

| Model | Description |
| --- | ---|
| [fwd_wt103.hdf5](https://1drv.ms/u/s!AjJ4XyC3prp8mItNxiawGK4gD8iMhA?e=wh7PLB) | The weight parameters of fastai's pytorch model. To be used to initialise `PretrainedWikitext103LanguageModel` |
| [fwd_wt103.cntk](https://1drv.ms/u/s!AjJ4XyC3prp8mItPBdfmDYr9QP7J4w?e=k1BXlW) | The converted cntk model of fastai's pytorch model. To be used with `C.load_model` |
| [fwd_wt103.onnx](https://1drv.ms/u/s!AjJ4XyC3prp8mItO70T_q8HOPwa6aQ?e=h2Fiv5) | The converted ONNX model of fastai's pytorch model. |
| [itos_wt103.pkl](http://files.fast.ai/models/wt103/) | Tokens used in the pretrained model |


***2019-08-08.***
#### Added `cntkx.ops.gelu` and `cntkx.ops.gelu_fast`
Added two cntk implementations of the `gelu` activation function. The `gelu` activation is used in `BERT`
and OpenAI's `GPT` and `GPT-2`.

The Gaussian Error Linear Unit (GELU) is a high-performing neural network activation function.
The GELU nonlinearity is the expected transformation of a stochastic regularizer which randomly
applies the identity or zero map to a neuron’s input. The GELU nonlinearity weights inputs by their
magnitude, rather than gating inputs by their sign as in ReLUs.

For more detail please refer to [Gaussian Error Linear Units (GELU)](https://arxiv.org/abs/1606.08415)
by Hendrycks and Gimpel.
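
A minimal usage sketch, assuming both ops are exposed at the top level of `cntkx` like the other element-wise ops:

    import cntk as C
    import cntkx as Cx

    a = C.input_variable(10)

    b = Cx.gelu(a)       # exact GELU
    c = Cx.gelu_fast(a)  # fast approximation

    assert b.shape == (10,)
    assert c.shape == (10,)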

***2019-07-04.***
#### Added `cntkx.ops.sequence.reduce_mean`
Calculates the mean along the dynamic sequential axis in CNTK.


***2019-06-26.***
#### Added `cntkx.ops.sequence.reverse`
Allows the sequence items along the sequence axis to be reversed. This is useful when you want to create a
bi-directional auto-regressive layer, because using `UnfoldFrom` with `Recurrence(go_backwards=True)` does not work and
will result in a ValueError.


Example:

    import cntk as C
    import cntkx as Cx
    import numpy as np
    from cntk.layers import Recurrence, UnfoldFrom, LSTM

    hidden_dim = 50
    start_token = C.Constant(0, shape=(hidden_dim,))
    a = C.sequence.input_variable(1, name='seq1')

    b = UnfoldFrom(Recurrence(LSTM(hidden_dim), go_backwards=True))(start_token, a)

    n = [np.random.random((10, 1)).astype(np.float32),]

    # This raises 'ValueError: It is not allowed to have multiple different stepping directions in the same loop'
    b.eval({b.arguments[0]: n})


The workaround would be:

    import cntk as C
    import cntkx as Cx
    import numpy as np
    from cntk.layers import Recurrence, UnfoldFrom, LSTM

    hidden_dim = 50
    start_token = C.Constant(0, shape=(hidden_dim,))
    a = C.sequence.input_variable(1, name='seq1')
    a_reversed = Cx.sequence.reverse(a)

    b = UnfoldFrom(Recurrence(LSTM(hidden_dim)))(start_token, a_reversed)  # remove go_backwards=True

    n = [np.random.random((10, 1)).astype(np.float32),]
    b.eval({b.arguments[0]: n})  # this will run just fine



***2019-06-21.***
#### Added `cntkx.ops.random.sample_top_k`
A CNTK implementation of an op that allows sampling from the top_k of an unnormalised log-probability distribution.
This is useful in text (or sequence) generation where it is known that greedy decoding will cause
text degeneration.

For more details on this, please refer to [A curious case of neural text degeneration](https://arxiv.org/abs/1904.09751)


Example:

    import cntk as C
    import cntkx as Cx
    import numpy as np

    a = C.input_variable(5)
    b = Cx.random.sample_top_k(a, k=3, num_classes=5)

    n = np.array([[1, 2, 3, 4, 5],] * 1000)

    results = b.eval({a: n})
    assert np.sum(results[:, :2]) == 0
    assert np.sum(results[:, 2:]) == 1000

***2019-05-04.***
#### Added `cntkx.layers.vFSMN` and `cntkx.layers.cFSMN`
A CNTK implementation of the bidirectional vectorised Feedforward Sequential Memory Network (vFSMN)
and the Compact Feedforward Sequential Memory Network (cFSMN).

The FSMN is a standard fully-connected feedforward neural network equipped
with some learnable memory blocks in its hidden layers. The memory blocks
use a tapped-delay line structure to encode the long context information into
a fixed-size representation as a short-term memory mechanism.

The authors claim that FSMNs can be learned much more reliably and faster than
RNNs or LSTMs due to the inherent non-recurrent model structure, while significantly
outperforming RNNs in language and speech modeling.

For more details please refer to [Feedforward Sequential Memory Networks: A
New Structure to Learn Long-term Dependency](https://arxiv.org/abs/1512.08301)

cFSMN is a compact version of FSMN that can result in a reduction of up
to 60% in model size and speed up learning by more than 7 times, while
still significantly outperforming the popular bi-directional LSTMs for both
frame-level cross-entropy (CE) criterion based training and MMI based sequence training.

For more details please refer to "Compact Feedforward Sequential Memory Networks for
Large Vocabulary Continuous Speech Recognition" by Zhang, et al.

Example:

    import cntk as C
    from cntkx.layers import vFSMN, cFSMN

    # unidirectional vFSMN (only past context used)
    a = C.sequence.input_variable(10)
    b = vFSMN(100, C.relu, num_past_context=3, num_future_context=0)(a)

    assert b.shape == (100,)

    # bidirectional vFSMN (enable both past and future context)
    a = C.sequence.input_variable(10)
    b = vFSMN(120, C.relu, num_past_context=3, num_future_context=3)(a)

    assert b.shape == (120,)

    # bidirectional cFSMN (enable both past and future context)
    a = C.sequence.input_variable(100)
    b = cFSMN(120, 50, C.relu, num_past_context=3, num_future_context=3)(a)

    assert b.shape == (120,)


***2019-04-19.***
#### Added `cntkx.misc.CTCEncoder`
CNTK's CTC implementation requires that data be formatted in a particular way that's typical in acoustic
modeling but unusual in other applications. This class provides an easy way to convert data between
what users typically expect and what cntk demands.

Example:

    labels = ['a', 'b', 'c']
    encoder = CTCEncoder(labels)

    labels_tensor = C.sequence.input_variable(len(encoder.classes_))  # number of classes = 4
    input_tensor = C.sequence.input_variable(100)

    labels_graph = C.labels_to_graph(labels_tensor)
    network_out = model(input_tensor)

    fb = C.forward_backward(labels_graph, network_out, blankTokenId=encoder.blankTokenId)

    ground_truth = ['a', 'b', 'b', 'b', 'c']
    seq_length = 10  # must be the same length as the sequence length in network_out

    fb.eval({input_tensor: [...],
             labels_tensor: [encoder.transform(ground_truth, seq_length=seq_length)]})



***2019-04-14.***
#### Added `Label Smoothing Regularization`, `sequence.window` and `PyramidalBiRecurrence`
Added `Label Smoothing Regularization` to `cross_entropy_with_softmax`.
Added `sequence.window`, which creates non-overlapping windows along the sequence axis, thereby reducing the
sequence length and increasing the dimension by the same factor.

Implemented a convenience layer used in acoustic modelling known as `PyramidalBiRecurrence`. Used to create
the pyramidal bi-directional LSTM (BLSTM) found in "Listen, attend and spell" by Chan et al. (https://arxiv.org/abs/1508.01211)
Typically used to down-sample the sequence length to keep memory and runtime manageable.


***2019-04-08.***
#### Added `cntkx.ops.sequence.join`
Added a new `op` called `join` where two sequence tensors can be joined along their sequence axis, forming a longer sequence.
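
A hypothetical sketch in the style of the `pad_to` example above; the exact call signature of `join` is an assumption:

    import cntk as C
    import cntkx as Cx

    ax1 = C.Axis.new_unique_dynamic_axis('ax1')
    ax2 = C.Axis.new_unique_dynamic_axis('ax2')
    a = C.sequence.input_variable(3, sequence_axis=ax1)
    b = C.sequence.input_variable(3, sequence_axis=ax2)

    c = Cx.sequence.join(a, b)  # assumed: c is the items of a followed by the items of b

    assert c.shape == (3,)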

***2019-04-08.***
#### Added `cntkx.layers.SequentialAveragePooling`
Added an average pooling layer that works with the sequential axis. The current cntk `AveragePooling` doesn't pool across sequence elements.

Example on `cntkx.layers.SequentialAveragePooling`

    # rgb image of height 25 and variable width
    a = C.sequence.input_variable((3, 25))

    # Convolve across the image with a (3, 3) kernel with stride (1, 1)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, stride=(1, 1), pad=True)(a)

    assert b.shape == (16, 25)

    # average pool (2,2) in height and width with stride (2,2) in height and width, no padding
    c = SequentialAveragePooling(filter_shape=(2, 2), strides=(2, 2), pad=False)(b)

    assert c.shape == (16, 12)


***2019-04-07.***
#### Added `cntkx.sequence.stride` and `cntkx.ops.scalar`
`Cx.sequence.stride` enables striding across the sequential axis, selecting items at a fixed integer interval along the sequence.
`Cx.scalar` converts a tensor into a scalar of shape `(1,)`



***2019-04-06.***
#### Added `IndyLSTM` and `IndRNN`
CNTK implementations of [Independently Recurrent Long Short-term Memory cells: IndyLSTMs](https://arxiv.org/abs/1903.08023)
by Gonnet and Deselaers, and [Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN](https://arxiv.org/abs/1803.04831)
by Li, et al.

Both `IndyLSTM` and `IndRNN` have hidden-to-hidden weights that are diagonal matrices instead of the usual full matrices.
Thus neurons in each layer are independent from each other, and the cross-channel information is
obtained through stacking multiple layers.

`IndRNN` allows for the use of the `C.relu` activation, thus allowing multiple `IndRNN` layers to be stacked together deeply.

`IndyLSTM` has a parameter count linear in the number of nodes in the layer, as opposed to the standard LSTM which is quadratic,
making `IndyLSTM` potentially faster and smaller as a model.

Authors of both `IndRNN` and `IndyLSTM` have claimed performance as good as or even better than regular LSTMs.

Example:

    import cntk as C
    from cntkx.layers import IndyLSTM, IndRNN, Recurrence

    a = C.sequence.input_variable(10)
    b = Recurrence(IndRNN(20))(a)
    c = Recurrence(IndyLSTM(20))(a)

    assert b.shape == c.shape == (20,)



***2019-03-24.***
#### Added `cntkx.layers.SequentialMaxPooling`
Added a max pooling layer that works with the sequential axis. The current cntk `MaxPooling` doesn't pool across sequence elements.

Example on `cntkx.layers.SequentialMaxPooling`

    # rgb image of height 25 and variable width
    a = C.sequence.input_variable((3, 25))

    # Convolve across the image with a (3, 3) kernel with stride (1, 1)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, stride=(1, 1), pad=True)(a)

    assert b.shape == (16, 25)

    # max pool (2,2) in height and width with stride (2,2) in height and width, no padding
    c = SequentialMaxPooling(filter_shape=(2, 2), strides=(2, 2), pad=False)(b)

    assert c.shape == (16, 12)


***2019-03-18.***
#### Added `cntkx.learners.CyclicalLearningRate`
Cyclical learning rate (CLR) practically eliminates the need to
experimentally find the best values and schedule for the global learning rates.

Instead of monotonically decreasing the learning rate, this method lets the learning
rate cyclically vary between reasonable boundary values. Training with
cyclical learning rates instead of fixed values achieves improved classification
accuracy without a need to tune and often in fewer iterations.

More details can be found in [Cyclical Learning Rates for Training Neural Networks](https://arxiv.org/abs/1506.01186)
by Leslie N. Smith

This CLR implementation can be used with the cntk training loop by adding only ***two lines of code***:

    model = C.layers.Dense(10)(C.input_variable(10))
    sgd_momentum = C.momentum_sgd(model.parameters, 0.1, 0.9)
    clr = CyclicalLearningRate(sgd_momentum, minibatch_size=32)  # first line of code

    for epoch in range(10):
        for batch in range(100):
            trainer.train_minibatch(...)
            clr.batch_step()  # second line of code (to be called after every training update)




***2019-03-12.***
#### Added `cntkx.ops.batchmatmul`
Added Batch Matrix Multiplication. This implementation is similar
to [tensorflow.matmul](https://www.tensorflow.org/api_docs/python/tf/linalg/matmul).

Example:

    a = C.sequence.input_variable((3, 4, 5))  # batch matrix
    b = C.sequence.input_variable((3, 5, 6))  # batch matrix
    c = Cx.batchmatmul(a, b)
    assert c.shape == (3, 4, 6)  # 3 is treated as a batch axis



***2019-03-10.***
#### Added `PretrainedBertEncoder` and `PretrainedBertModel`
BERT, the state-of-the-art language model, is now available as a CNTK pretrained model.

Currently, it is only tested to work with `BERT-Base, Uncased` (uncased_L-12_H-768_A-12), which can be
downloaded from [Google AI](https://github.com/google-research/bert)

When you have downloaded `BERT-Base, Uncased`, there should be 5 files inside. You will need to `.zip`
three of those files into a tensorflow checkpoint file before you can load it into `cntkx`.

Those three files are: `bert_model.ckpt.data-00000-of-00001`, `bert_model.ckpt.index`, `bert_model.ckpt.meta`.
Then rename the extension of the `.zip` into `.ckpt` and you are good to go.

Example below:

    text_tensor = C.sequence.input_variable(30522)
    token_type_tensor = C.sequence.input_variable(2)
    filepath_to_tf_bert_model = "YOUR_FILE_DIRECTORY/bert_model.ckpt"

    model = Cx.layers.PreTrainedBertModel(filepath_to_tf_bert_model, num_heads=12, dropout_rate=0.1)
    b = model(text_tensor, token_type_tensor)

    assert b.shape == (768,)

For more details about BERT, you can find the original paper [here](https://arxiv.org/abs/1810.04805),
and some useful resources [here](https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270)
and [here](http://jalammar.github.io/illustrated-bert/).

Note:
It goes without saying that to use these pre-trained models you will need to have tensorflow installed,
since they are converted from tensorflow models.


***2019-03-06.***
#### Added `PositionalEmbedding`, `BertEmbeddings` and `PretrainedBertEmbeddings`
CNTK implementations of `PositionalEmbedding`, `BertEmbeddings` and the tf-to-cntk `PreTrainedBertEmbeddings`.
BERT is a state-of-the-art language model from Google AI; more details can be found in
[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805).

Google AI's pre-trained BERT tensorflow model can be downloaded [here](https://github.com/google-research/bert).
Tensorflow would need to be installed in your environment if you intend to use `PreTrainedBertEmbeddings`, which
takes a tensorflow model and converts it to cntk.

Example for `PositionalEmbedding`

    a = C.sequence.input_variable(12)
    b = PositionalEmbedding(max_seq_length, hidden_dim)(a)

    assert b.shape == (hidden_dim, )

Example for `BertEmbeddings`

    text_tensor = C.sequence.input_variable(100)
    token_type_tensor = C.sequence.input_variable(2)
    b = BertEmbeddings(max_seq_length, hidden_dim, 0.1)(text_tensor, token_type_tensor)

    assert b.shape == (hidden_dim, )

Example for `PreTrainedBertEmbeddings`

    text_tensor = C.sequence.input_variable(30522)
    token_type_tensor = C.sequence.input_variable(2)
    filepath_to_tf_bert_model = "YOURFILEPATH"
    embeddings = PreTrainedBertEmbeddings(filepath_to_tf_bert_model, 0.1, False)
    b = embeddings(text_tensor, token_type_tensor)

    assert b.shape == (768, )


***2019-03-02.***
#### Added `VariationalDropout` and `WeightDroppedLSTM`
CNTK implementations of `Variational Dropout`, found in
[A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](https://arxiv.org/abs/1512.05287),
and `Weight Dropped LSTM`, proposed in a salesforce research paper,
[Regularizing and Optimizing LSTM Language Models](https://arxiv.org/abs/1708.02182).

`Weight Dropped LSTM` is a regularised LSTM that uses DropConnect on hidden-to-hidden weights as a form of recurrent
regularisation. It also includes the application of variational dropout on the inputs and outputs of the recurrent units
for further regularisation.

`Variational Dropout` is a regularisation that uses the same dropout mask at each time step
(i.e. across the dynamic sequence axis), as opposed to the naive application of `C.layers.Dropout` to a sequence,
which will result in a different dropout mask for every tensor along the sequence axis.


    import cntkx as Cx
    from cntkx.layers import Recurrence, WeightDroppedLSTM, VariationalDropout
    import cntk as C

    dropconnect_rate = 0.2
    variationaldrop_rate = 0.1

    seq = C.sequence.input_variable(56)
    b = Recurrence(WeightDroppedLSTM(20, dropconnect_rate),
                   variational_dropout_rate_input=variationaldrop_rate,
                   variational_dropout_rate_output=variationaldrop_rate)(seq)

    assert b.shape == (20, )

    seq_dropped = VariationalDropout(0.1)(seq)

    assert seq_dropped.shape == seq.shape


***2019-02-02.***
#### Added Gated Linear Unit / Gated CNN
A CNTK implementation of the Gated Linear Unit (Gated CNN) found in Facebook AI Research Lab's paper:
[Language Modeling with Gated Convolutional Networks](https://arxiv.org/abs/1612.08083).
This paper applies a convolutional approach to language modelling with a novel Gated-CNN model.

    import cntkx as Cx
    import cntk as C

    seq = C.sequence.input_variable(56)
    hidden = Cx.layers.GatedLinearUnit(window=2, hidden_dim=100)(seq)

    assert hidden.shape == (100, )


***2019-01-21.***
#### Added `Focal Loss` for multi-class and binary classification
A CNTK implementation of `Focal Loss` enables the training of highly accurate dense object detectors in the
presence of vast numbers of easy background examples, or on datasets with extreme class imbalance (e.g. 1:1000).

`Focal Loss` focuses training on a sparse set of hard examples and prevents the vast number of easy negatives from
overwhelming the model during training.

For more details please refer to [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002)

    import cntkx as Cx

    Cx.focal_loss_with_softmax([[0, 0, 0.8, 0.2]], [[0, 0, 1, 0]]).eval()
    array([[0.31306446]], dtype=float32)



***2019-01-18.***
#### Added Gaussian Window Attention Model
The gaussian window attention model was first introduced by Alex Graves in
"Generating sequences with recurrent neural networks".

It uses a mixture of gaussian windows to attend to
portions of the sequence, as opposed to the widely used attention model introduced in
"Neural machine translation by jointly learning to align and translate" by Bahdanau, et al., which attends
to the entire sequence all at once.

Gaussian window attention is also directional in its attention on the context sequence. When modeling
strongly ordered sequences, gaussian window attention will be a natural choice due to this inductive bias.

    import cntk as C
    import cntkx as Cx

    seq1 = C.Axis.new_unique_dynamic_axis('seq1')
    seq2 = C.Axis.new_unique_dynamic_axis('seq2')

    encoded = C.sequence.input_variable(30, sequence_axis=seq1)
    query = C.sequence.input_variable(28, sequence_axis=seq2)

    a = Cx.layers.GaussianWindowAttention(10)(encoded, query)

    assert a.shape == (30, )

"Generating sequences with recurrent neural networks" can be found [here](https://arxiv.org/abs/1308.0850).
"Neural machine translation by jointly learning to align and translate" can be found [here](https://arxiv.org/abs/1409.0473).

***2019-01-16.***
#### Added Spatial Pyramid Pooling layer
The spatial pyramid pooling layer is a pooling layer that returns a fixed-length representation regardless of the
image size/scale. It is frequently used for multi-size image training. It reported SOTA classification results using
a single full-image representation without fine-tuning. For more details, see the paper
"Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition" by K. He, X. Zhang, S. Ren, J. Sun,
link [here](https://arxiv.org/abs/1406.4729).

    import cntk as C
    import cntkx as Cx

    a = C.input_variable((3, 32, 32))
    b = Cx.layers.SpatialPyramidPooling((1, 2, 4))(a)

    assert b.shape == (3 * (4 * 4 + 2 * 2 + 1),)  # representation not dependent on image size


***2019-01-15.***
#### Added Sinusoidal Positional Embedding and `cntkx.ops.erf`
Added the sinusoidal positional embedding used in the [Transformer](https://arxiv.org/abs/1706.03762). For an accessible
explanation of the transformer, you may look [here](http://jalammar.github.io/illustrated-transformer/).

    import cntk as C
    import cntkx as Cx

    a = C.sequence.input_variable(10)
    b = SinusoidalPositionalEmbedding()(a)

    assert b.shape == (10, )

Added the `cntkx.ops.erf` error function.

***2019-01-12.***
#### Added Vision models: VGG16, VGG19 and UNET
VGG is for image classification and UNET is for semantic segmentation. VGG is implemented for completeness'
sake and should not be used for any serious classification task.

The paper on VGG can be found [here](https://arxiv.org/abs/1409.1556), titled "Very Deep Convolutional Networks
for Large-Scale Image Recognition"

The paper for UNET can be found [here](https://arxiv.org/abs/1505.04597), titled "U-Net: Convolutional Networks
for Biomedical Image Segmentation"

VGG example:

    import cntk as C
    import cntkx as Cx

    a = C.input_variable((3, 64, 64))
    b = Cx.layers.VGG19(100)(a)

    assert b.shape == (100,)

UNET example:

    import cntk as C
    import cntkx as Cx

    a = C.input_variable((3, 512, 512))
    b = Cx.layers.UNET(num_classes=10, base_num_filters=64, pad=True)(a)

    assert b.shape == (10, 512, 512)

Convenience functions such as `cntkx.ops.upsample` and `centre_crop` have also been added.
`cntkx.ops.upsample` upsamples an image twice on each spatial dim. `centre_crop` crops a smaller image from
a bigger one in the centre given a reference smaller image.


#### Added Transformer attention model and associated components
The Transformer was first introduced in the [paper](https://arxiv.org/abs/1706.03762) 'Attention is all you need'.
The architecture is based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.
More recently, [BERT](https://arxiv.org/abs/1810.04805), which broke almost all SOTA language task benchmarks, is also based on
the transformer and self-attention.

    import cntk as C
    import cntkx as Cx

    a = C.sequence.input_variable(512)
    b = C.sequence.input_variable(512)

    transformer = Cx.layers.Transformer()  # using default settings
    decoded = transformer(a, b)

    assert decoded.shape == (512, )


***2018-12-08.***
#### Added QRNN: Quasi-Recurrent Neural Network (QRNN) and `cntkx.ops.cumsum`
The QRNN provides similar accuracy to the LSTM but can be between 2 and 17 times faster than the
highly optimized NVIDIA cuDNN LSTM implementation depending on the use case.

For more details please refer to the original paper [here](https://arxiv.org/abs/1611.01576).

    import cntk as C
    import cntkx as Cx

    input_seq = C.sequence.input_variable(input_dim)
    prediction_seq = Cx.layers.QRNN(hidden_dim=50)(input_seq)



***2018-12-07.***
#### New sequence ops: `cntkx.ops.sequence.pad` and `cntkx.ops.sequence.length`
Added two new sequence ops. `cntkx.ops.sequence.pad` allows padding on the sequence axis and
`cntkx.ops.sequence.length` calculates the length of the sequence.
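
A minimal sketch for `cntkx.ops.sequence.length`; returning the length as a `(1,)`-shaped value is an assumption, in line with `Cx.scalar` above:

    import cntk as C
    import cntkx as Cx
    import numpy as np

    a = C.sequence.input_variable(3)
    n = Cx.sequence.length(a)  # assumed: per-sequence length as a (1,)-shaped value

    s = [np.random.random((7, 3)).astype(np.float32)]
    print(n.eval({a: s}))  # expected to report a length of 7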

***2018-12-05.***
#### Mixture Density Network
Mixture density networks are neural networks that can in principle represent arbitrary conditional
probability distributions in the same way that a conventional neural network can represent arbitrary functions.
MDNs are very useful when you need to map an input to several correct targets (aka the one-to-many problem).

Updated with Gaussian Mixture Density Network ops and loss function. The ops will allow you to extract mdn coefficients and sample from the network.

More details on mdn can be found in this [paper](https://publications.aston.ac.uk/373/1/NCRG_94_004.pdf) by Christopher Bishop.

    import cntk as C
    import cntkx as Cx
    from cntk.layers import Dense

    ndim = 1  # dimensionality of the target
    nmix = 3  # number of gaussian mixture components

    input_tensor = C.input_variable(1, name="input_tensor")
    target_tensor = C.input_variable(1, name="target_tensor")

    # model
    inner = Dense(50, activation=C.relu)(input_tensor)
    inner = Dense(50, activation=C.relu)(inner)
    prediction_tensor = Dense((ndim + 2) * nmix, activation=None)(inner)

    sampled = Cx.sample_gaussian_mdn(prediction_tensor, nmix, ndim)  # sampling node
    loss = Cx.gaussian_mdn_loss(prediction_tensor, target_tensor, nmix=nmix, ndim=ndim)  # loss function



%package help
Summary:	Development documents and examples for cntkx
Provides:	python3-cntkx-doc
%description help
# CNTKx
CNTKx is a deep learning library that builds on and extends Microsoft Cognitive Toolkit [CNTK](https://github.com/Microsoft/CNTK).
Although cntk 2.7 is the last planned release of cntk, cntkx will continue to be in active development, with more models and pre-built components coming soon!

Feel free to open an issue for any request or a PR to contribute :)

## Installation
cntkx is written in pure python, with cntk as a dependency. Please get a working installation of cntk first. Then:

    pip install cntkx

cntkx only works with `python>=3.6`


## Available Components
| ops | Description |
| --- | ---|
| `floor_division` | element-wise floor division |
| `remainder` | element-wise remainder of division |
| `scalar` | cast tensor to scalar (1,) |
| `cumsum` | Cumulative summation along axis |
| `upsample` | Upsample by k factor (for image) |
| `centre_crop` | Crop centre of image |
| `swish` | Activation |
| `mish` | Activation |
| `hardmax` | Activation |
| `erf` | Error function |
| `gelu` | Gaussian Error Linear Unit function |
| `gelu_fast` | fast approximation of Gaussian Error Linear Unit function |
| `sequence.pad` | Pad at start or end of sequence axis |
| `sequence.pad_to` | Pad a sequence to have the same length as another sequence |
| `sequence.length` | length of sequence |
| `sequence.position` | position of every sequence element |
| `sequence.stride` | strides across sequential axis |
| `sequence.join` | joins two sequences along their sequential axis |
| `sequence.window` | creates sliding window along the sequence axis |
| `sequence.window_causal` | creates causal sliding window along the sequence axis |
| `sequence.reverse` | reverses the items along the dynamic sequence axis |
| `sequence.reduce_mean` | calculates the mean along the dynamic sequence axis |
| `sequence.pad_ctc_labels` | pads ctc labels to be the same sequence length as the network output |
| `sequence.reduce_concat_pool` | drop-in replacement for sequence.last |
| `random.sample` | Samples an unnormalised log probability distribution |
| `random.sample_with_bias` | Samples an unnormalised log probability distribution over-weighted to more probable classes |
| `random.sample_top_k` | Samples from the top_k of an unnormalised log probability distribution |
| `batchmatmul` | Batch Matrix Multiplication on a static batch axis, similar to tf.matmul |

| Layers | Description |
| --- | ---|
| `QRNN` | Quasi-Recurrent Neural Network |
| `Recurrence` | With option to apply `VariationalDropout` |
| `PyramidalBiRecurrence` | Pyramidal bi-directional recurrence |
| `VariationalDropout` | Single binary dropout mask for entire sequence |
| `SinusoidalPositionalEmbedding` | Non-learnable positional embedding (no max sequence length) |
| `PositionalEmbedding` | Learnable Positional Embedding (used in BERT) |
| `BertEmbeddings` | BERT Embeddings (word + token_type + positional) |
| `BertPooler` | Pooler used in BERT |
| `SpatialPyramidPooling` | Fixed pooled representation regardless of image input size |
| `GatedLinearUnit` | Gated Convolutional Neural Network |
| `ScaledDotProductAttention` | Attention used in BERT and Transformer (aka 'attention is all you need') |
| `MultiHeadAttention` | Attention used in BERT and Transformer (aka 'attention is all you need') |
| `GaussianWindowAttention` | Windowed attention instead of conventional attention where everything is attended to at the same time |
| `SequentialDense` | Applies Dense to a window of sequence items along the sequence axis |
| `SequentialMaxPooling` | Max pool across sequential axis and static axes |
| `SequentialAveragePooling` | Average pool across sequential axis and static axes |
| `SequentialConcatPooling` | Concat average and max pool across sequential axis and static axes |
| `vFSMN` | Vectorised Feedforward Sequential Memory Networks |
| `cFSMN` | Compact Feedforward Sequential Memory Networks |
| `BiRecurrence` | Bidirectional recurrent layer with a weight-tying option to halve the parameter requirement |
| `GlobalConcatPooling` | Global spatial concat pooling of max and average |
| `FilterResponseNormalization` | Drop-in replacement for batch norm with superior performance |
| `Boom` | More parametrically efficient alternative to the Position-wise FeedForward layer found in Transformer |
| `GaussianAttentionSeqImage` | Memory efficient attention that uses 2d gaussian filters for images |
| `SequenceDropout` | Drops out entire sequence elements |
| `SEBlock` | Squeeze and Excitation block |
| `SequenceSEBlock` | Squeeze and Excitation block for variable width image sequence |
| `SIREN` | Sinusoidal Representation Network |
| `LinearAttention` | Linearised form of dot-product attention with linear memory complexity |
| `LinearAttentionModel` | Wrapper around `LinearAttention` |


| Blocks | Description |
| --- | ---|
| `WeightDroppedLSTM` | A form of regularised LSTM |
| `IndyLSTM` | A parameter efficient form of LSTM |
| `IndRNN` | An RNN with long memory that can be stacked deeply |

| Loss | Description |
| --- | ---|
| `gaussian_mdn_loss` | loss function for use with Mixture Density Networks |
| `focal_loss_with_softmax` | A kind of cross entropy that handles extreme class imbalance |
| `cross_entropy_with_softmax` | Added `label smoothing regularisation` to cross entropy with softmax |
| `generalised_robust_barron_loss` | generalised robust loss |

| Models | Description |
| --- | ---|
| `VGG` | Image Classification |
| `UNET` | Semantic Segmentation |
| `Transformer` | Language Modelling |
| `MDN` | Mixture Density Networks |


| Pre-trained models | Description |
| --- | ---|
| `Bert` | Bidirectional Encoder Representations from Transformers |
| [fwd_wt103.hdf5](https://1drv.ms/u/s!AjJ4XyC3prp8mItNxiawGK4gD8iMhA?e=wh7PLB) | The weight parameters of fastai's pytorch model. To be used to initialise `PretrainedWikitext103LanguageModel` |
| [fwd_wt103.cntk](https://1drv.ms/u/s!AjJ4XyC3prp8mItPBdfmDYr9QP7J4w?e=k1BXlW) | The converted cntk model of fastai's pytorch model. To be used with `C.load_model` |
| [fwd_wt103.onnx](https://1drv.ms/u/s!AjJ4XyC3prp8mItO70T_q8HOPwa6aQ?e=h2Fiv5) | The converted ONNX model of fastai's pytorch model. |


| Learners | Description |
| --- | ---|
| `CyclicalLearningRate` | a method to eliminate the need to find the best value and schedule for the learning rate |
| `RAdam` | a variant of `Adam` that doesn't require any warmup |

| Misc | Description |
| --- | ---|
| `CTCEncoder` | Helper class to convert data into a format acceptable for cntk's ctc implementation |


## C# CNTK Tutorials
This library is implemented in the pure cntk python API. For help with cntk in c#, you can refer to the two repositories
[deep-learning-with-csharp-and-cntk](https://github.com/anastasios-stamoulis/deep-learning-with-csharp-and-cntk)
and [DeepBelief_Course4_Examples](https://github.com/AllanYiin/DeepBelief_Course4_Examples).


## F# CNTK
For the F# wrapper of CNTK please visit [FsCNTK](https://github.com/fwaris/FsCNTK);
it also contains some example implementations like seq2seq, autoencoder, LSTM, GAN.


## News
***2021-04-04***
### Added new kernel method to `LinearAttention`
The previous `LinearAttention` used the implementation from [Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention](https://arxiv.org/abs/2006.16236), which uses `elu`.
`elu` as a kernel may not perform well in certain tasks (e.g. ASR), as documented in the paper.

Hence, the new kernel trick uses `softmax` from [Efficient Attention: Attention with Linear Complexities](https://arxiv.org/abs/1812.01243).



***2020-09-06***
### Added `sequence.pad_ctc_labels`
Pads the ctc label sequence to the same sequence length as the network output.
This should be used when the final sequence length of the network output cannot be determined
beforehand during the pre-processing of the ctc_labels. Thus, the padding is done during training runtime
instead of during the data pipeline processing.

The padding token is the last sequence element of `ctc_labels`. `ctc_labels` should be
a one-hot encoded vector sequence. The padding token will have the value of 1 in its one-hot encoded vector.


***2020-06-20***
### Added `LinearAttention` and `LinearAttentionModel`
Added a cntk implementation of `LinearAttention` and `LinearAttentionModel`.
Standard dot-product attention found in `Transformer` is quadratic in time and gpu memory complexity
with respect to sequence length. The `C.layers.AttentionModel` is also quadratic.

However, `LinearAttention` is a linearised form of dot-product attention. It is linear in time and gpu memory complexity
with respect to sequence length. This is especially significant since `cntk` doesn't have the gradient checkpointing
functionality found in other frameworks like `tensorflow` and `pytorch`, which saves gpu memory at the expense of some
computation overhead. Now, with `LinearAttention`, training of `Transformer` can be done in `cntk` since the
gpu memory requirement is drastically reduced.

`LinearAttentionModel` is a wrapper for `LinearAttention` in the style of `C.layers.AttentionModel`.

For more details refer to [Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention](https://arxiv.org/abs/2006.16236) by Katharopoulos et al.

Example:

    a = C.sequence.input_variable(24)
    b = LinearAttention(hidden_dim=32, model_dim=24)(a, a, a)

    assert b.shape == (32, )


***2020-06-20***
### Added `sequence.pad_to`
`sequence.pad_to` pads a shorter sequence to the same sequence length as another longer sequence.
This is especially useful in CTC training where both sequences need to have the same sequence length.

Example:

    ax1 = C.Axis.new_unique_dynamic_axis('ax1')
    ax2 = C.Axis.new_unique_dynamic_axis('ax2')
    a = C.sequence.input_variable(3, sequence_axis=ax1)
    b = C.sequence.input_variable(6, sequence_axis=ax2)

    c = Cx.sequence.pad_to(a, b)  # pad to same sequence length

    ctc_token = C.Constant(np.array([0, 0, 1]))
    d = C.element_select(C.equal(c, 0), ctc_token, c)  # swap padded zeros with ctc token


***2020-06-19***
### Added `SIREN`
`SIREN` leverages periodic activation functions for implicit neural representations and demonstrates
that these networks are ideally suited for representing complex natural signals and their derivatives.

The results are incredible. It is highly recommended that you check out their [project page](https://vsitzmann.github.io/siren/).

More details can be found in their paper [Implicit Neural Representations with Periodic Activation Functions](https://arxiv.org/abs/2006.09661)



***2020-05-01***
### Added `SEBlock` and `SequenceSEBlock`
`SEBlock` and `SequenceSEBlock` are squeeze-excitation blocks that adaptively recalibrate channel-wise feature
responses by explicitly modelling interdependencies between channels. It was shown that these blocks can be
stacked together to form SENet architectures that generalise extremely effectively across different datasets.

It was the winner of the ILSVRC 2017 classification submission. More details can be found in the
paper [Squeeze-and-Excitation Networks](https://arxiv.org/abs/1709.01507) by Jie Hu et al.


***2020-04-20***
### Added `SequenceDropout`
`SequenceDropout` drops out entire sequence elements along the dynamic sequence axis.



***2020-04-15***
### Added `GaussianAttentionSeqImage`
`GaussianAttentionSeqImage` is a 2d spatial gaussian attention.
To use it, the encoded image should be formulated as a cntk sequence (example below). This can be useful when
you are constrained by gpu memory, as 2d gaussian attention is more memory efficient than standard attention.

This is from the deepmind paper, DRAW: A Recurrent Neural Network for Image Generation by Gregor et al.
More details can be found at https://arxiv.org/abs/1502.04623

Example:

    n = 5
    num_channels = 3
    image_height = 64
    expected_image_width = 1000
    image_seq = C.sequence.input_variable((num_channels, image_height))  # rgb image with variable width and fixed height
    decoder_hidden_state = ...  # from decoder somewhere in the network
    attended_image = Cx.layers.GaussianAttentionSeqImage(n, image_height, expected_image_width)(image_seq, decoder_hidden_state)

    assert attended_image.shape == (num_channels, n, n)


***2020-03-29.***
#### Added `generalised_robust_barron_loss` and `sample_with_bias`
`generalised_robust_barron_loss` is a generalisation of the Cauchy/Lorentzian,
Geman-McClure, Welsch/Leclerc, generalized Charbonnier, Charbonnier/pseudo-Huber/L1-L2, and
L2 loss functions.

It can be used as a drop-in replacement in any regression task that you have.

For more details, please refer to [A General and Adaptive Robust Loss Function](https://arxiv.org/abs/1701.03077), Jonathan T. Barron,
or this [video](https://www.youtube.com/watch?v=BmNKbnF69eY).
It was a Best Paper Award Finalist at CVPR 2019.


Implemented `sample_with_bias` to sample more likely classes as a replacement
for `sample_top_k`, which cannot be used inside an `UnfoldFrom`.
`sample_with_bias` reduces sampling variance, especially when you have a long tail.



***2020-02-11.***
#### Added `Boom`
The Boom layer is from SHA-RNN by S. Merity, creator of the QRNN. It is an alternative to, and serves the same function as,
the PositionwiseFeedForward layer found in the transformer.

The Boom layer minimizes computation and removes an entire matrix of parameters compared to traditional down-projection layers.

For more detail please read: [Single Headed Attention RNN: Stop Thinking With Your Head](https://arxiv.org/abs/1911.11423) by Stephen Merity.


***2019-12-03.***
#### Added `FilterResponseNormalization` and `ThresholdedLinearUnit`
Added a cntk implementation of `FilterResponseNormalization`. The Filter Response Normalization (FRN) layer
is a novel combination of a normalization and an activation function
that can be used as a drop-in replacement for other normalizations and activations.

The method operates on each activation map of each batch sample
independently, eliminating the dependency on other batch samples or channels of the same sample.
The method outperforms BN and all alternatives in a variety of settings for all batch sizes.
The FRN layer performs ≈0.7−1.0% better on top-1 validation accuracy than BN with large mini-batch sizes on
Imagenet classification on InceptionV3 and ResnetV2-50 architectures. Further, it performs >1% better
than GN on the same problem in the small mini-batch size regime. For the object detection problem on the COCO dataset,
the FRN layer outperforms all other methods by at least 0.3−0.5% in all batch size regimes.

Please refer to the paper [Filter Response Normalization Layer: Eliminating Batch Dependence
in the Training of Deep Neural Networks](https://arxiv.org/abs/1911.09737v1) for more details.



***2019-11-04.***
#### Added `ops.floor_division`, `ops.remainder`, `sequence.window_causal` and `SequentialDense`
Added the new operations stated above. `sequence.window` now has an additional argument that lets you control striding.
`sequence.window_causal` creates a causal window that doesn't leak future information into the past (preserving causality).
`SequentialDense` is a convenience layer added to apply a dense layer to a window of sequence items,
much like `SequentialConvolution` but with better memory performance.



***2019-11-04.***
#### Added `SequentialConcatPooling`, `Cx.Sequence.reduce_concat_pool` and `GlobalConcatPooling`
`Cx.Sequence.reduce_concat_pool` concatenates the last item in the sequence axis with a summarisation
of the sequence represented by the `reduce_max` and `reduce_mean` of the sequence axis. Anytime `C.sequence.last` is used,
this can be a drop-in replacement.

Example:

    n = 32
    a = C.sequence.input_variable(n)
    b = Cx.sequence.reduce_concat_pool(a)

    assert b.shape == (n * 3, )


`SequentialConcatPooling` does spatial concat pooling over the sequential axis.
Concat pooling is the concatenation of both average pooling and max pooling. In any situation where max or average
pooling is appropriate, concat pooling can be used as a drop-in replacement and achieve improvements in performance.

Example:

    a = C.sequence.input_variable((3, 10))
    b = SequentialConcatPooling(filter_shape=(2, 2), strides=2)(a)

    assert b.shape == (6, 10)

    n = [np.random.random((3, 10)).astype(np.float32),
         np.random.random((3, 10)).astype(np.float32),
         np.random.random((3, 10)).astype(np.float32),
         np.random.random((3, 10)).astype(np.float32), ]

    print(b.eval({a: n}))


`GlobalConcatPooling` is the standard spatial concat pooling of both max pool and average pool.


***2019-10-15.***
#### Added `mish` activation function and `RAdam` learner.
`mish` is an activation function introduced in [Mish: A Self Regularized Non-Monotonic Neural Activation Function](https://arxiv.org/abs/1908.08681v2)
by Diganta Misra. Experiments show that `mish` tends to work better than both ReLU and Swish along with other standard
activation functions in many deep networks across challenging datasets. For instance,
in Squeeze Excite Net-18 for CIFAR 100 classification, the network with Mish had an increase in
Top-1 test accuracy by 0.494% and 1.671% as compared to the same network with Swish and ReLU respectively.
The similarity to Swish, along with the boost in performance and its simplicity of implementation,
makes it easier for researchers and developers to use Mish in their neural network models.

This activation function is adopted in `fast ai`.

Rectified Adam or `RAdam` is an `Adam` optimiser variant that doesn't require a warmup schedule, which `Adam` tends
to need to maintain stability. In this cntk implementation, we added a `RAdam`-like optimiser based
on the work of [On the adequacy of untuned warmup for adaptive optimization](https://arxiv.org/abs/1910.04209) by Jerry Ma and Denis Yarats.

`RAdam` is adopted in `fast ai` too.



***2019-09-30.***
#### Improved `BiRecurrence` weight tying
Made an improvement to the weight tying of `BiRecurrence` by having one parameter tensor token for every state
in the step function per direction (forward and backward). This allows the forward and backward tokens
to have more representational flexibility. Previously, all states used the same forward or backward token.


***2019-09-06.***
#### Added `BiRecurrence` with weight tying
Added a wrapper to create a bidirectional recurrent layer using `BiRecurrence`. Included in the implementation
is an option to halve the number of parameters required by a bidirectional recurrent layer. This is done
by only using one recurrent unit to do both the forward and backward computation instead of the usual two.
A forward and a backward token are used to initialise the hidden state so that the recurrent unit can tell
the directionality.

More details can be found in the paper [Efficient Bidirectional Neural Machine Translation](https://arxiv.org/abs/1908.09329)

Example:

    a = C.sequence.input_variable(10)
    b = BiRecurrence(LSTM(100), weight_tie=True)(a)

    assert b.shape == (200, )


***2019-08-29.***
#### Added `PretrainedWikitext103LanguageModel`
A CNTK implementation of Fast AI's Universal Language Model Fine-tuning (ULMFiT) English model has been added.
This language model was trained on Wikitext-103 and can be used as a base model for any downstream language task.

It is also much more efficient to run compared to BERT and other Transformer language models.

For more details, you can refer to the original paper [here](https://arxiv.org/abs/1801.06146)

Example:

    vocab_size = 238462
    converted_hdf5_model_file_path = 'PATH/fwd_wt103.hdf5'  # this is not the original pytorch model
    lm = PretrainedWikitext103LanguageModel(converted_hdf5_model_file_path)

    a = C.sequence.input_variable(vocab_size)
    prediction = lm(a)  # next-word-prediction
    features = prediction.features  # features of tokens

    assert prediction.shape == (vocab_size, )
    assert features.shape == (400, )

| Model | Description |
| --- | ---|
| [fwd_wt103.hdf5](https://1drv.ms/u/s!AjJ4XyC3prp8mItNxiawGK4gD8iMhA?e=wh7PLB) | The weight parameters of fastai's pytorch model. To be used to initialise `PretrainedWikitext103LanguageModel` |
| [fwd_wt103.cntk](https://1drv.ms/u/s!AjJ4XyC3prp8mItPBdfmDYr9QP7J4w?e=k1BXlW) | The converted cntk model of fastai's pytorch model. To be used with `C.load_model` |
| [fwd_wt103.onnx](https://1drv.ms/u/s!AjJ4XyC3prp8mItO70T_q8HOPwa6aQ?e=h2Fiv5) | The converted ONNX model of fastai's pytorch model. |
| [itos_wt103.pkl](http://files.fast.ai/models/wt103/) | Tokens used in the pretrained model |


***2019-08-08.***
#### Added `cntkx.ops.gelu` and `cntkx.ops.gelu_fast`
Added two cntk implementations of the `gelu` activation function. The `gelu` activation is used in `BERT`
and OpenAI's `GPT` and `GPT-2`.

The Gaussian Error Linear Unit (GELU) is a high-performing neural network activation function.
The GELU nonlinearity is the expected transformation of a stochastic regularizer which randomly
applies the identity or zero map to a neuron’s input. The GELU nonlinearity weights inputs by their
magnitude, rather than gating inputs by their sign as in ReLUs.

For more detail please refer to [Gaussian Error Linear Units (GELU)](https://arxiv.org/abs/1606.08415)
by Hendrycks and Gimpel.


***2019-07-04.***
#### Added `cntkx.ops.sequence.reduce_mean`
Calculates the mean along the dynamic sequential axis in CNTK.


***2019-06-26.***
#### Added `cntkx.ops.sequence.reverse`
Allows the sequence items along the sequence axis to be reversed. This is useful when you want to create a
bi-directional auto-regressive layer, because using `UnfoldFrom` with `Recurrence(go_backwards=True)` does not work and
will result in a ValueError.


Example:

    import cntk as C
    import cntkx as Cx
    import numpy as np
    from cntk.layers import Recurrence, UnfoldFrom, LSTM

    hidden_dim = 50
    start_token = C.Constant(0, shape=(hidden_dim,))
    a = C.sequence.input_variable(1, name='seq1')

    b = UnfoldFrom(Recurrence(LSTM(hidden_dim), go_backwards=True))(start_token, a)

    n = [np.random.random((10, 1)).astype(np.float32),]

    # This raises 'ValueError: It is not allowed to have multiple different stepping directions in the same loop'
    b.eval({b.arguments[0]: n})


The workaround would be:

    import cntk as C
    import cntkx as Cx
    import numpy as np
    from cntk.layers import Recurrence, UnfoldFrom, LSTM

    hidden_dim = 50
    start_token = C.Constant(0, shape=(hidden_dim,))
    a = C.sequence.input_variable(1, name='seq1')
    a_reversed = Cx.sequence.reverse(a)

    b = UnfoldFrom(Recurrence(LSTM(hidden_dim)))(start_token, a_reversed)  # remove go_backwards=True

    n = [np.random.random((10, 1)).astype(np.float32),]
    b.eval({b.arguments[0]: n})  # this will run just fine



***2019-06-21.***
#### Added `cntkx.ops.random.sample_top_k`
A CNTK implementation of an op that allows sampling from the top_k of an unnormalised log-probability distribution.
This is useful in text (or sequence) generation where it is known that greedy decoding will cause
text degeneration.

For more details on this, please refer to [A curious case of neural text degeneration](https://arxiv.org/abs/1904.09751)


Example:

    import cntk as C
    import cntkx as Cx
    import numpy as np

    a = C.input_variable(5)
    b = Cx.random.sample_top_k(a, k=3, num_classes=5)

    n = np.array([[1, 2, 3, 4, 5],] * 1000)

    results = b.eval({a: n})
    assert np.sum(results[:, :2]) == 0
    assert np.sum(results[:, 2:]) == 1000

***2019-05-04.***
#### Added `cntkx.layers.vFSMN` and `cntkx.layers.cFSMN`
A CNTK implementation of the bidirectional vectorised Feedforward Sequential Memory Network (vFSMN)
and the Compact Feedforward Sequential Memory Network (cFSMN).

The FSMN is a standard fully-connected feedforward neural network equipped
with some learnable memory blocks in its hidden layers. The memory blocks
use a tapped-delay line structure to encode the long context information into
a fixed-size representation as a short-term memory mechanism.

The authors claim that FSMNs can be learned much more reliably and faster than
RNNs or LSTMs due to the inherent non-recurrent model structure, while significantly
outperforming RNNs in language and speech modeling.

For more details please refer to [Feedforward Sequential Memory Networks: A
New Structure to Learn Long-term Dependency](https://arxiv.org/abs/1512.08301)

cFSMN is a compact version of FSMN that can result in a reduction of up
to 60% in model size and speed up learning by more than 7 times, while
still significantly outperforming the popular bi-directional LSTMs for both
frame-level cross-entropy (CE) criterion based training and MMI based sequence training.

For more details please refer to "Compact Feedforward Sequential Memory Networks for
Large Vocabulary Continuous Speech Recognition" by Zhang, et al.

Example:

    import cntk as C
    from cntkx.layers import vFSMN, cFSMN

    # unidirectional vFSMN (only past context used)
    a = C.sequence.input_variable(10)
    b = vFSMN(100, C.relu, num_past_context=3, num_future_context=0)(a)

    assert b.shape == (100,)

    # bidirectional vFSMN (enable both past and future context)
    a = C.sequence.input_variable(10)
    b = vFSMN(120, C.relu, num_past_context=3, num_future_context=3)(a)

    assert b.shape == (120,)

    # bidirectional cFSMN (enable both past and future context)
    a = C.sequence.input_variable(100)
    b = cFSMN(120, 50, C.relu, num_past_context=3, num_future_context=3)(a)

    assert b.shape == (120,)


***2019-04-19.***
#### Added `cntkx.misc.CTCEncoder`
CNTK's CTC implementation requires that data be formatted in a particular way that's typical in acoustic
modeling but unusual in other applications. This class provides an easy way to convert data between
what users typically expect and what cntk demands.

Example:

    labels = ['a', 'b', 'c']
    encoder = CTCEncoder(labels)

    labels_tensor = C.sequence.input_variable(len(encoder.classes_))  # number of classes = 4
    input_tensor = C.sequence.input_variable(100)

    labels_graph = C.labels_to_graph(labels_tensor)
    network_out = model(input_tensor)

    fb = C.forward_backward(labels_graph, network_out, blankTokenId=encoder.blankTokenId)

    ground_truth = ['a', 'b', 'b', 'b', 'c']
    seq_length = 10  # must be the same length as the sequence length in network_out

    fb.eval({input_tensor: [...],
             labels_tensor: [encoder.transform(ground_truth, seq_length=seq_length)]})



***2019-04-14.***
#### Added `Label Smoothing Regularization`, `sequence.window` and `PyramidalBiRecurrence`
Added `Label Smoothing Regularization` to `cross_entropy_with_softmax`.
Added `sequence.window`, which creates non-overlapping windows along the sequence axis, thereby reducing the
sequence length and increasing the dimension by the same factor.

Implemented a convenience layer used in acoustic modelling known as `PyramidalBiRecurrence`. Used to create
the pyramidal bi-directional LSTM (BLSTM) found in "Listen, attend and spell" by Chan et al. (https://arxiv.org/abs/1508.01211)
Typically used to down-sample the sequence length to keep memory and runtime manageable.


***2019-04-08.***
#### Added `cntkx.ops.sequence.join`
Added a new `op` called `join` where two sequence tensors can be joined along their sequence axis, forming a longer sequence.


***2019-04-08.***
#### Added `cntkx.layers.SequentialAveragePooling`
Added an average pooling layer that works with the sequential axis. The current cntk `AveragePooling` doesn't pool across sequence elements.

Example on `cntkx.layers.SequentialAveragePooling`

    # rgb image of height 25 and variable width
    a = C.sequence.input_variable((3, 25))

    # Convolve across the image with a (3, 3) kernel with stride (1, 1)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, stride=(1, 1), pad=True)(a)

    assert b.shape == (16, 25)

    # average pool (2,2) in height and width with stride (2,2) in height and width, no padding
    c = SequentialAveragePooling(filter_shape=(2, 2), strides=(2, 2), pad=False)(b)

    assert c.shape == (16, 12)


***2019-04-07.***
#### Added `cntkx.sequence.stride` and `cntkx.ops.scalar`
`Cx.sequence.stride` enables striding across the sequential axis, selecting items at a fixed integer interval along the sequence.
`Cx.scalar` converts a tensor into a scalar of shape `(1,)`



***2019-04-06.***
#### Added `IndyLSTM` and `IndRNN`
CNTK implementations of [Independently Recurrent Long Short-term Memory cells: IndyLSTMs](https://arxiv.org/abs/1903.08023)
by Gonnet and Deselaers, and [Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN](https://arxiv.org/abs/1803.04831)
by Li, et al.

Both `IndyLSTM` and `IndRNN` have hidden-to-hidden weights that are diagonal matrices instead of the usual full matrices.
Thus neurons in each layer are independent from each other, and the cross-channel information is
obtained through stacking multiple layers.

`IndRNN` allows for the use of the `C.relu` activation, thus allowing multiple `IndRNN` layers to be stacked together deeply.

`IndyLSTM` has a parameter count linear in the number of nodes in the layer, as opposed to the standard LSTM which is quadratic,
making `IndyLSTM` potentially faster and smaller as a model.

Authors of both `IndRNN` and `IndyLSTM` have claimed performance as good as or even better than regular LSTMs.
+ +Example: + + import cntk as C + from cntkx.layers import IndyLSTM, IndRNN, Recurrence + + a = C.sequence.input_variable(10) + b = Recurrence(IndRNN(20))(a) + c = Recurrence(IndyLSTM(20))(a) + + assert b.shape == c.shape == (20,) + + + +***2019-03-24.*** +#### Added `cntkx.layers.SequentialMaxPooling` +Add max pooling layer that works with sequential axis. Current cntk `MaxPooling` doesn't pool across sequence elements. + +Example on `cntkx.layers.SequentialMaxPooling` + + # rgb image of height 25 and variable width + a = C.sequence.input_variable((3, 25)) + + # Convolute across image with (3, 3) kernel with stride (1, 1) + b = C.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, stride=(1, 1), pad=True)(a) + + assert b.shape == (16, 25) + + # max pool (2,2) in height and width with stride (2,2) in height and width, no padding + c = SequentialMaxPooling(filter_shape=(2, 2), strides=(2, 2), pad=False)(b) + + assert c.shape == (16, 12) + + +***2019-03-18.*** +#### Added `cntkx.learners.CyclicalLearningRate` +Cyclical learning rate (CLR) is an implementation to that practically eliminates the need to +experimentally find the best values and schedule for the global learning rates. + +Instead of monotonically decreasing the learning rate, this method lets the learning +rate cyclically vary between reasonable boundary values. Training with +cyclical learning rates instead of fixed values achieves improved classification +accuracy without a need to tune and often in fewer iterations. + +More details can be found in [Cyclical Learning Rates for Training Neural Networks](https://arxiv.org/abs/1506.01186) +by Leslie N. Smith + +This CLR implementation can be used with the cntk training loop by adding only ***two lines of code***: + + model = C.layers.Dense(10)(C.input_variable(10)) + sgd_momentum = C.momentum_sgd(model.parameters, 0.1, 0.9) + clr = CyclicalLeaningRate(sgd_momentum, minibatch_size=32) # first line of code + + for epoch in range(10): + for batch in range(100): + trainer.train_minibatch(...) + clr.batch_step() # second line of code (to be called after every training update) + + + + +***2019-03-12.*** +#### Added `cntkx.ops.batchmatmul` +Added Batch Matrix Multiplication. This implementation is similar +to [tensorflow.matmul](https://www.tensorflow.org/api_docs/python/tf/linalg/matmul). + +Example: + + a = C.sequence.input_variable((3, 4, 5)) # batch matrix + b = C.sequence.input_variable((3, 5, 6)) # batch matrix + c = Cx.batchmatmul(a, b) + assert c.shape == (3, 4, 6) # 3 is treated as a batch axis + + + +***2019-03-10.*** +#### Added `PretrainedBertEncoder` and `PretrainedBertModel` +BERT, the state-of-the-art language model is now available as a CNTK pretrained model. + +Currently, it is only tested to work with `BERT-Base, Uncased` (uncased_L-12_H-768_A-12) and can be +downloaded from [Google AI](https://github.com/google-research/bert) + +When you have downloaded `BERT-Base, Uncased`, there should be 5 files inside. You will need to `.zip` +three of those files into a tensorflow checkpoint file before you can load it into `cntkx`. + +Those three files are: `bert_model.ckpt.data-00000-of-00001`, `bert_model.ckpt.index`, `bert_model.ckpt.meta`. +Then rename the extension of `.zip` into `.ckpt` and you are good to go. 
+ +Example below + + text_tensor = C.sequence.input_variable(30522) + token_type_tensor = C.sequence.input_variable(2) + filepath_to_tf_bert_model = "YOUR_FILE_DIRECTORY/bert_model.ckpt" + + model = Cx.layers.PreTrainedBertModel(filepath_to_tf_bert_model, num_heads=12, dropout_rate=0.1) + b = model(text_tensor, token_type_tensor) + + assert b.shape == (768,) + +For more details about BERT, you can find the original paper [here](https://arxiv.org/abs/1810.04805), +and some useful resources [here](https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270) +and [here](http://jalammar.github.io/illustrated-bert/). + +Note: +It goes without saying also that to use these pre-trained models you will need to have tensorflow installed +since we are convert them from tensorflow models. + + +***2019-03-06.*** +#### Added `PositionalEmbedding`, `BertEmbeddings` and `PretrainedBertEmbeddings` +CNTK implementation of `PositionalEmbedding`, `BertEmbeddings` and tf-to-cntk `PreTrainedBertEmbeddings`. +BERT is a state-of-the-art language model from Google AI, more details can be found in +[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). + +Google AI's pre-trained BERT tensorflow model can be downloaded [here](https://github.com/google-research/bert). +Tensorflow would need to be installed in your environment if you intend to use `PreTrainedBertEmbeddings`, which +takes a tensorflow model and convert it cntk. + +Example for `PositionalEmbedding` + + a = C.sequence.input_variable(12) + b = PositionalEmbedding(max_seq_length, hidden_dim)(a) + + assert b.shape == (hidden_dim, ) + +Example for `BertEmbeddings` + + text_tensor = C.sequence.input_variable(100) + token_type_tensor = C.sequence.input_variable(2) + b = BertEmbeddings(max_seq_length, hidden_dim, 0.1)(text_tensor, token_type_tensor) + + assert b.shape == (hidden_dim, ) + +Example for `PreTrainedBertEmbeddings` + + text_tensor = C.sequence.input_variable(30522) + token_type_tensor = C.sequence.input_variable(2) + filepath_to_tf_bert_model = "YOURFILEPATH" + embeddings = PreTrainedBertEmbeddings(filepath_to_tf_bert_model, 0.1, False) + b = embeddings(text_tensor, token_type_tensor) + + assert b.shape == (768, ) + + +***2019-03-02.*** +#### Added `VariationalDropout` and `WeightDroppedLSTM` +CNTK implementation of `Variational Dropout` found in +[A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](https://arxiv.org/abs/1512.05287) +and `Weight Dropped LSTM` proposed in a salesforce research paper +[Regularizing and Optimizing LSTM Language Models](https://arxiv.org/abs/1708.02182). + +`Weight Dropped LSTM` is a regularised LSTM that uses DropConnect on hidden-to-hidden weights as a form of recurrent +regularisation. It also include application of variational dropout on the inputs and outputs of the recurrent units +for further regularisation. + +`Variational Drpoout` is a regularisation that uses same dropout mask at each time step +(i.e. across the dynamic sequence axis) as opposed to the naive application of `C.layers.Dropout` to a sequence +which will result in a different dropout mask for every tensor along the sequence axis. 
+ + + import cntkx as Cx + from cntkx.layers import Recurrence, WeightDroppedLSTM + import cntk as C + + dropconnect_rate = 0.2 + variationaldrop_rate = 0.1 + + seq = C.sequence.input_variable(56) + b = Recurrence(WeightDroppedLSTM(20, dropconnect_rate), + variational_dropout_rate_input=variationaldrop_rate, + variational_dropout_rate_output=variationaldrop_rate)(seq) + + assert b.shape == (100, ) + + seq_dropped = VariationalDropout(0.1)(seq) + + assert seq_dropped.shape == seq.shape + + +***2019-02-02.*** +#### Added Gated Linear Unit / Gated CNN +CNTK implementation of Gated Linear Unit (Gated CNN) founnd in Facebook AI Research Lab's paper: +[Language Modeling with Gated Convolutional Networks](https://arxiv.org/abs/1612.08083). +This paper applies a convolutional approach to language modelling with a novel Gated-CNN model. + + import cntkx as Cx + import cntk as C + + seq = C.sequence.input_variable(56) + hidden = Cx.layers.GatedLinearUnit(window=2, hidden_dim=100)(seq) + + assert hidden.shape == (100, ) + + +***2019-01-21.*** +#### Added `Focal Loss` for multi-class and binary classification +CNTK implementation of `Focal Loss` enables the training of highly accurate dense object detectors in the +presence of vast numbers of easy background examples or dataset with extreme class imbalance (e.g. 1:1000). + +`Focal Loss` focuses training on a sparse set of hard examples and prevents the vast number of easy negatives from +overwhelm-ing the model during training. + +For more details please refer to [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) + + import cntkx as Cx + + Cx.focal_loss_with_softmax([[0, 0, 0.8, 0.2]], [[0, 0, 1, 0]]).eval() + array([[0.31306446]], dtype=float32) + + + +***2019-01-18.*** +#### Added Gaussian Window Attention Model +Gaussian window attention model was first introduced by Alex Graves in +"Generating sequences with recurrent neural networks". + +It uses a mixture of gaussian windows to attend to +portions of the sequence as oppose to the widely used attention model introduced in +"Neural machine translation by jointly learning to align and translate" by Bahdanau, et al. that attends +to the entire sequence all at once. + +Gaussian window attention is also directional in its attention on the context sequence. When modeling +strongly ordered sequences, gaussian window attention will be a natural choice due to this inductive bias. + + import cntk as C + import cntkx as Cx + + seq1 = C.Axis.new_unique_dynamic_axis('seq1') + seq2 = C.Axis.new_unique_dynamic_axis('seq2') + + encoded = C.sequence.input_variable(30, sequence_axis=seq1) + query = C.sequence.input_variable(28, sequence_axis=seq2) + + a = Cx.layers.GaussianWindowAttention(10)(encoded, query) + + assert a.shape == (30, ) + +"Generating sequences with recurrent neural networks" can be found [here](https://arxiv.org/abs/1308.0850). +"Neural machine translation by jointly learning to align and translate" can be found [here](https://arxiv.org/abs/1409.0473). + +***2019-01-16.*** +#### Added Spatial Pyramid Pooling layer +Spatial pyramid pooling layer is a pooling layer than returns a fixed length representation regardless of the +image size/scale. It is frequently used for multi-size image training. It reported SOTA classification results using +a single full-image representation without fine-tuning. For more details on the paper +"Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition" by K. He, X. Zhang, S. Ren, J. 
Sun, +link [here](https://arxiv.org/abs/1406.4729). + + import cntk as C + import cntkx as Cx + + n = np.random.random((3, 3, 32, 32)).astype(np.float32) + a = C.input_variable((3, 32, 32)) + b = Cx.layers.SpatialPyramidPooling((1, 2, 4))(a) + + assert b.shape == (3 * (4 * 4 + 2 * 2 + 1),) # representation not dependent on image size + + +***2019-01-15.*** +#### Added Sinusoidal Positional Embedding and `cntkx.ops.erf` +Added sinusoidal positional embedding used in [Transformer](https://arxiv.org/abs/1706.03762). For an accessible +explanation of transformer, you may look up [here](http://jalammar.github.io/illustrated-transformer/). + + import cntk as C + import cntkx as Cx + + a = C.sequence.input_variable(10) + b = SinusoidalPositionalEmbedding()(a) + + assert b.shape == (10, ) + +Added `cntkx.ops.erf` error function. + +***2019-01-12.*** +#### Added Vision models: VGG16, VGG19 and UNET +VGG is for image classification and UNET is for semantic segmentation. VGG is implemented for completeness +sake and should not be used for any serious classification task. + + +Paper on VGG can be found [here](https://arxiv.org/abs/1409.1556) titled "Very Deep Convolutional Networks +for Large-Scale Image Recognition" + +Paper for UNET can be found [here](https://arxiv.org/abs/1505.04597) titled "U-Net: Convolutional Networks +for Biomedical Image Segmentation" + +VGG example: + + import cntk as C + import cntkx as Cx + + a = C.input_variable((3, 64, 64)) + b = Cx.layers.VGG19(100)(a) + + assert b.shape == (100,) + +UNET example: + + import cntk as C + import cntkx as Cx + + a = C.input_variable((3, 512, 512)) + b = Cx.layers.UNET(num_classes=10, base_num_filters=64, pad=True)(a) + + assert b.shape == (10, 512, 512) + +Convenience functions such as `cntkx.ops.upsample` and `centre_crop` have also been added. +`cntkx.ops.upsample` upsamples an image twice on each spatial dim. `centre_crop` crops a smaller image from +a bigger one in the centre given a reference smaller image. + + +#### Added Transformer attention model and associated components +The Transformer was first introduced in the [paper](https://arxiv.org/abs/1706.03762) 'Attention is all you need'. +The architecture is based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. +More recently, [BERT](https://arxiv.org/abs/1810.04805) which broke almost all SOTA language task is also based on +transformer and self-attention. + + import cntk as C + import cntkx as Cx + + a = C.sequence.input_variable(512) + b = C.sequence.input_variable(512) + + transformer = Cx.layers.Transformer() # using default settings + decoded = transformer(a, b) + + assert decoded.shape == (512, ) + + +***2018-12-08.*** +#### Added QRNN: Quasi-Recurrent Neural Network (QRNN) and `cntkx.ops.cumsum` +The QRNN provides similar accuracy to the LSTM but can be betwen 2 and 17 times faster than the +highly optimized NVIDIA cuDNN LSTM implementation depending on the use case. + +More details please refer to the original paper [here](https://arxiv.org/abs/1611.01576). + + import cntk as C + import cntkx as Cx + + input_seq = C.sequence.input_variable(input_dim) + prediction_seq = Cx.layers.QRNN(hidden_dim=50)(input_seq) + + + +***2018-12-07.*** +#### New sequence ops: `cntkx.ops.sequence.pad` and `cntkx.ops.sequence.length` +Added two new sequence ops. `cntkx.ops.sequence.pad` allows padding on the sequence axis and +`cntkx.ops.sequence.length` calculates the length of the sequence. 
+ +***2018-12-05.*** +#### Mixture Density Network +Mixture density networks are neural networks that can in principle represent arbitrary conditional +probability distributions in the same way that a conventional neural network can represent arbitrary functions. +MDN are very useful when you need to map an input to several correct targets (aka. one-to-many problem). + +Updated with Gaussian Mixture Density Network ops and loss function. Ops will allow you to extract mdn coefficients and sample from the network. + +More details on mdn can be found in this [paper](https://publications.aston.ac.uk/373/1/NCRG_94_004.pdf) by Christopher Bishop. + + import cntk as C + import cntkx as Cx + + input_tensor = C.input_variable(1, name="input_tensor") + target_tensor = C.input_variable(1, name="target_tensor") + + # model + inner = Dense(50, activation=C.relu)(input_tensor) + inner = Dense(50, activation=C.relu)(inner) + prediction_tensor = Dense((ndim + 2) * nmix, activation=None)(inner) + + sampled = Cx.sample_gaussian_mdn(prediction_tensor, nmix, ndim) # sampling node + loss = Cx.gaussian_mdn_loss(prediction_tensor, target_tensor, nmix=nmix, ndim=ndim) # loss function + + + +%prep +%autosetup -n cntkx-0.1.54 + +%build +%py3_build + +%install +%py3_install +install -d -m755 %{buildroot}/%{_pkgdocdir} +if [ -d doc ]; then cp -arf doc %{buildroot}/%{_pkgdocdir}; fi +if [ -d docs ]; then cp -arf docs %{buildroot}/%{_pkgdocdir}; fi +if [ -d example ]; then cp -arf example %{buildroot}/%{_pkgdocdir}; fi +if [ -d examples ]; then cp -arf examples %{buildroot}/%{_pkgdocdir}; fi +pushd %{buildroot} +if [ -d usr/lib ]; then + find usr/lib -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/lib64 ]; then + find usr/lib64 -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/bin ]; then + find usr/bin -type f -printf "/%h/%f\n" >> filelist.lst +fi +if [ -d usr/sbin ]; then + find usr/sbin -type f -printf "/%h/%f\n" >> filelist.lst +fi +touch doclist.lst +if [ -d usr/share/man ]; then + find usr/share/man -type f -printf "/%h/%f.gz\n" >> doclist.lst +fi +popd +mv %{buildroot}/filelist.lst . +mv %{buildroot}/doclist.lst . + +%files -n python3-cntkx -f filelist.lst +%dir %{python3_sitelib}/* + +%files help -f doclist.lst +%{_docdir}/* + +%changelog +* Mon May 15 2023 Python_Bot - 0.1.54-1 +- Package Spec generated diff --git a/sources b/sources new file mode 100644 index 0000000..fcb356d --- /dev/null +++ b/sources @@ -0,0 +1 @@ +6e42761f976a4fe233b2570ba3edea2c cntkx-0.1.54.tar.gz -- cgit v1.2.3