diff --git a/README.rst b/README.rst
index 565bbbc..6d65800 100644
--- a/README.rst
+++ b/README.rst
@@ -42,7 +42,7 @@ It can also be used with a python environment in the following manner:
     from keras2c import k2c
     k2c(model, function_name, malloc=False, num_tests=10, verbose=True)
 
-For more information, see `Installation <https://f0uriest.github.io/keras2c/installation.html>`_ and  `Usage <https://f0uriest.github.io/keras2c/usage.html>`_
+For more information, see `Installation <https://plasmacontrol.github.io/keras2c/installation.html>`_ and  `Usage <https://plasmacontrol.github.io/keras2c/usage.html>`_
 
 
 Supported Layers
@@ -73,9 +73,9 @@ ToDo
 Contribute
 **********
 
-- Documentation: `<https://f0uriest.github.io/keras2c/>`_
-- Issue Tracker: `<https://github.com/f0uriest/keras2c/issues>`_
-- Source Code: `<https://github.com/f0uriest/keras2c/>`_
+- Documentation: `<https://plasmacontrol.github.io/keras2c/>`_
+- Issue Tracker: `<https://github.com/plasmacontrol/keras2c/issues>`_
+- Source Code: `<https://github.com/plasmacontrol/keras2c/>`_
   
 License
 *******
@@ -83,14 +83,14 @@ License
 The project is licensed under the LGPLv3 license.
 
 
-.. |Build-Status| image:: https://travis-ci.org/f0uriest/keras2c.svg?branch=master
-    :target: https://travis-ci.org/f0uriest/keras2c
+.. |Build-Status| image:: https://travis-ci.org/plasmacontrol/keras2c.svg?branch=master
+    :target: https://travis-ci.org/plasmacontrol/keras2c
     :alt: Build Status
-.. |Codecov| image:: https://codecov.io/gh/f0uriest/keras2c/branch/master/graph/badge.svg
-    :target: https://codecov.io/gh/f0uriest/keras2c
+.. |Codecov| image:: https://codecov.io/gh/plasmacontrol/keras2c/branch/master/graph/badge.svg
+    :target: https://codecov.io/gh/plasmacontrol/keras2c
     :alt: Code Coverage
-.. |License| image:: https://img.shields.io/github/license/f0uriest/keras2c
-    :target: https://github.com/f0uriest/keras2c/blob/master/LICENSE
+.. |License| image:: https://img.shields.io/github/license/plasmacontrol/keras2c
+    :target: https://github.com/plasmacontrol/keras2c/blob/master/LICENSE
     :alt: License: LGPLv3
 .. |DOI| image:: https://zenodo.org/badge/193152058.svg
     :target: https://zenodo.org/badge/latestdoi/193152058
diff --git a/include/k2c_activations.c b/include/k2c_activations.c
index 909256e..14ce8c0 100644
--- a/include/k2c_activations.c
+++ b/include/k2c_activations.c
@@ -51,19 +51,15 @@ k2c_activationType * k2c_exponential = k2c_exponential_func;
 void k2c_relu_func(float * x, const size_t size) {
 
     for (size_t i=0; i < size; ++i) {
-        if (x[i] <= 0.0f) {
-            x[i] = 0.0f;
-        }
+        x[i] = x[i] > 0.0f ? x[i] : 0.0f;
     }
 }
 k2c_activationType * k2c_relu = k2c_relu_func;
 
 
 /**
- * ReLU activation function.
- *   y = {1          if      x> 2.5}
- *       {0.2*x+0.5  if -2.5<x< 2.5}
- *       {0          if      x<-2.5}
+ * Hard sigmoid activation function.
+ *   y = clip(x+3, 0, 6) / 6
  *
  * :param x: array of input values. Gets overwritten by output.
  * :param size: length of input array.
@@ -71,14 +67,15 @@ k2c_activationType * k2c_relu = k2c_relu_func;
 void k2c_hard_sigmoid_func(float * x, const size_t size) {
 
     for (size_t i=0; i < size; ++i) {
-        if (x[i] <= -2.5f) {
+        float val = x[i] + 3.0f;
+        if (val <= 0.0f) {
             x[i] = 0.0f;
         }
-        else if (x[i]>=2.5f) {
+        else if (val >= 6.0f) {
             x[i] = 1.0f;
         }
         else {
-            x[i] = 0.2f*x[i] + 0.5f;
+            x[i] = val / 6.0f;
         }
     }
 }
@@ -116,6 +113,24 @@ void k2c_sigmoid_func(float * x, const size_t size) {
 }
 k2c_activationType * k2c_sigmoid = k2c_sigmoid_func;
 
+/**
+ * swish activation function.
+ *   y = x * (1/(1+exp(-x)))
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_swish_func(float * x, const size_t size) {
+
+    for (size_t i = 0; i < size; ++i) {
+        float xv = x[i];
+        float v = xv;
+        if (v < -30.0f) v = -30.0f; // Clamp to avoid overflow
+        x[i] = xv / (1.0f + expf(-v));
+    }
+}
+k2c_activationType * k2c_swish = k2c_swish_func;
+
 
 /**
  * Soft max activation function.
@@ -186,18 +201,18 @@ k2c_activationType * k2c_softsign = k2c_softsign_func;
 /**
  * Leaky version of a Rectified Linear Unit.
  * It allows a small gradient when the unit is not active:
- *   y = {alpha*x    if x < 0}
+ *   y = {negative_slope*x    if x < 0}
  *       {x          if x >= 0}
  *
  * :param x: array of input values. Gets overwritten by output.
  * :param size: length of input array.
- * :param alpha: slope of negative portion of activation curve.
+ * :param negative_slope: slope of negative portion of activation curve.
  */
-void k2c_LeakyReLU(float * x, const size_t size, const float alpha) {
+void k2c_LeakyReLU(float * x, const size_t size, const float negative_slope) {
 
     for (size_t i=0; i<size; ++i) {
         if (x[i]<0) {
-            x[i] = alpha*x[i];
+            x[i] = negative_slope*x[i];
         }
     }
 }
@@ -277,11 +292,307 @@ void k2c_ReLU(float * x, const size_t size, const float max_value,
               const float alpha, const float theta) {
 
     for (size_t i=0; i<size; ++i) {
-        if (x[i] >= max_value) {
-            x[i] = max_value;
+        float val = x[i];
+        val = val < theta ? alpha*(val - theta) : val;
+        val = val > max_value ? max_value : val;
+        x[i] = val;
+    }
+}
+
+
+/**
+ * SELU (Scaled Exponential Linear Unit) activation function.
+ *   y = scale * (x          if x >= 0)
+ *   y = scale * (alpha*(exp(x)-1) if x <  0)
+ *   scale = 1.0507009873554805, alpha = 1.6732632423543772
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_selu_func(float * x, const size_t size) {
+
+    const float alpha = 1.6732632423543772f;
+    const float scale = 1.0507009873554805f;
+    for (size_t i=0; i < size; ++i) {
+        if (x[i] >= 0.0f) {
+            x[i] = scale * x[i];
+        }
+        else {
+            x[i] = scale * alpha * expm1f(x[i]);
+        }
+    }
+}
+k2c_activationType * k2c_selu = k2c_selu_func;
+
+
+/**
+ * ELU (Exponential Linear Unit) activation function (default alpha=1.0).
+ *   y = x              if x >= 0
+ *   y = exp(x) - 1     if x <  0
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_elu_func(float * x, const size_t size) {
+
+    for (size_t i=0; i < size; ++i) {
+        if (x[i] < 0.0f) {
+            x[i] = expm1f(x[i]);
+        }
+    }
+}
+k2c_activationType * k2c_elu = k2c_elu_func;
+
+
+/**
+ * GELU (Gaussian Error Linear Unit) activation function.
+ *   y = 0.5 * x * (1 + erf(x / sqrt(2)))
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_gelu_func(float * x, const size_t size) {
+
+    const float sqrt2_inv = 0.7071067811865475f;
+    for (size_t i=0; i < size; ++i) {
+        x[i] = 0.5f * x[i] * (1.0f + erff(x[i] * sqrt2_inv));
+    }
+}
+k2c_activationType * k2c_gelu = k2c_gelu_func;
+
+
+/**
+ * Hard SiLU (Hard Swish) activation function.
+ *   y = x * hard_sigmoid(x) = x * clip(x+3, 0, 6) / 6
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_hard_silu_func(float * x, const size_t size) {
+
+    for (size_t i=0; i < size; ++i) {
+        float val = x[i] + 3.0f;
+        if (val <= 0.0f) {
+            x[i] = 0.0f;
+        }
+        else if (val >= 6.0f) {
+            /* hard_sigmoid = 1, so x * 1 = x; leave unchanged */
+        }
+        else {
+            x[i] = x[i] * val / 6.0f;
+        }
+    }
+}
+k2c_activationType * k2c_hard_silu = k2c_hard_silu_func;
+
+
+/**
+ * Mish activation function.
+ *   y = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_mish_func(float * x, const size_t size) {
+
+    for (size_t i=0; i < size; ++i) {
+        float sp = x[i] > 20.0f ? x[i] : log1pf(expf(x[i]));
+        x[i] = x[i] * tanhf(sp);
+    }
+}
+k2c_activationType * k2c_mish = k2c_mish_func;
+
+
+/**
+ * ReLU6 activation function.
+ *   y = min(max(x, 0), 6)
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_relu6_func(float * x, const size_t size) {
+
+    for (size_t i=0; i < size; ++i) {
+        if (x[i] < 0.0f) {
+            x[i] = 0.0f;
+        }
+        else if (x[i] > 6.0f) {
+            x[i] = 6.0f;
+        }
+    }
+}
+k2c_activationType * k2c_relu6 = k2c_relu6_func;
+
+
+/**
+ * Log-softmax activation function.
+ *   y[i] = x[i] - log(sum(exp(x)))
+ * Computed in a numerically stable manner.
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_log_softmax_func(float * x, const size_t size) {
+
+    float xmax = x[0];
+    for (size_t i=1; i < size; ++i) {
+        if (x[i] > xmax) {
+            xmax = x[i];
+        }
+    }
+    float sum = 0.0f;
+    for (size_t i=0; i < size; ++i) {
+        sum += expf(x[i] - xmax);
+    }
+    float log_sum = logf(sum);
+    for (size_t i=0; i < size; ++i) {
+        x[i] = (x[i] - xmax) - log_sum;
+    }
+}
+k2c_activationType * k2c_log_softmax = k2c_log_softmax_func;
+
+
+/**
+ * Leaky ReLU activation function (default negative_slope=0.2).
+ *   y = x                if x >= 0
+ *   y = 0.2 * x          if x <  0
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_leaky_relu_func(float * x, const size_t size) {
+
+    for (size_t i=0; i < size; ++i) {
+        if (x[i] < 0.0f) {
+            x[i] = 0.2f * x[i];
+        }
+    }
+}
+k2c_activationType * k2c_leaky_relu = k2c_leaky_relu_func;
+
+
+/**
+ * CELU (Continuously Differentiable ELU) activation function (default alpha=1.0).
+ *   y = max(0, x) + min(0, alpha * (exp(x/alpha) - 1))
+ *   With alpha=1.0: same as ELU with alpha=1.0.
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_celu_func(float * x, const size_t size) {
+
+    for (size_t i=0; i < size; ++i) {
+        if (x[i] < 0.0f) {
+            x[i] = expm1f(x[i]);
+        }
+    }
+}
+k2c_activationType * k2c_celu = k2c_celu_func;
+
+
+/**
+ * Hard tanh activation function.
+ *   y = clip(x, -1, 1)
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_hard_tanh_func(float * x, const size_t size) {
+
+    for (size_t i=0; i < size; ++i) {
+        if (x[i] < -1.0f) {
+            x[i] = -1.0f;
+        }
+        else if (x[i] > 1.0f) {
+            x[i] = 1.0f;
+        }
+    }
+}
+k2c_activationType * k2c_hard_tanh = k2c_hard_tanh_func;
+
+
+/**
+ * Hard shrink activation function (default lambda=0.5).
+ *   y = x    if |x| > 0.5
+ *   y = 0    otherwise
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_hard_shrink_func(float * x, const size_t size) {
+
+    for (size_t i=0; i < size; ++i) {
+        if (x[i] > -0.5f && x[i] < 0.5f) {
+            x[i] = 0.0f;
+        }
+    }
+}
+k2c_activationType * k2c_hard_shrink = k2c_hard_shrink_func;
+
+
+/**
+ * Soft shrink activation function (default lambda=0.5).
+ *   y = x - 0.5   if x >  0.5
+ *   y = x + 0.5   if x < -0.5
+ *   y = 0         otherwise
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_soft_shrink_func(float * x, const size_t size) {
+
+    for (size_t i=0; i < size; ++i) {
+        if (x[i] > 0.5f) {
+            x[i] = x[i] - 0.5f;
+        }
+        else if (x[i] < -0.5f) {
+            x[i] = x[i] + 0.5f;
+        }
+        else {
+            x[i] = 0.0f;
+        }
+    }
+}
+k2c_activationType * k2c_soft_shrink = k2c_soft_shrink_func;
+
+
+/**
+ * Squareplus activation function (default b=4).
+ *   y = (x + sqrt(x^2 + b)) / 2
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_squareplus_func(float * x, const size_t size) {
+
+    for (size_t i=0; i < size; ++i) {
+        x[i] = 0.5f * (x[i] + sqrtf(x[i] * x[i] + 4.0f));
+    }
+}
+k2c_activationType * k2c_squareplus = k2c_squareplus_func;
+
+
+/**
+ * Sparse plus activation function.
+ *   y = 0                              if x <= -sqrt(e)
+ *   y = (x + sqrt(e))^2 / (4*sqrt(e)) if -sqrt(e) < x < sqrt(e)
+ *   y = x                              if x >= sqrt(e)
+ *
+ * :param x: array of input values. Gets overwritten by output.
+ * :param size: length of input array.
+ */
+void k2c_sparse_plus_func(float * x, const size_t size) {
+
+    const float sqrte = 1.6487212707f;
+    const float inv_4sqrte = 1.0f / (4.0f * sqrte);
+    for (size_t i=0; i < size; ++i) {
+        if (x[i] <= -sqrte) {
+            x[i] = 0.0f;
         }
-        else if (x[i] < theta) {
-            x[i] = alpha*(x[i] - theta);
+        else if (x[i] < sqrte) {
+            float t = x[i] + sqrte;
+            x[i] = t * t * inv_4sqrte;
         }
     }
 }
+k2c_activationType * k2c_sparse_plus = k2c_sparse_plus_func;
diff --git a/include/k2c_conv_transpose_layer.c b/include/k2c_conv_transpose_layer.c
new file mode 100644
index 0000000..f206889
--- /dev/null
+++ b/include/k2c_conv_transpose_layer.c
@@ -0,0 +1,73 @@
+#include <string.h>
+#include "k2c_include.h"
+
+/**
+ * 1D (temporal) Convolution.
+ * Assumes a "channels last" structure.
+ *
+ * :param output: output tensor.
+ * :param input: input tensor.
+ * :param kernel: kernel tensor.
+ * :param bias: bias tensor.
+ * :param stride: stride length of the convolution.
+ * :param dilation: dilation rate to use for dilated convolution.
+ * :param activation: activation function to apply to output.
+ */
+void k2c_conv1d_transpose(k2c_tensor *output, const k2c_tensor *input,
+                          const k2c_tensor *kernel, const k2c_tensor *bias,
+                          const size_t stride, const size_t start_crop,
+                          k2c_activationType *activation)
+{
+    memset(output->array, 0, output->numel * sizeof(output->array[0]));
+
+    const size_t n_height = input->shape[0];
+    const size_t n_channels = input->shape[1];
+    const size_t k_size = kernel->shape[0];
+    const size_t n_filters = kernel->shape[1];
+    const size_t out_height = output->shape[0];
+
+    const size_t ker_dim12 = n_channels * n_filters;
+
+    size_t cs = 0;
+    size_t ce = 0;
+    size_t ts = 0;
+    size_t ks = 0;
+
+    for (size_t f = 0; f < n_filters; ++f)
+    {
+        for (size_t ch = 0; ch < n_channels; ++ch)
+        {
+            for (size_t t = 0; t < n_height; ++t)
+            {
+                ts = t * stride;
+                if (ts > start_crop)
+                {
+                    cs = ts - start_crop;
+                }
+                else
+                {
+                    cs = 0;
+                }
+                if (ts + k_size - start_crop > out_height)
+                {
+                    ce = out_height;
+                }
+                else
+                {
+                    ce = ts + k_size - start_crop;
+                }
+                ks = cs - (ts - start_crop);
+                for (size_t i = 0; i < ce - cs; ++i)
+                {
+                    output->array[(i + cs) * n_filters + f] +=
+                        kernel->array[(i + ks) * ker_dim12 + f * n_channels + ch] *
+                        input->array[t * n_channels + ch];
+                }
+            }
+        }
+    }
+    // }
+
+    k2c_bias_add(output, bias);
+    activation(output->array, output->numel);
+}
diff --git a/include/k2c_convolution_layers.c b/include/k2c_convolution_layers.c
index d44cc8d..5b76ce3 100644
--- a/include/k2c_convolution_layers.c
+++ b/include/k2c_convolution_layers.c
@@ -6,13 +6,11 @@ Licensed under MIT License
 https://github.com/f0uriest/keras2c
  */
 
-
 #include <math.h>
 #include <stdio.h>
 #include <string.h>
 #include "k2c_include.h"
 
-
 /**
  * 1D (temporal) Padding.
  *
@@ -21,30 +19,33 @@ Licensed under MIT License
  * :param fill: value to fill in padded areas.
  * :param pad: array[2] of how many rows to pad. Order is {before dim 1, after dim 1}.
  */
-void k2c_pad1d(k2c_tensor* output, const k2c_tensor* input, const float fill,
-               const size_t * pad) {
+void k2c_pad1d(k2c_tensor *output, const k2c_tensor *input, const float fill,
+               const size_t *pad)
+{
 
     const size_t in_width = input->shape[1];
     const size_t pad_top = pad[0];
 
     // set output array to fill value
-    if (fabs(fill) < 1e-6) {
+    if (fabs(fill) < 1e-6)
+    {
         // fill is ~zero, use memset
-        memset(output->array,0,output->numel*sizeof(output->array[0]));
+        memset(output->array, 0, output->numel * sizeof(output->array[0]));
     }
-    else {
-        for(size_t i=0; i<output->numel; ++i) {
+    else
+    {
+        for (size_t i = 0; i < output->numel; ++i)
+        {
             output->array[i] = fill;
         }
     }
 
     // memcpy the old array in the right place
-    const size_t offset = pad_top*in_width;
-    memcpy(&output->array[offset],&input->array[0],
-           input->numel*sizeof(input->array[0]));
+    const size_t offset = pad_top * in_width;
+    memcpy(&output->array[offset], &input->array[0],
+           input->numel * sizeof(input->array[0]));
 }
 
-
 /**
  * 2D (spatial) Padding.
  *
@@ -53,8 +54,9 @@ void k2c_pad1d(k2c_tensor* output, const k2c_tensor* input, const float fill,
  * :param fill: value to fill in padded areas.
  * :param pad: array[4] of how many rows/cols to pad. Order is {before dim 1, after dim 1, before dim 2, after dim 2}.
  */
-void k2c_pad2d(k2c_tensor* output, const k2c_tensor* input, const float fill,
-               const size_t * pad) {
+void k2c_pad2d(k2c_tensor *output, const k2c_tensor *input, const float fill,
+               const size_t *pad)
+{
 
     const size_t in_height = input->shape[0];
     const size_t in_width = input->shape[1];
@@ -64,29 +66,32 @@ void k2c_pad2d(k2c_tensor* output, const k2c_tensor* input, const float fill,
     const size_t pad_right = pad[3];
 
     // set output array to fill value
-    if (fabs(fill) < 1e-6) {
+    if (fabs(fill) < 1e-6)
+    {
         // fill is ~zero, use memset
-        memset(output->array,0,output->numel*sizeof(output->array[0]));
+        memset(output->array, 0, output->numel * sizeof(output->array[0]));
     }
-    else {
-        for(size_t i=0; i<output->numel; ++i) {
+    else
+    {
+        for (size_t i = 0; i < output->numel; ++i)
+        {
             output->array[i] = fill;
         }
     }
     // memcpy the old array in the middle
-    size_t offset = in_channels*(pad_left+pad_right+in_width)*pad_top +
-                    in_channels*pad_left;
-    const size_t num = in_channels*in_width;
-    const size_t step = num+in_channels*(pad_left+pad_right);
-    for (size_t i=0; i<in_height; ++i) {
+    size_t offset = in_channels * (pad_left + pad_right + in_width) * pad_top +
+                    in_channels * pad_left;
+    const size_t num = in_channels * in_width;
+    const size_t step = num + in_channels * (pad_left + pad_right);
+    for (size_t i = 0; i < in_height; ++i)
+    {
         memcpy(&output->array[offset],
-               &input->array[i*num],
-               num*sizeof(input->array[0]));
+               &input->array[i * num],
+               num * sizeof(input->array[0]));
         offset += step;
     }
 }
 
-
 /**
  * 3D (spatial or spatio-temporal) Padding.
  *
@@ -95,8 +100,9 @@ void k2c_pad2d(k2c_tensor* output, const k2c_tensor* input, const float fill,
  * :param fill: value to fill in padded areas.
  * :param pad: array[6] of how many rows/cols to pad. Order is {before dim 1, after dim 1, before dim 2, after dim 2, before dim 3, after dim 3}.
  */
-void k2c_pad3d(k2c_tensor* output, const k2c_tensor* input, const float fill,
-               const size_t * pad) {
+void k2c_pad3d(k2c_tensor *output, const k2c_tensor *input, const float fill,
+               const size_t *pad)
+{
 
     const size_t dim1 = input->shape[0];
     const size_t dim2 = input->shape[1];
@@ -107,33 +113,37 @@ void k2c_pad3d(k2c_tensor* output, const k2c_tensor* input, const float fill,
     const size_t in_channels = input->shape[3];
 
     // set output array to fill value
-    if (fabs(fill) < 1e-6) {
+    if (fabs(fill) < 1e-6)
+    {
         // fill is ~zero, use memset
-        memset(output->array,0,output->numel*sizeof(output->array[0]));
+        memset(output->array, 0, output->numel * sizeof(output->array[0]));
     }
-    else {
-        for(size_t i=0; i<output->numel; ++i) {
+    else
+    {
+        for (size_t i = 0; i < output->numel; ++i)
+        {
             output->array[i] = fill;
         }
     }
     // memcpy the old array in the middle
-    const size_t offset1 = in_channels*(outdim2*outdim3)*pad[0] + in_channels*outdim3*pad[2] + in_channels*pad[4];
-    const size_t num = in_channels*dim3;
-    const size_t outstep2 = num+in_channels*(pad[4]+pad[5]);
-    const size_t outstep1 = outdim2*outdim3*in_channels;
-    const size_t instep1 = dim2*dim3*in_channels;
-    const size_t instep2 = dim3*in_channels;
-
-    for (size_t i=0; i<dim1; ++i) {
-        for (size_t j=0; j<dim2; ++j) {
-            memcpy(&output->array[offset1+i*outstep1 + j*outstep2],
-                   &input->array[i*instep1+j*instep2],
-                   num*sizeof(input->array[0]));
+    const size_t offset1 = in_channels * (outdim2 * outdim3) * pad[0] + in_channels * outdim3 * pad[2] + in_channels * pad[4];
+    const size_t num = in_channels * dim3;
+    const size_t outstep2 = num + in_channels * (pad[4] + pad[5]);
+    const size_t outstep1 = outdim2 * outdim3 * in_channels;
+    const size_t instep1 = dim2 * dim3 * in_channels;
+    const size_t instep2 = dim3 * in_channels;
+
+    for (size_t i = 0; i < dim1; ++i)
+    {
+        for (size_t j = 0; j < dim2; ++j)
+        {
+            memcpy(&output->array[offset1 + i * outstep1 + j * outstep2],
+                   &input->array[i * instep1 + j * instep2],
+                   num * sizeof(input->array[0]));
         }
     }
 }
 
-
 /**
  * 1D (temporal) Convolution.
  * Assumes a "channels last" structure.
@@ -146,33 +156,37 @@ void k2c_pad3d(k2c_tensor* output, const k2c_tensor* input, const float fill,
  * :param dilation: dilation rate to use for dilated convolution.
  * :param activation: activation function to apply to output.
  */
-void k2c_conv1d(k2c_tensor* output, const k2c_tensor* input, const k2c_tensor* kernel,
-                const k2c_tensor* bias, const size_t stride, const size_t dilation,
-                k2c_activationType *activation) {
+void k2c_conv1d(k2c_tensor *output, const k2c_tensor *input, const k2c_tensor *kernel,
+                const k2c_tensor *bias, const size_t stride, const size_t dilation,
+                k2c_activationType *activation)
+{
 
-    memset(output->array,0,output->numel*sizeof(output->array[0]));
+    memset(output->array, 0, output->numel * sizeof(output->array[0]));
 
     const size_t out_times = output->shape[0];
     const size_t out_channels = output->shape[1];
     const size_t in_channels = input->shape[1];
 
-    for (size_t x0=0; x0 < out_times; ++x0) {
-        for (size_t z=0; z < kernel->shape[0]; ++z) {
-            for (size_t q=0; q < in_channels; ++q) {
-                for (size_t k=0; k < out_channels; ++k) {
-                    output->array[x0*out_channels + k] +=
-                        kernel->array[z*(kernel->shape[2]*kernel->shape[1]) +
-                                                                            q*(kernel->shape[2]) + k]*
-                        input->array[(x0*stride + dilation*z)*in_channels + q];
+    for (size_t x0 = 0; x0 < out_times; ++x0)
+    {
+        for (size_t z = 0; z < kernel->shape[0]; ++z)
+        {
+            for (size_t q = 0; q < in_channels; ++q)
+            {
+                for (size_t k = 0; k < out_channels; ++k)
+                {
+                    output->array[x0 * out_channels + k] +=
+                        kernel->array[z * (kernel->shape[2] * kernel->shape[1]) +
+                                      q * (kernel->shape[2]) + k] *
+                        input->array[(x0 * stride + dilation * z) * in_channels + q];
                 }
             }
         }
     }
-    k2c_bias_add(output,bias);
-    activation(output->array,output->numel);
+    k2c_bias_add(output, bias);
+    activation(output->array, output->numel);
 }
 
-
 /**
  * 2D (spatial) Convolution.
  * Assumes a "channels last" structure.
@@ -185,42 +199,43 @@ void k2c_conv1d(k2c_tensor* output, const k2c_tensor* input, const k2c_tensor* k
  * :param dilation: array[2] dilation rate to use for dilated convolution. Order is {dilation dim 1, dilation dim 2}.
  * :param activation: activation function to apply to output.
  */
-void k2c_conv2d(k2c_tensor* output, const k2c_tensor* input, const k2c_tensor* kernel,
-                const k2c_tensor* bias, const size_t * stride, const size_t * dilation,
-                k2c_activationType *activation) {
+void k2c_conv2d(k2c_tensor *output, const k2c_tensor *input, const k2c_tensor *kernel,
+                const k2c_tensor *bias, const size_t *stride, const size_t *dilation,
+                k2c_activationType *activation)
+{
 
-    memset(output->array,0,output->numel*sizeof(output->array[0]));
+    memset(output->array, 0, output->numel * sizeof(output->array[0]));
 
     const size_t out_rows = output->shape[0];
     const size_t out_cols = output->shape[1];
     const size_t out_channels = output->shape[2];
     const size_t in_channels = input->shape[2];
 
-    for (size_t x0=0; x0 < out_rows; ++x0) {
-        for (size_t x1=0; x1 < out_cols; ++x1) {
-            for (size_t z0=0; z0 < kernel->shape[0]; ++z0) {
-                for (size_t z1=0; z1 < kernel->shape[1]; ++z1) {
-                    for (size_t q=0; q < in_channels; ++q) {
-                        for (size_t k=0; k < out_channels; ++k) {
-                            output->array[x0*(output->shape[2]*output->shape[1])
-                                          + x1*(output->shape[2]) + k] +=
-                                              kernel->array[z0*(kernel->shape[3]*kernel->shape[2]*kernel->shape[1])
-                                                            + z1*(kernel->shape[3]*kernel->shape[2])
-                                                            + q*(kernel->shape[3]) + k]*
-                                              input->array[(x0*stride[0]
-                                                            + dilation[0]*z0)*(input->shape[2]*input->shape[1])
-                                                           + (x1*stride[1] + dilation[1]*z1)*(input->shape[2]) + q];
+    for (size_t x0 = 0; x0 < out_rows; ++x0)
+    {
+        for (size_t x1 = 0; x1 < out_cols; ++x1)
+        {
+            for (size_t z0 = 0; z0 < kernel->shape[0]; ++z0)
+            {
+                for (size_t z1 = 0; z1 < kernel->shape[1]; ++z1)
+                {
+                    for (size_t q = 0; q < in_channels; ++q)
+                    {
+                        for (size_t k = 0; k < out_channels; ++k)
+                        {
+                            output->array[x0 * (output->shape[2] * output->shape[1]) + x1 * (output->shape[2]) + k] +=
+                                kernel->array[z0 * (kernel->shape[3] * kernel->shape[2] * kernel->shape[1]) + z1 * (kernel->shape[3] * kernel->shape[2]) + q * (kernel->shape[3]) + k] *
+                                input->array[(x0 * stride[0] + dilation[0] * z0) * (input->shape[2] * input->shape[1]) + (x1 * stride[1] + dilation[1] * z1) * (input->shape[2]) + q];
                         }
                     }
                 }
             }
         }
     }
-    k2c_bias_add(output,bias);
-    activation(output->array,output->numel);
+    k2c_bias_add(output, bias);
+    activation(output->array, output->numel);
 }
 
-
 /**
  * 3D (spatial or spatio-temporal) Convolution.
  * Assumes a "channels last" structure.
@@ -233,41 +248,36 @@ void k2c_conv2d(k2c_tensor* output, const k2c_tensor* input, const k2c_tensor* k
  * :param dilation: array[3] dilation rate to use for dilated convolution. Order is {dilation dim 1, dilation dim 2, dilation dim 3}.
  * :param activation: activation function to apply to output.
  */
-void k2c_conv3d(k2c_tensor* output, const k2c_tensor* input, const k2c_tensor* kernel,
-                const k2c_tensor* bias, const size_t * stride, const size_t * dilation,
-                k2c_activationType *activation) {
+void k2c_conv3d(k2c_tensor *output, const k2c_tensor *input, const k2c_tensor *kernel,
+                const k2c_tensor *bias, const size_t *stride, const size_t *dilation,
+                k2c_activationType *activation)
+{
 
-    memset(output->array,0,output->numel*sizeof(output->array[0]));
+    memset(output->array, 0, output->numel * sizeof(output->array[0]));
     const size_t dim1 = output->shape[0];
     const size_t dim2 = output->shape[1];
     const size_t dim3 = output->shape[2];
     const size_t out_channels = output->shape[3];
     const size_t in_channels = input->shape[3];
 
-    for (size_t x0=0; x0 < dim1; ++x0) {
-        for (size_t x1=0; x1 < dim2; ++x1) {
-            for (size_t x2=0; x2<dim3; ++x2) {
-                for (size_t z0=0; z0 < kernel->shape[0]; ++z0) {
-                    for (size_t z1=0; z1 < kernel->shape[1]; ++z1) {
-                        for (size_t z2=0; z2 < kernel->shape[2]; ++z2) {
-                            for (size_t q=0; q < in_channels; ++q) {
-                                for (size_t k=0; k < out_channels; ++k) {
-                                    output->array[x0*(output->shape[3]*output->shape[2]
-                                                      *output->shape[1])
-                                                  + x1*(output->shape[3]*output->shape[2])
-                                                  + x2*(output->shape[3]) + k] +=
-                                                      kernel->array[z0*(kernel->shape[4]*kernel->shape[3]
-                                                                        *kernel->shape[2]*kernel->shape[1])
-                                                                    + z1*(kernel->shape[4]*kernel->shape[3]
-                                                                          *kernel->shape[2])
-                                                                    + z2*(kernel->shape[4]*kernel->shape[3])
-                                                                    + q*(kernel->shape[4]) + k]
-                                                      *input->array[(x0*stride[0] + dilation[0]*z0)
-                                                                    *(input->shape[3]*input->shape[2]*input->shape[1])
-                                                                    + (x1*stride[1] + dilation[1]*z1)
-                                                                    *(input->shape[3]*input->shape[2])
-                                                                    + (x2*stride[2] + dilation[2]*z2)
-                                                                    *(input->shape[3]) + q];
+    for (size_t x0 = 0; x0 < dim1; ++x0)
+    {
+        for (size_t x1 = 0; x1 < dim2; ++x1)
+        {
+            for (size_t x2 = 0; x2 < dim3; ++x2)
+            {
+                for (size_t z0 = 0; z0 < kernel->shape[0]; ++z0)
+                {
+                    for (size_t z1 = 0; z1 < kernel->shape[1]; ++z1)
+                    {
+                        for (size_t z2 = 0; z2 < kernel->shape[2]; ++z2)
+                        {
+                            for (size_t q = 0; q < in_channels; ++q)
+                            {
+                                for (size_t k = 0; k < out_channels; ++k)
+                                {
+                                    output->array[x0 * (output->shape[3] * output->shape[2] * output->shape[1]) + x1 * (output->shape[3] * output->shape[2]) + x2 * (output->shape[3]) + k] +=
+                                        kernel->array[z0 * (kernel->shape[4] * kernel->shape[3] * kernel->shape[2] * kernel->shape[1]) + z1 * (kernel->shape[4] * kernel->shape[3] * kernel->shape[2]) + z2 * (kernel->shape[4] * kernel->shape[3]) + q * (kernel->shape[4]) + k] * input->array[(x0 * stride[0] + dilation[0] * z0) * (input->shape[3] * input->shape[2] * input->shape[1]) + (x1 * stride[1] + dilation[1] * z1) * (input->shape[3] * input->shape[2]) + (x2 * stride[2] + dilation[2] * z2) * (input->shape[3]) + q];
                                 }
                             }
                         }
@@ -276,11 +286,10 @@ void k2c_conv3d(k2c_tensor* output, const k2c_tensor* input, const k2c_tensor* k
             }
         }
     }
-    k2c_bias_add(output,bias);
-    activation(output->array,output->numel);
+    k2c_bias_add(output, bias);
+    activation(output->array, output->numel);
 }
 
-
 /**
  * 1D (temporal) Cropping.
  *
@@ -288,14 +297,14 @@ void k2c_conv3d(k2c_tensor* output, const k2c_tensor* input, const k2c_tensor* k
  * :param input: tensor to crop.
  * :param pad: array[2] of how many rows to crop. Order is {before dim 1, after dim 1}.
  */
-void k2c_crop1d(k2c_tensor* output, const k2c_tensor* input, const size_t * crop) {
+void k2c_crop1d(k2c_tensor *output, const k2c_tensor *input, const size_t *crop)
+{
 
-    const size_t offset = crop[0]*input->shape[1];
-    memcpy(&output->array[0],&input->array[offset],
-           output->numel*sizeof(output->array[0]));
+    const size_t offset = crop[0] * input->shape[1];
+    memcpy(&output->array[0], &input->array[offset],
+           output->numel * sizeof(output->array[0]));
 }
 
-
 /**
  * 2D (spatial) Cropping.
  *
@@ -303,7 +312,8 @@ void k2c_crop1d(k2c_tensor* output, const k2c_tensor* input, const size_t * crop
  * :param input: tensor to crop.
  * :param pad: array[4] of how many rows/cols to crop. Order is {before dim 1, after dim 1, before dim 2, after dim 2}.
  */
-void k2c_crop2d(k2c_tensor* output, const k2c_tensor* input, const size_t * crop) {
+void k2c_crop2d(k2c_tensor *output, const k2c_tensor *input, const size_t *crop)
+{
 
     const size_t out_height = output->shape[0];
     const size_t in_width = input->shape[1];
@@ -312,15 +322,15 @@ void k2c_crop2d(k2c_tensor* output, const k2c_tensor* input, const size_t * crop
     const size_t crop_left = crop[2];
     const size_t crop_right = crop[3];
 
-    size_t offset = in_channels*in_width*crop_top + in_channels*crop_left;
-    const size_t num = in_channels*(in_width-crop_left-crop_right);
-    for (size_t i=0; i<out_height; ++i) {
-        memcpy(&output->array[i*num],&input->array[offset],num*sizeof(input->array[0]));
-        offset += in_width*in_channels;
+    size_t offset = in_channels * in_width * crop_top + in_channels * crop_left;
+    const size_t num = in_channels * (in_width - crop_left - crop_right);
+    for (size_t i = 0; i < out_height; ++i)
+    {
+        memcpy(&output->array[i * num], &input->array[offset], num * sizeof(input->array[0]));
+        offset += in_width * in_channels;
     }
 }
 
-
 /**
  * 3D (spatial or spatio-temporal) Cropping.
  *
@@ -328,7 +338,8 @@ void k2c_crop2d(k2c_tensor* output, const k2c_tensor* input, const size_t * crop
  * :param input: tensor to crop.
  * :param pad: array[6] of how many rows/cols to crop. Order is {before dim 1, after dim 1, before dim 2, after dim 2, before dim 3, after dim 3}.
  */
-void k2c_crop3d(k2c_tensor* output, const k2c_tensor* input, const size_t * crop) {
+void k2c_crop3d(k2c_tensor *output, const k2c_tensor *input, const size_t *crop)
+{
 
     const size_t dim1 = input->shape[0];
     const size_t dim2 = input->shape[1];
@@ -338,24 +349,25 @@ void k2c_crop3d(k2c_tensor* output, const k2c_tensor* input, const size_t * crop
     const size_t outdim3 = dim3 - crop[4] - crop[5];
     const size_t in_channels = input->shape[3];
 
-    const size_t offset1 = in_channels*(dim2*dim3)*crop[0] +
-                           in_channels*dim3*crop[2] + in_channels*crop[4];
-    const size_t num = in_channels*outdim3;
-    const size_t instep2 = num+in_channels*(crop[4]+crop[5]);
-    const size_t instep1 = dim2*dim3*in_channels;
-    const size_t outstep1 = outdim2*outdim3*in_channels;
-    const size_t outstep2 = outdim3*in_channels;
-
-    for (size_t i=0; i<outdim1; ++i) {
-        for (size_t j=0; j<outdim2; ++j) {
-            memcpy(&output->array[i*outstep1 + j*outstep2],
-                   &input->array[offset1+i*instep1+j*instep2],
-                   num*sizeof(input->array[0]));
+    const size_t offset1 = in_channels * (dim2 * dim3) * crop[0] +
+                           in_channels * dim3 * crop[2] + in_channels * crop[4];
+    const size_t num = in_channels * outdim3;
+    const size_t instep2 = num + in_channels * (crop[4] + crop[5]);
+    const size_t instep1 = dim2 * dim3 * in_channels;
+    const size_t outstep1 = outdim2 * outdim3 * in_channels;
+    const size_t outstep2 = outdim3 * in_channels;
+
+    for (size_t i = 0; i < outdim1; ++i)
+    {
+        for (size_t j = 0; j < outdim2; ++j)
+        {
+            memcpy(&output->array[i * outstep1 + j * outstep2],
+                   &input->array[offset1 + i * instep1 + j * instep2],
+                   num * sizeof(input->array[0]));
         }
     }
 }
 
-
 /**
  * 1D (temporal) Upsampling.
  * Repeats each temporal step size times along the time axis.
@@ -364,21 +376,24 @@ void k2c_crop3d(k2c_tensor* output, const k2c_tensor* input, const size_t * crop
  * :param input: input tensor.
  * :param size: Upsampling factor.
  */
-void k2c_upsampling1d(k2c_tensor* output, const k2c_tensor* input, const size_t size) {
+void k2c_upsampling1d(k2c_tensor *output, const k2c_tensor *input, const size_t size)
+{
 
     const size_t in_height = input->shape[0];
     const size_t in_width = input->shape[1];
 
-    for (size_t i=0; i<in_height; ++i) {
-        for (size_t j=0; j<size; ++j) {
-            for (size_t k=0; k<in_width; ++k) {
-                output->array[(size*i+j)*in_width + k] = input->array[i*in_width+k];
+    for (size_t i = 0; i < in_height; ++i)
+    {
+        for (size_t j = 0; j < size; ++j)
+        {
+            for (size_t k = 0; k < in_width; ++k)
+            {
+                output->array[(size * i + j) * in_width + k] = input->array[i * in_width + k];
             }
         }
     }
 }
 
-
 /**
  * 2D (spatial) Upsampling.
  * Repeats the rows and columns of the data by size[0] and size[1] respectively.
@@ -387,24 +402,26 @@ void k2c_upsampling1d(k2c_tensor* output, const k2c_tensor* input, const size_t
  * :param input: input tensor.
  * :param size: array[2] of upsampling factors. Order is {upsampling dim 1, upsampling dim 2}.
  */
-void k2c_upsampling2d(k2c_tensor* output, const k2c_tensor* input, const size_t * size) {
+void k2c_upsampling2d(k2c_tensor *output, const k2c_tensor *input, const size_t *size)
+{
 
     const size_t out_height = output->shape[0];
     const size_t out_width = output->shape[1];
     const size_t channels = input->shape[2];
 
-    for (size_t i=0; i<out_height; ++i) {
-        for (size_t j=0; j<out_width; ++j) {
-            const size_t insub[K2C_MAX_NDIM] = {i/size[0],j/size[1],0};
-            const size_t outsub[K2C_MAX_NDIM] = {i,j,0};
-            memcpy(&output->array[k2c_sub2idx(outsub,output->shape,output->ndim)],
-                   &input->array[k2c_sub2idx(insub,input->shape,input->ndim)],
-                   channels*sizeof(input->array[0]));
+    for (size_t i = 0; i < out_height; ++i)
+    {
+        for (size_t j = 0; j < out_width; ++j)
+        {
+            const size_t insub[K2C_MAX_NDIM] = {i / size[0], j / size[1], 0};
+            const size_t outsub[K2C_MAX_NDIM] = {i, j, 0};
+            memcpy(&output->array[k2c_sub2idx(outsub, output->shape, output->ndim)],
+                   &input->array[k2c_sub2idx(insub, input->shape, input->ndim)],
+                   channels * sizeof(input->array[0]));
         }
     }
 }
 
-
 /**
  * 2D (spatial) Upsampling.
  * Repeats the 1st, 2nd and 3rd dimensions of the data by size[0], size[1] and size[2] respectively.
@@ -413,21 +430,25 @@ void k2c_upsampling2d(k2c_tensor* output, const k2c_tensor* input, const size_t
  * :param input: input tensor.
  * :param size: array[3] of upsampling factors. Order is {upsampling dim 1, upsampling dim 2, upsampling dim 3}.
  */
-void k2c_upsampling3d(k2c_tensor* output, const k2c_tensor* input, const size_t * size) {
+void k2c_upsampling3d(k2c_tensor *output, const k2c_tensor *input, const size_t *size)
+{
 
     const size_t dim1 = output->shape[0];
     const size_t dim2 = output->shape[1];
     const size_t dim3 = output->shape[2];
     const size_t channels = input->shape[3];
 
-    for (size_t i=0; i<dim1; ++i) {
-        for (size_t j=0; j<dim2; ++j) {
-            for (size_t k=0; k<dim3; ++k) {
-                const size_t insub[K2C_MAX_NDIM] = {i/size[0],j/size[1],k/size[2],0};
-                const size_t outsub[K2C_MAX_NDIM] = {i,j,k,0};
-                memcpy(&output->array[k2c_sub2idx(outsub,output->shape,output->ndim)],
-                       &input->array[k2c_sub2idx(insub,input->shape,input->ndim)],
-                       channels*sizeof(input->array[0]));
+    for (size_t i = 0; i < dim1; ++i)
+    {
+        for (size_t j = 0; j < dim2; ++j)
+        {
+            for (size_t k = 0; k < dim3; ++k)
+            {
+                const size_t insub[K2C_MAX_NDIM] = {i / size[0], j / size[1], k / size[2], 0};
+                const size_t outsub[K2C_MAX_NDIM] = {i, j, k, 0};
+                memcpy(&output->array[k2c_sub2idx(outsub, output->shape, output->ndim)],
+                       &input->array[k2c_sub2idx(insub, input->shape, input->ndim)],
+                       channels * sizeof(input->array[0]));
             }
         }
     }
diff --git a/include/k2c_helper_functions.c b/include/k2c_helper_functions.c
index 05e0042..7d836d4 100644
--- a/include/k2c_helper_functions.c
+++ b/include/k2c_helper_functions.c
@@ -25,7 +25,8 @@ Licensed under MIT License
  * :param outcols: number of cols of C and B.
  * :param innderdim: number of cols of A and rows of B
  */
-void k2c_matmul(float * C, const float * A, const float * B, const size_t outrows,
+__attribute__((hot))
+void k2c_matmul(float * __restrict C, const float * __restrict A, const float * __restrict B, const size_t outrows,
                 const size_t outcols, const size_t innerdim) {
 
     // make sure output is empty
@@ -57,20 +58,28 @@ void k2c_matmul(float * C, const float * A, const float * B, const size_t outrow
  * :param outcols: number of cols of C, B and d.
  * :param innderdim: number of cols of A and rows of B
  */
-void k2c_affine_matmul(float * C, const float * A, const float * B, const float * d,
+__attribute__((hot))
+void k2c_affine_matmul(float * __restrict C, const float * __restrict A, const float * __restrict B, const float * __restrict d,
                        const size_t outrows,const size_t outcols, const size_t innerdim) {
 
-    // make sure output is empty
-    memset(C, 0, outrows*outcols*sizeof(C[0]));
+    // Initialize C with bias vector (fuses memset + bias addition)
+    for (size_t i = 0; i < outrows; ++i) {
+        const size_t outrowidx = i * outcols;
+        for (size_t j = 0; j < outcols; ++j) {
+            C[outrowidx + j] = d[j];
+        }
+    }
 
+    // matmul in i->k->j order for cache-friendly sequential access to B and C
     for (size_t i = 0 ; i < outrows; ++i) {
         const size_t outrowidx = i*outcols;
         const size_t inneridx = i*innerdim;
-        for (size_t j = 0;  j < outcols; ++j) {
-            for (size_t k = 0; k < innerdim; ++k) {
-                C[outrowidx+j] += A[inneridx+k] * B[k*outcols+j];
+        for (size_t k = 0; k < innerdim; ++k) {
+            const float Aik = A[inneridx+k];
+            const size_t Bidx = k*outcols;
+            for (size_t j = 0;  j < outcols; ++j) {
+                C[outrowidx+j] += Aik * B[Bidx+j];
             }
-            C[outrowidx+j] += d[j];
         }
     }
 }
@@ -129,8 +138,9 @@ void k2c_idx2sub(const size_t idx, size_t * sub, const size_t * shape, const siz
  * :param normalize: (0,1) whether to L2-normalize samples along the dot product axis before taking the dot product. If set to 1, then the output of the dot product is the cosine proximity between the two samples.
  * :param fwork: array of working space, size(fwork) = size(A) + size(B)
  */
-void k2c_dot(k2c_tensor* C, const k2c_tensor* A, const k2c_tensor* B, const size_t * axesA,
-             const size_t * axesB, const size_t naxes, const int normalize, float * fwork) {
+__attribute__((hot))
+void k2c_dot(k2c_tensor* __restrict C, const k2c_tensor* __restrict A, const k2c_tensor* __restrict B, const size_t * __restrict axesA,
+             const size_t * __restrict axesB, const size_t naxes, const int normalize, float * __restrict fwork) {
 
     size_t permA[K2C_MAX_NDIM];
     size_t permB[K2C_MAX_NDIM];
@@ -268,7 +278,7 @@ void k2c_dot(k2c_tensor* C, const k2c_tensor* A, const k2c_tensor* B, const size
  * :param A: input tensor. Overwritten with outputs.
  * :param b: bias tensor.
  */
-void k2c_bias_add(k2c_tensor* A, const k2c_tensor* b) {
+void k2c_bias_add(k2c_tensor* __restrict A, const k2c_tensor* __restrict b) {
 
     for (size_t i=0; i<A->numel; i+=b->numel) {
         for (size_t j=0; j<b->numel; ++j) {
@@ -286,7 +296,7 @@ void k2c_bias_add(k2c_tensor* A, const k2c_tensor* b) {
  * :param axis: axis along which to flip
  */
 
-void k2c_flip(k2c_tensor *A, const size_t axis) {
+void k2c_flip(k2c_tensor * __restrict A, const size_t axis) {
     const size_t ndim = A->ndim;
     const size_t * shape = A->shape;
     const size_t numel = A->numel;
diff --git a/include/k2c_include.h b/include/k2c_include.h
index 24347af..4ff069b 100644
--- a/include/k2c_include.h
+++ b/include/k2c_include.h
@@ -2,8 +2,10 @@
 k2c_include.h
 This file is part of keras2c
 Copyright 2020 Rory Conlin
-Licensed under MIT License
+Licensed under LGPLv3 License
 https://github.com/f0uriest/keras2c
+
+Modified by Anchal Gupta
  */
 
 #pragma once
@@ -11,137 +13,184 @@ Licensed under MIT License
 #include <stdlib.h>
 #include "k2c_tensor_include.h"
 
+/* File-scope scratch buffers in generated code are mutable shared state.
+ * If two threads call the inference function concurrently they would
+ * trample each other. K2C_THREAD_LOCAL gives each thread its own copy.
+ * GCC and Clang support __thread under -std=c99; for unknown compilers
+ * we fall back to nothing and warn so the user can decide.
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#define K2C_THREAD_LOCAL __thread
+#else
+#warning "K2C_THREAD_LOCAL: unknown compiler; generated scratch buffers are not thread-local"
+#define K2C_THREAD_LOCAL
+#endif
 
 // Activations
-void k2c_linear_func(float * x, const size_t size);
-void k2c_exponential_func(float * x, const size_t size);
-void k2c_relu_func(float * x, const size_t size);
-void k2c_hard_sigmoid_func(float * x, const size_t size);
-void k2c_tanh_func(float * x, const size_t size);
-void k2c_sigmoid_func(float * x, const size_t size);
-void k2c_softmax_func(float * x, const size_t size);
-void k2c_softplus_func(float * x, const size_t size);
-void k2c_softsign_func(float * x, const size_t size);
-typedef void k2c_activationType(float * x, const size_t size);
-extern k2c_activationType * k2c_linear;
-extern k2c_activationType * k2c_exponential;
-extern k2c_activationType * k2c_relu;
-extern k2c_activationType * k2c_hard_sigmoid;
-extern k2c_activationType * k2c_tanh;
-extern k2c_activationType * k2c_sigmoid;
-extern k2c_activationType * k2c_softmax;
-extern k2c_activationType * k2c_softplus;
-extern k2c_activationType * k2c_softsign;
+void k2c_linear_func(float *x, const size_t size);
+void k2c_exponential_func(float *x, const size_t size);
+void k2c_relu_func(float *x, const size_t size);
+void k2c_hard_sigmoid_func(float *x, const size_t size);
+void k2c_tanh_func(float *x, const size_t size);
+void k2c_sigmoid_func(float *x, const size_t size);
+void k2c_swish_func(float *x, const size_t size);
+void k2c_softmax_func(float *x, const size_t size);
+void k2c_softplus_func(float *x, const size_t size);
+void k2c_softsign_func(float *x, const size_t size);
+void k2c_selu_func(float *x, const size_t size);
+void k2c_elu_func(float *x, const size_t size);
+void k2c_gelu_func(float *x, const size_t size);
+void k2c_hard_silu_func(float *x, const size_t size);
+void k2c_mish_func(float *x, const size_t size);
+void k2c_relu6_func(float *x, const size_t size);
+void k2c_log_softmax_func(float *x, const size_t size);
+void k2c_leaky_relu_func(float *x, const size_t size);
+void k2c_celu_func(float *x, const size_t size);
+void k2c_hard_tanh_func(float *x, const size_t size);
+void k2c_hard_shrink_func(float *x, const size_t size);
+void k2c_soft_shrink_func(float *x, const size_t size);
+void k2c_squareplus_func(float *x, const size_t size);
+void k2c_sparse_plus_func(float *x, const size_t size);
+typedef void k2c_activationType(float *x, const size_t size);
+extern k2c_activationType *k2c_linear;
+extern k2c_activationType *k2c_exponential;
+extern k2c_activationType *k2c_relu;
+extern k2c_activationType *k2c_swish;
+extern k2c_activationType *k2c_hard_sigmoid;
+extern k2c_activationType *k2c_tanh;
+extern k2c_activationType *k2c_sigmoid;
+extern k2c_activationType *k2c_softmax;
+extern k2c_activationType *k2c_softplus;
+extern k2c_activationType *k2c_softsign;
+extern k2c_activationType *k2c_selu;
+extern k2c_activationType *k2c_elu;
+extern k2c_activationType *k2c_gelu;
+extern k2c_activationType *k2c_hard_silu;
+extern k2c_activationType *k2c_mish;
+extern k2c_activationType *k2c_relu6;
+extern k2c_activationType *k2c_log_softmax;
+extern k2c_activationType *k2c_leaky_relu;
+extern k2c_activationType *k2c_celu;
+extern k2c_activationType *k2c_hard_tanh;
+extern k2c_activationType *k2c_hard_shrink;
+extern k2c_activationType *k2c_soft_shrink;
+extern k2c_activationType *k2c_squareplus;
+extern k2c_activationType *k2c_sparse_plus;
 
 // Advanced Activations
-void k2c_LeakyReLU(float * x, const size_t size, const float alpha);
-void k2c_PReLU(float * x, const size_t size, const float * alpha);
-void k2c_ELU(float * x, const size_t size, const float alpha);
-void k2c_ThresholdedReLU(float * x, const size_t size, const float theta);
-void k2c_ReLU(float * x, const size_t size, const float max_value, const float negative_slope,
+void k2c_LeakyReLU(float *x, const size_t size, const float alpha);
+void k2c_PReLU(float *x, const size_t size, const float *alpha);
+void k2c_ELU(float *x, const size_t size, const float alpha);
+void k2c_ThresholdedReLU(float *x, const size_t size, const float theta);
+void k2c_ReLU(float *x, const size_t size, const float max_value, const float negative_slope,
               const float threshold);
 
 // Convolutions
-void k2c_pad1d(k2c_tensor* output, const k2c_tensor* input, const float fill,
-               const size_t * pad);
-void k2c_pad2d(k2c_tensor* output, const k2c_tensor* input, const float fill,
-               const size_t * pad);
-void k2c_pad3d(k2c_tensor* output, const k2c_tensor* input, const float fill,
-               const size_t * pad);
-void k2c_conv1d(k2c_tensor* output, const k2c_tensor* input, const k2c_tensor* kernel,
-                const k2c_tensor* bias, const size_t stride, const size_t dilation,
+void k2c_pad1d(k2c_tensor *output, const k2c_tensor *input, const float fill,
+               const size_t *pad);
+void k2c_pad2d(k2c_tensor *output, const k2c_tensor *input, const float fill,
+               const size_t *pad);
+void k2c_pad3d(k2c_tensor *output, const k2c_tensor *input, const float fill,
+               const size_t *pad);
+void k2c_conv1d(k2c_tensor *output, const k2c_tensor *input, const k2c_tensor *kernel,
+                const k2c_tensor *bias, const size_t stride, const size_t dilation,
                 k2c_activationType *activation);
-void k2c_conv2d(k2c_tensor* output, const k2c_tensor* input, const k2c_tensor* kernel,
-                const k2c_tensor* bias, const size_t * stride, const size_t * dilation,
+void k2c_conv2d(k2c_tensor *output, const k2c_tensor *input, const k2c_tensor *kernel,
+                const k2c_tensor *bias, const size_t *stride, const size_t *dilation,
                 k2c_activationType *activation);
-void k2c_conv3d(k2c_tensor* output, const k2c_tensor* input, const k2c_tensor* kernel,
-                const k2c_tensor* bias, const size_t * stride, const size_t * dilation,
+void k2c_conv3d(k2c_tensor *output, const k2c_tensor *input, const k2c_tensor *kernel,
+                const k2c_tensor *bias, const size_t *stride, const size_t *dilation,
                 k2c_activationType *activation);
-void k2c_crop1d(k2c_tensor* output, const k2c_tensor* input, const size_t * crop);
-void k2c_crop2d(k2c_tensor* output, const k2c_tensor* input, const size_t * crop);
-void k2c_crop3d(k2c_tensor* output, const k2c_tensor* input, const size_t * crop);
-void k2c_upsampling1d(k2c_tensor* output, const k2c_tensor* input, const size_t size);
-void k2c_upsampling2d(k2c_tensor* output, const k2c_tensor* input, const size_t * size);
-void k2c_upsampling3d(k2c_tensor* output, const k2c_tensor* input, const size_t * size);
+void k2c_crop1d(k2c_tensor *output, const k2c_tensor *input, const size_t *crop);
+void k2c_crop2d(k2c_tensor *output, const k2c_tensor *input, const size_t *crop);
+void k2c_crop3d(k2c_tensor *output, const k2c_tensor *input, const size_t *crop);
+void k2c_upsampling1d(k2c_tensor *output, const k2c_tensor *input, const size_t size);
+void k2c_upsampling2d(k2c_tensor *output, const k2c_tensor *input, const size_t *size);
+void k2c_upsampling3d(k2c_tensor *output, const k2c_tensor *input, const size_t *size);
+void k2c_conv1d_transpose(k2c_tensor *output, const k2c_tensor *input, const k2c_tensor *kernel,
+                          const k2c_tensor *bias, const size_t stride, const size_t start_crop,
+                          k2c_activationType *activation);
 
 // Core Layers
-void k2c_dense(k2c_tensor* output, const k2c_tensor* input, const k2c_tensor* kernel,
-               const k2c_tensor* bias, k2c_activationType *activation, float * fwork);
-void k2c_flatten(k2c_tensor *output, const k2c_tensor* input);
-void k2c_reshape(k2c_tensor *output, const k2c_tensor* input, const size_t * newshp,
+void k2c_dense(k2c_tensor *output, const k2c_tensor *input, const k2c_tensor *kernel,
+               const k2c_tensor *bias, k2c_activationType *activation, float *fwork);
+void k2c_flatten(k2c_tensor *output, const k2c_tensor *input);
+void k2c_reshape(k2c_tensor *output, const k2c_tensor *input, const size_t *newshp,
                  const size_t newndim);
-void k2c_permute_dims(k2c_tensor* output, const k2c_tensor* input,
-                      const size_t * permute);
-void k2c_repeat_vector(k2c_tensor* output, const k2c_tensor* input, const size_t n);
+void k2c_permute_dims(k2c_tensor *output, const k2c_tensor *input,
+                      const size_t *permute);
+void k2c_repeat_vector(k2c_tensor *output, const k2c_tensor *input, const size_t n);
 
 // Embedding
-void k2c_embedding(k2c_tensor* outputs, const k2c_tensor* inputs, const k2c_tensor* kernel);
+void k2c_embedding(k2c_tensor *outputs, const k2c_tensor *inputs, const k2c_tensor *kernel);
 
 // Helper functions
-void k2c_matmul(float * C, const float * A, const float * B, const size_t outrows,
+void k2c_matmul(float *C, const float *A, const float *B, const size_t outrows,
                 const size_t outcols, const size_t innerdim);
-void k2c_affine_matmul(float * C, const float * A, const float * B, const float * d,
-                       const size_t outrows,const size_t outcols, const size_t innerdim);
-size_t k2c_sub2idx(const size_t * sub, const size_t * shape, const size_t ndim);
-void k2c_idx2sub(const size_t idx, size_t * sub, const size_t * shape, const size_t ndim);
-void k2c_dot(k2c_tensor* C, const k2c_tensor* A, const k2c_tensor* B, const size_t * axesA,
-             const size_t * axesB, const size_t naxes, const int normalize, float * fwork);
-void k2c_bias_add(k2c_tensor* A, const k2c_tensor* b);
+void k2c_affine_matmul(float *C, const float *A, const float *B, const float *d,
+                       const size_t outrows, const size_t outcols, const size_t innerdim);
+size_t k2c_sub2idx(const size_t *sub, const size_t *shape, const size_t ndim);
+void k2c_idx2sub(const size_t idx, size_t *sub, const size_t *shape, const size_t ndim);
+void k2c_dot(k2c_tensor *C, const k2c_tensor *A, const k2c_tensor *B, const size_t *axesA,
+             const size_t *axesB, const size_t naxes, const int normalize, float *fwork);
+void k2c_bias_add(k2c_tensor *A, const k2c_tensor *b);
 void k2c_flip(k2c_tensor *A, const size_t axis);
-float* k2c_read_array(const char* filename, const size_t array_size);
+float *k2c_read_array(const char *filename, const size_t array_size);
 
 // Merge layers
-void k2c_add(k2c_tensor* output, const size_t num_tensors,...);
-void k2c_subtract(k2c_tensor* output, const size_t num_tensors,
-                  const k2c_tensor* tensor1, const k2c_tensor* tensor2);
-void k2c_multiply(k2c_tensor* output, const size_t num_tensors,...);
-void k2c_average(k2c_tensor* output, const size_t num_tensors,...);
-void k2c_max(k2c_tensor* output, const size_t num_tensors,...);
-void k2c_min(k2c_tensor* output, const size_t num_tensors,...);
-void k2c_concatenate(k2c_tensor* output, const size_t axis, const size_t num_tensors,...);
+void k2c_add(k2c_tensor *output, const size_t num_tensors, ...);
+void k2c_subtract(k2c_tensor *output, const size_t num_tensors,
+                  const k2c_tensor *tensor1, const k2c_tensor *tensor2);
+void k2c_multiply(k2c_tensor *output, const size_t num_tensors, ...);
+void k2c_average(k2c_tensor *output, const size_t num_tensors, ...);
+void k2c_max(k2c_tensor *output, const size_t num_tensors, ...);
+void k2c_min(k2c_tensor *output, const size_t num_tensors, ...);
+void k2c_concatenate(k2c_tensor *output, const size_t axis, const size_t num_tensors, ...);
+
+// Split layers
+void k2c_split(k2c_tensor *output, k2c_tensor *input, size_t offset);
 
 // Normalization layers
-void k2c_batch_norm(k2c_tensor* outputs, const k2c_tensor* inputs, const k2c_tensor* mean,
-                    const k2c_tensor* stdev, const k2c_tensor* gamma, const k2c_tensor* beta,
+void k2c_batch_norm(k2c_tensor *outputs, const k2c_tensor *inputs, const k2c_tensor *mean,
+                    const k2c_tensor *stdev, const k2c_tensor *gamma, const k2c_tensor *beta,
                     const size_t axis);
 
 // Pooling layers
-void k2c_global_max_pooling(k2c_tensor* output, const k2c_tensor* input);
-void k2c_global_avg_pooling(k2c_tensor* output, const k2c_tensor* input);
-void k2c_maxpool1d(k2c_tensor* output, const k2c_tensor* input, const size_t pool_size,
+void k2c_global_max_pooling(k2c_tensor *output, const k2c_tensor *input);
+void k2c_global_avg_pooling(k2c_tensor *output, const k2c_tensor *input);
+void k2c_maxpool1d(k2c_tensor *output, const k2c_tensor *input, const size_t pool_size,
                    const size_t stride);
-void k2c_maxpool2d(k2c_tensor* output, const k2c_tensor* input, const size_t * pool_size,
-                   const size_t * stride);
-void k2c_avgpool1d(k2c_tensor* output, const k2c_tensor* input, const size_t pool_size,
+void k2c_maxpool2d(k2c_tensor *output, const k2c_tensor *input, const size_t *pool_size,
+                   const size_t *stride);
+void k2c_avgpool1d(k2c_tensor *output, const k2c_tensor *input, const size_t pool_size,
                    const size_t stride);
-void k2c_avgpool2d(k2c_tensor* output, const k2c_tensor* input, const size_t * pool_size,
-                   const size_t * stride);
+void k2c_avgpool2d(k2c_tensor *output, const k2c_tensor *input, const size_t *pool_size,
+                   const size_t *stride);
 
 // Recurrent layers
-void k2c_lstmcell(float * state, const float * input, const k2c_tensor* kernel,
-                  const k2c_tensor* recurrent_kernel, const k2c_tensor* bias, float * fwork,
+void k2c_lstmcell(float *state, const float *input, const k2c_tensor *kernel,
+                  const k2c_tensor *recurrent_kernel, const k2c_tensor *bias, float *fwork,
                   k2c_activationType *recurrent_activation,
                   k2c_activationType *output_activation);
-void k2c_lstm(k2c_tensor* output, const k2c_tensor* input, float * state,
-              const k2c_tensor* kernel, const k2c_tensor* recurrent_kernel,
-              const k2c_tensor* bias, float * fwork, const int go_backwards,
+void k2c_lstm(k2c_tensor *output, const k2c_tensor *input, float *state,
+              const k2c_tensor *kernel, const k2c_tensor *recurrent_kernel,
+              const k2c_tensor *bias, float *fwork, const int go_backwards,
               const int return_sequences, k2c_activationType *recurrent_activation,
               k2c_activationType *output_activation);
-void k2c_simpleRNNcell(float * state, const float * input, const k2c_tensor* kernel,
-                       const k2c_tensor* recurrent_kernel, const k2c_tensor* bias,
-                       float * fwork, k2c_activationType *output_activation);
-void k2c_simpleRNN(k2c_tensor* output, const k2c_tensor* input, float * state,
-                   const k2c_tensor* kernel, const k2c_tensor* recurrent_kernel,
-                   const k2c_tensor* bias, float * fwork, const int go_backwards,
+void k2c_simpleRNNcell(float *state, const float *input, const k2c_tensor *kernel,
+                       const k2c_tensor *recurrent_kernel, const k2c_tensor *bias,
+                       float *fwork, k2c_activationType *output_activation);
+void k2c_simpleRNN(k2c_tensor *output, const k2c_tensor *input, float *state,
+                   const k2c_tensor *kernel, const k2c_tensor *recurrent_kernel,
+                   const k2c_tensor *bias, float *fwork, const int go_backwards,
                    const int return_sequences, k2c_activationType *output_activation);
-void k2c_grucell(float * state, const float * input, const k2c_tensor* kernel,
-                 const k2c_tensor* recurrent_kernel, const k2c_tensor* bias, float * fwork,
+void k2c_grucell(float *state, const float *input, const k2c_tensor *kernel,
+                 const k2c_tensor *recurrent_kernel, const k2c_tensor *bias, float *fwork,
                  const int reset_after, k2c_activationType *recurrent_activation,
                  k2c_activationType *output_activation);
-void k2c_gru(k2c_tensor* output, const k2c_tensor* input, float * state,
-             const k2c_tensor* kernel, const k2c_tensor* recurrent_kernel,
-             const k2c_tensor* bias, float * fwork, const int reset_after,
+void k2c_gru(k2c_tensor *output, const k2c_tensor *input, float *state,
+             const k2c_tensor *kernel, const k2c_tensor *recurrent_kernel,
+             const k2c_tensor *bias, float *fwork, const int reset_after,
              const int go_backwards, const int return_sequences,
              k2c_activationType *recurrent_activation,
              k2c_activationType *output_activation);
-
diff --git a/include/k2c_normalization_layers.c b/include/k2c_normalization_layers.c
index 5fefe1d..7da0b02 100644
--- a/include/k2c_normalization_layers.c
+++ b/include/k2c_normalization_layers.c
@@ -33,12 +33,19 @@ void k2c_batch_norm(k2c_tensor* outputs, const k2c_tensor* inputs, const k2c_ten
         offset *= inputs->shape[i];
     }
     const size_t step = inputs->shape[axis];
+    const size_t numel = inputs->numel;
+    const size_t block = step * offset;
 
-    for (size_t i=0; i<inputs->numel; ++i) {
-        size_t idx = (i/offset)%step;
-        outputs->array[i] = (inputs->array[i] - mean->array[idx]) /
-                            stdev->array[idx] *
-                            gamma->array[idx] +
-                            beta->array[idx];
+    // Precompute: out = in * (gamma/stdev) + (beta - mean*gamma/stdev)
+    // Reduces 4 ops per element to 2 (multiply + add), eliminates integer division
+    for (size_t base = 0; base < numel; base += block) {
+        for (size_t idx = 0; idx < step; ++idx) {
+            const float scale = gamma->array[idx] / stdev->array[idx];
+            const float bias = beta->array[idx] - mean->array[idx] * scale;
+            const size_t start = base + idx * offset;
+            for (size_t j = 0; j < offset; ++j) {
+                outputs->array[start + j] = inputs->array[start + j] * scale + bias;
+            }
+        }
     }
 }
diff --git a/include/k2c_pooling_layers.c b/include/k2c_pooling_layers.c
index 4b81164..2daf441 100644
--- a/include/k2c_pooling_layers.c
+++ b/include/k2c_pooling_layers.c
@@ -134,12 +134,14 @@ void k2c_avgpool1d(k2c_tensor* output, const k2c_tensor* input, const size_t poo
         for (size_t j=0, k=0; j<output->numel; j+=channels, k+=stride*channels) {
             int count = 0;
             for (size_t l=0; l<pool_size*channels; l+=channels) {
-                if (input->array[k+i+l] > -HUGE_VALF) {
+                if (input->array[k+i+l] > -3.4e+38f) {
                     output->array[j+i] += input->array[k+i+l];
                     ++count;
                 }
             }
-            output->array[i+j] /= (float)count;
+            if (count > 0) {
+                output->array[i+j] /= (float)count;
+            }
         }
     }
 }
@@ -168,13 +170,15 @@ void k2c_avgpool2d(k2c_tensor* output, const k2c_tensor* input, const size_t * p
                 for (size_t n=0; n<pool_size[1]*channels; n+=channels) {
                     for (size_t p=0; p<pool_size[0]*channels*input->shape[1];
                             p+=channels*input->shape[1]) {
-                        if (-HUGE_VALF < input->array[m+k+i+n+p]) {
+                        if (input->array[m+k+i+n+p] > -3.4e+38f) {
                             output->array[l+j+i] += input->array[m+k+i+n+p];
                             ++count;
                         }
                     }
                 }
-                output->array[l+j+i] /= (float)count;
+                if (count > 0) {
+                    output->array[l+j+i] /= (float)count;
+                }
             }
         }
     }
diff --git a/include/k2c_split_layers.c b/include/k2c_split_layers.c
new file mode 100644
index 0000000..de98380
--- /dev/null
+++ b/include/k2c_split_layers.c
@@ -0,0 +1,11 @@
+ #include <string.h>
+#include "k2c_include.h"
+
+/*
+Split input tensor into one output tensor
+*/
+
+void k2c_split(k2c_tensor *output, k2c_tensor *input, size_t offset)
+{
+    memcpy(&output->array[0], &input->array[offset], output->numel * sizeof(output->array[0]));
+}
diff --git a/include/makefile b/include/makefile
index 82e9cd5..d285ff0 100644
--- a/include/makefile
+++ b/include/makefile
@@ -19,13 +19,15 @@ endif
 OBJ = \
 	k2c_activations.o \
 	k2c_convolution_layers.o \
+	k2c_conv_transpose_layer.o \
 	k2c_core_layers.o \
 	k2c_embedding_layers.o \
 	k2c_helper_functions.o \
 	k2c_merge_layers.o \
 	k2c_normalization_layers.o \
 	k2c_pooling_layers.o \
-	k2c_recurrent_layers.o
+	k2c_recurrent_layers.o \
+	k2c_split_layers.o \
 
 DEPS = \
 	k2c_include.h \
diff --git a/keras2c/check_model.py b/keras2c/check_model.py
index 3845bc8..d7535bf 100644
--- a/keras2c/check_model.py
+++ b/keras2c/check_model.py
@@ -1,7 +1,7 @@
 """check_model.py
 This file is part of keras2c
 Copyright 2020 Rory Conlin
-Licensed under MIT License
+Licensed under LGPLv3 License
 https://github.com/f0uriest/keras2c
 
 Checks a model before conversion to flag unsupported features
@@ -9,11 +9,10 @@
 
 # imports
 import numpy as np
-from keras2c.io_parsing import layer_type, flatten
+from keras2c.io_parsing import layer_type, flatten, get_model_layers
 from keras2c.weights2c import Weights2C
 from keras2c.layer2c import Layers2C
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
+
 
 __author__ = "Rory Conlin"
 __copyright__ = "Copyright 2020, Rory Conlin"
@@ -53,8 +52,8 @@ def name_check(model):
 
     valid = True
     log = ''
-    for layer in model.layers:
-        if not is_valid_c_name(layer.name):
+    for layer in get_model_layers(model):
+        if not is_valid_c_name(layer.name.replace('.', '_')):
             valid = False
             log += "layer name '" + layer.name + "' is not a valid C name. \n"
     return valid, log
@@ -86,7 +85,7 @@ def check_layer(layer):
 
     valid = True
     log = ''
-    for layer in model.layers:
+    for layer in get_model_layers(model):
         flag, templog = check_layer(layer)
         valid = valid and flag
         log += templog
@@ -105,8 +104,12 @@ def activation_supported_check(model):
     """
 
     supported_activations = ['linear', 'relu', 'softmax', 'softplus',
-                             'softsign', 'relu', 'tanh', 'sigmoid',
-                             'hard_sigmoid', 'exponential']
+                             'softsign', 'relu', 'tanh', 'sigmoid', 'swish',
+                             'silu', 'hard_sigmoid', 'exponential',
+                             'selu', 'elu', 'gelu', 'hard_silu', 'mish',
+                             'relu6', 'log_softmax', 'leaky_relu', 'celu',
+                             'hard_tanh', 'hard_shrink', 'soft_shrink',
+                             'squareplus', 'sparse_plus']
 
     def check_layer(layer):
         valid = True
@@ -133,7 +136,7 @@ def check_layer(layer):
 
     valid = True
     log = ''
-    for layer in model.layers:
+    for layer in get_model_layers(model):
         flag, templog = check_layer(layer)
         valid = valid and flag
         log += templog
@@ -176,11 +179,19 @@ def check_layer(layer):
                    "' is not supported at this time. \n"
         if config.get('shared_axes'):
             valid = False
-            log += "shared axes option for layer '" + layer.name + \
-                   "' is not supported at this time. \n"
-        if layer_type(layer) in ['Add', 'Subtract', 'Multiply', 'Average',
-                                 'Maximum', 'Minimum']:
-            inshps = layer.input_shape
+            log += (
+                "shared axes option for layer '"
+                + layer.name
+                + "' is not supported at this time. \n"
+            )
+        if layer_type(layer) in ["Add", "Subtract", "Multiply", "Average", "Maximum",
+                                 "Minimum",]:
+            inshps = []
+            # Collect shapes from all inbound nodes
+            for node in getattr(layer, "_inbound_nodes", []):
+                for tensor in node.input_tensors:
+                    inshps.append(tensor.shape)
+
             insize = [np.prod(inp[1:]) for inp in inshps]
             if len(set(insize)) > 1:
                 valid = False
@@ -196,7 +207,7 @@ def check_layer(layer):
 
     valid = True
     log = ''
-    for layer in model.layers:
+    for layer in get_model_layers(model):
         flag, templog = check_layer(layer)
         valid = valid and flag
         log += templog
diff --git a/keras2c/io_parsing.py b/keras2c/io_parsing.py
index c7defb2..62d5624 100644
--- a/keras2c/io_parsing.py
+++ b/keras2c/io_parsing.py
@@ -1,18 +1,22 @@
 """io_parsing.py
 This file is part of keras2c
 Copyright 2020 Rory Conlin
-Licensed under MIT License
+Licensed under LGPLv3 License
 https://github.com/f0uriest/keras2c
 
 Helper functions to get input and output names for each layer etc.
 """
 
-__author__ = "Rory Conlin"
-__copyright__ = "Copyright 2020, Rory Conlin"
-__license__ = "MIT"
-__maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
-__email__ = "wconlin@princeton.edu"
+# Original author
+# __author__ = "Rory Conlin"
+# __copyright__ = "Copyright 2020, Rory Conlin"
+# __license__ = "MIT"
+# __maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
+# __email__ = "wconlin@princeton.edu"
 
+# Modified by
+__author__ = "Anchal Gupta"
+__email__ = "guptaa@fusion.gat.com"
 
 def layer_type(layer):
     """Gets the type of a layer
@@ -27,6 +31,78 @@ def layer_type(layer):
     return layer.__class__.__name__
 
 
+def get_model_layers(model):
+    """Gets all layers/operations in the model that need code generation.
+
+    In Keras 3, some operations (like Split) appear in model._operations
+    but not in model.layers. This function returns a combined list.
+
+    Args:
+        model (keras Model): model to parse
+
+    Returns:
+        layers (list): list of all layers/operations
+    """
+    layers = list(model.layers)
+    seen_names = {l.name for l in layers}
+    if hasattr(model, '_operations'):
+        for op in model._operations:
+            if op.name not in seen_names:
+                layers.append(op)
+                seen_names.add(op.name)
+    return layers
+
+
+def get_real_tensor_names(model):
+    """Gets the set of tensor names that are part of the real model graph.
+
+    Traces backward from model outputs through inbound nodes, collecting all
+    tensor names that are reachable. This filters out internal sub-layer
+    tensors (e.g., from Bidirectional's internal forward/backward calls).
+
+    Args:
+        model (keras Model): model to parse
+
+    Returns:
+        real_names (set): set of tensor names in the real model graph
+    """
+    visited = set()
+    queue = []
+    for t in model.outputs:
+        queue.append(t)
+    for t in model.inputs:
+        queue.append(t)
+
+    all_layers = get_model_layers(model)
+
+    while queue:
+        t = queue.pop(0)
+        tname = parse_io_name(t.name)
+        if tname in visited:
+            continue
+        visited.add(tname)
+        for layer in all_layers:
+            for node in getattr(layer, '_inbound_nodes', []):
+                out_t = getattr(node, 'output_tensors', None)
+                if out_t is None:
+                    continue
+                if not isinstance(out_t, (list, tuple)):
+                    out_t = [out_t]
+                matched = False
+                for ot in out_t:
+                    if parse_io_name(ot.name) == tname:
+                        matched = True
+                        break
+                if matched:
+                    inp_t = node.input_tensors
+                    if inp_t is not None:
+                        if not isinstance(inp_t, (list, tuple)):
+                            inp_t = [inp_t]
+                        for it in inp_t:
+                            queue.append(it)
+    return visited
+
+
 def get_all_io_names(model):
     """Gets names of all  node names in the model
 
@@ -37,9 +113,21 @@ def get_all_io_names(model):
         io (list): names of all the nodes in the model
     """
 
-    a = [get_layer_io_names(layer) for layer in model.layers]
+    valid = get_real_tensor_names(model)
+    a = [get_layer_io_names(layer, valid) for layer in get_model_layers(model)]
     return list(set(flatten(a)))
 
+def parse_io_name(name):
+    name = name.replace('.', '_')
+    skip_start = name.find('/')
+    skip_end = name.rfind(':')
+    out_str = name
+    if skip_start != -1:
+        out_str = name[:skip_start]
+    if skip_end != -1:
+        out_str += '_' + name[skip_end+1:]
+    out_str = out_str.replace(':', '_').replace('/', '_')
+    return out_str
 
 def get_layer_num_io(layer):
     """Gets the number of inputs and outputs for a layer
@@ -52,72 +140,84 @@ def get_layer_num_io(layer):
         num_outputs (int): number of output nodes from the layer
     """
 
-    num_inputs = 0
-    error = False
-    while not error:
-        try:
-            layer.get_input_at(num_inputs)
-            num_inputs += 1
-        except ValueError:
-            error = True
-
-    num_outputs = 0
-    error = False
-    while not error:
-        try:
-            layer.get_output_at(num_outputs)
-            num_outputs += 1
-        except ValueError:
-            error = True
+    if hasattr(layer, "inputs"):
+        if isinstance(layer.inputs, list):
+            num_inputs = len(layer.inputs)
+        else:
+            num_inputs = 1
+    else:
+        # fallback: count inbound nodes
+        num_inputs = len(getattr(layer, "_inbound_nodes", []))
+
+    # If outputs attribute is present, count actual tensor outputs
+    if hasattr(layer, "outputs"):
+        outs = layer.outputs
+        if isinstance(outs, list):
+            num_outputs = len(outs)
+        else:
+            num_outputs = 1
+    else:
+        # Fallback: count graph nodes that produce outputs
+        num_outputs = len(getattr(layer, "_inbound_nodes", []))
     return num_inputs, num_outputs
 
 
-def get_layer_io_names(layer):
+def get_layer_io_names(layer, valid_tensors=None):
     """Gets the names of the inputs and outputs of a layer
 
     Args:
         layer (keras Layer): layer you want to parse
+        valid_tensors (set, optional): if provided, only include nodes whose
+            output tensors are in this set. Used to filter out internal
+            sub-layer nodes (e.g., from Bidirectional wrappers).
 
     Returns:
         inputs (list): names of all the input nodes to the layer
         outputs (list): names of all the output nodes from the layer
     """
 
-    num_inputs, num_outputs = get_layer_num_io(layer)
+    num_nodes = len(getattr(layer, "_inbound_nodes", []))
+
     inputs = []
-    # num_inputs>1 -> shared layer
-    for i in range(num_inputs):
+    outputs = []
+
+    for node_index in range(num_nodes):
+        node = layer._inbound_nodes[node_index]
         # is the input a list?
-        if isinstance(layer.get_input_at(i), list):
-            temp_list = []
-            list_length = len(layer.get_input_at(i))
-            for j in range(list_length):
-                name = layer.get_input_at(i)[j].name.split(':')[
-                    0].split('/')[0]
-                temp_list.append(name)
-            inputs.insert(i, temp_list)
+        node_inputs = node.input_tensors
+        if node_inputs is None:
+            node_inp = []
         else:
-            name = layer.get_input_at(i).name.split(':')[0].split('/')[0]
-            inputs.insert(i, name)
+            if isinstance(node_inputs, (list, tuple)):
+                if len(node_inputs) == 1:
+                    node_inp = parse_io_name(node_inputs[0].name)
+                else:
+                    node_inp = [parse_io_name(t.name) for t in node_inputs]
+            else:
+                # single tensor
+                node_inp = parse_io_name(node_inputs.name)
 
-    outputs = []
-    for i in range(num_outputs):
-        # is the output a list?
-        if isinstance(layer.get_output_at(i), list):
-            temp_list = []
-            list_length = len(layer.get_output_at(i))
-            for j in range(list_length):
-                name = layer.get_output_at(i)[j].name.split(':')[
-                    0].split('/')[0]
-                temp_list.append(name)
-            outputs.insert(i, temp_list)
+        node_outputs = getattr(node, "output_tensors", None)
+        if node_outputs is None:
+            node_out = []
         else:
-            name = layer.get_output_at(i).name
-            if 'bidirectional' in name.lower():
-                name = name.split('/')[-2]
+            if isinstance(node_outputs, (list, tuple)):
+                if len(node_outputs) == 1:
+                    node_out = parse_io_name(node_outputs[0].name)
+                else:
+                    node_out = [parse_io_name(t.name) for t in node_outputs]
             else:
-                name = name.split('/')[0]
-            outputs.insert(i, name)
+                node_out = parse_io_name(node_outputs.name)
+
+        # Filter: if valid_tensors provided, only include nodes whose outputs
+        # are in the valid set (filters out internal sub-layer nodes)
+        if valid_tensors is not None:
+            flat_out = flatten([node_out]) if node_out else []
+            if not any(o in valid_tensors for o in flat_out):
+                continue
+
+        inputs.append(node_inp)
+        outputs.append(node_out)
 
     return inputs, outputs
 
@@ -138,10 +238,10 @@ def get_model_io_names(model):
     inputs = []
     outputs = []
     for i in range(num_inputs):
-        nm = model.inputs[i].name.split(':')[0].split('/')[0]
+        nm = parse_io_name(model.inputs[i].name)
         inputs.append(nm)
     for i in range(num_outputs):
-        nm = model.outputs[i].name.split(':')[0].split('/')[0]
+        nm = parse_io_name(model.outputs[i].name)
         outputs.append(nm)
     return inputs, outputs
 
diff --git a/keras2c/keras2c_main.py b/keras2c/keras2c_main.py
index ace0245..775d3b4 100644
--- a/keras2c/keras2c_main.py
+++ b/keras2c/keras2c_main.py
@@ -1,7 +1,7 @@
 """keras2c_main.py
 This file is part of keras2c
 Copyright 2020 Rory Conlin
-Licensed under MIT License
+Licensed under LGPLv3 License
 https://github.com/f0uriest/keras2c
 
 Converts keras model to C code
@@ -11,27 +11,131 @@
 from keras2c.layer2c import Layers2C
 from keras2c.weights2c import Weights2C
 from keras2c.io_parsing import layer_type, get_all_io_names, get_layer_io_names, \
-    get_model_io_names, flatten
+    get_model_io_names, flatten, get_model_layers, get_real_tensor_names
 from keras2c.check_model import check_model
 from keras2c.make_test_suite import make_test_suite
 import numpy as np
 import subprocess
-import tensorflow.keras as keras
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
-
-
-__author__ = "Rory Conlin"
-__copyright__ = "Copyright 2020, Rory Conlin"
-__license__ = "MIT"
-__maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
-__email__ = "wconlin@princeton.edu"
-
-
-def model2c(model, function_name, malloc=False, verbose=True):
+import keras
+
+
+# Original author
+# __author__ = "Rory Conlin"
+# __copyright__ = "Copyright 2020, Rory Conlin"
+# __license__ = "MIT"
+# __maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
+# __email__ = "wconlin@princeton.edu"
+
+# Modified by
+__author__ = "Anchal Gupta"
+__email__ = "guptaa@fusion.gat.com"
+
+
+def fold_batch_norms(model, verbose=True):
+    """Folds BatchNormalization layers into following Dense/Conv1D layers.
+    Modifies model weights in-place. Returns set of folded layer names."""
+    valid = get_real_tensor_names(model)
+    consumers = {}
+    for layer in get_model_layers(model):
+        inputs, _ = get_layer_io_names(layer, valid)
+        if isinstance(inputs, list):
+            for inp in inputs:
+                if isinstance(inp, list):
+                    for ii in inp:
+                        consumers.setdefault(ii, []).append(layer)
+                elif isinstance(inp, str):
+                    consumers.setdefault(inp, []).append(layer)
+    folded_layers = set()
+    for bn_layer in get_model_layers(model):
+        if layer_type(bn_layer) != 'BatchNormalization':
+            continue
+        _, outputs = get_layer_io_names(bn_layer, valid)
+        out_name = outputs[0] if isinstance(outputs, list) else outputs
+        if isinstance(out_name, list):
+            out_name = out_name[0]
+        next_layers = consumers.get(out_name, [])
+        if len(next_layers) != 1:
+            continue
+        next_layer = next_layers[0]
+        next_type = layer_type(next_layer)
+        if next_type not in ('Dense', 'Conv1D'):
+            continue
+        cfg = bn_layer.get_config()
+        center = cfg['center']
+        scale = cfg['scale']
+        epsilon = cfg['epsilon']
+        if center and scale:
+            gamma, beta, mean, variance = bn_layer.get_weights()
+        elif center:
+            beta, mean, variance = bn_layer.get_weights()
+            gamma = np.ones_like(mean)
+        elif scale:
+            gamma, mean, variance = bn_layer.get_weights()
+            beta = np.zeros_like(mean)
+        else:
+            mean, variance = bn_layer.get_weights()
+            gamma = np.ones_like(mean)
+            beta = np.zeros_like(mean)
+        stdev = np.sqrt(variance + epsilon)
+        bn_scale = gamma / stdev
+        bn_offset = beta - mean * bn_scale
+        if next_type == 'Dense':
+            weights = next_layer.get_weights()
+            W = weights[0]
+            b = weights[1] if len(weights) > 1 else np.zeros(W.shape[1])
+            W_new = W * bn_scale[:, np.newaxis]
+            b_new = bn_offset @ W + b
+            if len(weights) > 1:
+                next_layer.set_weights([W_new, b_new])
+            else:
+                if np.any(np.abs(b_new) > 1e-10):
+                    continue
+                next_layer.set_weights([W_new])
+        elif next_type == 'Conv1D':
+            cfg_conv = next_layer.get_config()
+            padding = cfg_conv.get('padding', 'valid')
+            kernel_size = cfg_conv.get('kernel_size', 1)
+            if isinstance(kernel_size, (list, tuple)):
+                kernel_size = kernel_size[0]
+            # The folded bias adds bn_offset's contribution across the full
+            # kernel window. With 'same'/'causal' padding the conv pads edges
+            # with zeros, so edge outputs would receive too much bn_offset.
+            # Only safe when padding is 'valid', when kernel_size is 1 (no real
+            # padding), or when bn_offset is effectively zero.
+            bn_offset_zero = np.max(np.abs(bn_offset)) < 1e-10
+            if padding != 'valid' and kernel_size != 1 and not bn_offset_zero:
+                continue
+            weights = next_layer.get_weights()
+            K = weights[0]
+            b = weights[1] if len(weights) > 1 else np.zeros(K.shape[-1])
+            K_new = K * bn_scale[np.newaxis, :, np.newaxis]
+            b_new = np.tensordot(bn_offset, K, axes=([0], [1])).sum(axis=0) + b
+            if len(weights) > 1:
+                next_layer.set_weights([K_new, b_new])
+            else:
+                if np.any(np.abs(b_new) > 1e-10):
+                    continue
+                next_layer.set_weights([K_new])
+        identity_var = np.ones_like(variance) * (1.0 - epsilon)
+        if center and scale:
+            bn_layer.set_weights([np.ones_like(gamma), np.zeros_like(beta),
+                                  np.zeros_like(mean), identity_var])
+        elif center:
+            bn_layer.set_weights([np.zeros_like(beta), np.zeros_like(mean), identity_var])
+        elif scale:
+            bn_layer.set_weights([np.ones_like(gamma), np.zeros_like(mean), identity_var])
+        else:
+            bn_layer.set_weights([np.zeros_like(mean), identity_var])
+        folded_layers.add(bn_layer.name)
+        if verbose:
+            print(f'Folded {bn_layer.name} into {next_layer.name}')
+    return folded_layers
+
+
+def model2c(model, function_name, malloc=False, verbose=True, skip_layers=None):
     """Generates C code for model
 
-    Writes main function definition to "function_name.c" and a public header 
+    Writes main function definition to "function_name.c" and a public header
     with declarations to "function_name.h"
 
     Args:
@@ -45,6 +149,9 @@ def model2c(model, function_name, malloc=False, verbose=True):
         stateful (bool): whether the model must maintain state between calls
     """
 
+    if skip_layers is None:
+        skip_layers = set()
+
     model_inputs, model_outputs = get_model_io_names(model)
     includes = '#include <math.h> \n '
     includes += '#include <string.h> \n'
@@ -54,18 +161,18 @@ def model2c(model, function_name, malloc=False, verbose=True):
 
     if verbose:
         print('Gathering Weights')
-    stack_vars, malloc_vars, static_vars = Weights2C(
-        model, function_name, malloc).write_weights(verbose)
+    stack_vars, malloc_vars, static_vars, file_scope_vars = Weights2C(
+        model, function_name, malloc).write_weights(verbose, skip_layers)
     stateful = len(static_vars) > 0
-    layers = Layers2C(model, malloc).write_layers(verbose)
+    layers = Layers2C(model, malloc).write_layers(verbose, skip_layers)
 
     function_signature = 'void ' + function_name + '('
-    function_signature += ', '.join(['k2c_tensor* ' +
+    function_signature += ', '.join(['k2c_tensor* __restrict ' +
                                      in_nm + '_input' for in_nm in model_inputs]) + ', '
-    function_signature += ', '.join(['k2c_tensor* ' +
+    function_signature += ', '.join(['k2c_tensor* __restrict ' +
                                      out_nm + '_output' for out_nm in model_outputs])
     if len(malloc_vars.keys()):
-        function_signature += ',' + ','.join(['float* ' +
+        function_signature += ',' + ','.join(['float* __restrict ' +
                                               key for key in malloc_vars.keys()])
     function_signature += ')'
 
@@ -76,6 +183,7 @@ def model2c(model, function_name, malloc=False, verbose=True):
     with open(function_name + '.c', 'x+') as source:
         source.write(includes)
         source.write(static_vars + '\n\n')
+        source.write(file_scope_vars + '\n\n')
         source.write(function_signature)
         source.write(' { \n\n')
         source.write(stack_vars)
@@ -196,7 +304,7 @@ def k2c(model, function_name, malloc=False, num_tests=10, verbose=True):
         verbose (bool): whether to print progress
 
     Raises:
-        ValueError: if model is not instance of keras.models.Model 
+        ValueError: if model is not instance of keras.models.Model
 
     Returns:
         None
@@ -212,19 +320,28 @@ def k2c(model, function_name, malloc=False, num_tests=10, verbose=True):
                          'either be an instance of keras.models.Model, ' +
                          'or a filepath to a saved .h5 model')
 
+    # Operate on a clone so codegen-time transforms (BN folding) do not mutate
+    # the caller's model. The caller can keep using the original model after k2c.
+    work_model = keras.models.clone_model(model)
+    work_model.set_weights(model.get_weights())
+
     # check that the model can be converted
-    check_model(model, function_name)
+    check_model(work_model, function_name)
     if verbose:
         print('All checks passed')
 
+    folded = fold_batch_norms(work_model, verbose)
+    if verbose and folded:
+        print(f'Folded {len(folded)} batch normalization layers')
+
     malloc_vars, stateful = model2c(
-        model, function_name, malloc, verbose)
+        work_model, function_name, malloc, verbose, skip_layers=folded)
 
     s = 'Done \n'
     s += "C code is in '" + function_name + \
         ".c' with header file '" + function_name + ".h' \n"
     if num_tests > 0:
-        make_test_suite(model, function_name, malloc_vars,
+        make_test_suite(work_model, function_name, malloc_vars,
                         num_tests, stateful, verbose)
         s += "Tests are in '" + function_name + "_test_suite.c' \n"
     if malloc:
diff --git a/keras2c/layer2c.py b/keras2c/layer2c.py
index 79e1985..e50906e 100644
--- a/keras2c/layer2c.py
+++ b/keras2c/layer2c.py
@@ -1,23 +1,36 @@
 """layer2c.py
 This file is part of keras2c
 Copyright 2020 Rory Conlin
-Licensed under MIT License
+Licensed under LGPLv3 License
 https://github.com/f0uriest/keras2c
 
 Writes individual layers to C code
 """
 
 # imports
-from keras2c.io_parsing import layer_type, get_model_io_names, get_all_io_names, get_layer_io_names, flatten
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
+from keras2c.io_parsing import layer_type, get_model_io_names, get_all_io_names, get_layer_io_names, flatten, get_model_layers, get_real_tensor_names
 
+# Keras 3 renames some activations; map them back to our C function names
+ACTIVATION_NAME_MAP = {
+    'silu': 'swish',
+}
 
-__author__ = "Rory Conlin"
-__copyright__ = "Copyright 2020, Rory Conlin"
-__license__ = "MIT"
-__maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
-__email__ = "wconlin@princeton.edu"
+
+def _normalize_activation(name):
+    """Map Keras 3 activation names to keras2c C function names."""
+    return ACTIVATION_NAME_MAP.get(name, name)
+
+
+# Original author
+# __author__ = "Rory Conlin"
+# __copyright__ = "Copyright 2020, Rory Conlin"
+# __license__ = "MIT"
+# __maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
+# __email__ = "wconlin@princeton.edu"
+
+# Modified by
+__author__ = "Anchal Gupta"
+__email__ = "guptaa@fusion.gat.com"
 
 
 class Layers2C():
@@ -31,72 +44,146 @@ class Layers2C():
     def __init__(self, model, malloc):
         self.model = model
         self.model_inputs, self.model_outputs = get_model_io_names(self.model)
-        self.layers = ''
+        self.layers = ""
         self.malloc = malloc
+        self.valid_tensors = get_real_tensor_names(self.model)
 
-    def write_layers(self, verbose=True):
-        """Writes layers in the correct graph order.
+    def write_layers(self, verbose=True, skip_layers=None):
+        """Writes layers in the correct graph order."""
+        if skip_layers is None:
+            skip_layers = set()
+        self.skip_layers = skip_layers
+        written_io = set(self.model_inputs)
+        all_io = set(flatten(get_all_io_names(self.model)))
+        unwritten_io = all_io - written_io
 
-        Args:
-            verbose (bool): whether to print progress
+        while len(unwritten_io) > 0:
+            progress = False
+
+            for layer in get_model_layers(self.model):
+                layer_inputs, layer_outputs = get_layer_io_names(layer, self.valid_tensors)
+
+                # Check if this layer has multiple nodes (shared layer)
+                if isinstance(layer_inputs, list) and len(layer_inputs) > 0:
+                    # Process each node separately
+                    for i, (inp, outp) in enumerate(zip(layer_inputs, layer_outputs)):
+                        # Handle nested lists from merge layers
+                        if isinstance(inp, list):
+                            flat_inputs = set(flatten([inp]))
+                        else:
+                            flat_inputs = set([inp]) if inp else set()
+
+                        if isinstance(outp, list):
+                            flat_outputs = set(flatten([outp]))
+                        else:
+                            flat_outputs = set([outp]) if outp else set()
+
+                        if flat_inputs.issubset(written_io) and (
+                            flat_outputs & unwritten_io
+                        ):
+                            if verbose:
+                                print(
+                                    f"Writing layer {layer.name} node {i}: {inp} -> {outp}"
+                                )
+
+                            if layer.name in self.skip_layers:
+                                nm, pnm, inp_nm, outp_nm, is_model_in, is_model_out = \
+                                    self._format_io_names(layer, inp, outp, model_io=True)
+                                in_s = inp_nm.lstrip('&')
+                                out_s = outp_nm.lstrip('&')
+                                in_op = '->' if is_model_in else '.'
+                                out_op = '->' if is_model_out else '.'
+                                self.layers += 'memcpy(' + out_s + out_op + 'array,' + \
+                                    in_s + in_op + 'array,' + in_s + in_op + 'numel*sizeof(float)); \n'
+                            else:
+                                # dispatch to the proper layer writer
+                                method = getattr(self, "_write_layer_" + layer_type(layer))
+                                method(
+                                    layer, inp, outp, i
+                                )  # Pass node index as last parameter
+
+                            # mark inputs and outputs as written
+                            written_io |= flat_inputs
+                            written_io |= flat_outputs
+                            unwritten_io -= flat_outputs
+
+                            progress = True
+                else:
+                    # Handle layers with no nodes (shouldn't happen but just in case)
+                    if verbose:
+                        print(f"Warning: Layer {layer.name} has no nodes")
 
-        Returns:
-            layers (str): C code for calling layer functions in correct order
+            if not progress:
+                raise RuntimeError(
+                    "Could not write any layer in this iteration. "
+                    "Check for cycles or disconnected layers in the model."
+                )
 
-        """
-        written_io = set(self.model_inputs)
-        unwritten_io = set(get_all_io_names(self.model)) - written_io
-        while len(unwritten_io) > 0:
-            for layer in self.model.layers:
-                layer_inputs, layer_outputs = get_layer_io_names(layer)
-                for i, (inp, outp) in enumerate(zip(layer_inputs, layer_outputs)):
-                    if (set(flatten(inp)).issubset(written_io) and
-                            set(flatten(outp)).issubset(unwritten_io))or \
-                            layer_type(layer) == 'InputLayer':
-                        if verbose:
-                            print('Writing layer ', outp)
-                        method = getattr(
-                            self, '_write_layer_' + layer_type(layer))
-                        method(layer, inp, outp, i)
-                        written_io |= set(flatten(inp))
-                        written_io |= set(flatten(outp))
-                        unwritten_io -= set(flatten(inp))
-                        unwritten_io -= set(flatten(outp))
         return self.layers
 
     def _format_io_names(self, layer, inp, outp, model_io=False):
-        nm = layer.name
-        pnm = '&' + nm
+        """
+        Normalize and format input/output tensor names for code generation.
+
+        Works with Keras 3 (where inp/outp can be strings, lists, or nested lists).
+
+        Returns:
+            nm, pnm, inp_nm, outp_nm
+            (+ is_model_input, is_model_output if model_io=True)
+        """
+
+        def ensure_list(x):
+            """Wrap strings or non-iterables into a list."""
+            if isinstance(x, str):
+                return [x]
+            if isinstance(x, (list, tuple)):
+                return list(x)
+            return [x]
+
+        def flatten_any(x):
+            """Recursively flatten lists/tuples."""
+            if isinstance(x, (list, tuple)):
+                res = []
+                for xi in x:
+                    res.extend(flatten_any(xi))
+                return res
+            else:
+                return [x]
+
+        nm = layer.name.replace(".", "_")
+        pnm = "&" + nm
+
+        # Normalize inputs / outputs to flat lists of strings
+        in_list = flatten_any(ensure_list(inp))
+        out_list = flatten_any(ensure_list(outp))
+
+        inp_nm = []
+        outp_nm = []
         is_model_input = False
         is_model_output = False
-        if isinstance(inp, list):
-            inp_nm = []
-            for j in inp:
-                if j in self.model_inputs or 'timeslice' in j:
-                    inp_nm.append(j + '_input')
-                    is_model_input = True
-                else:
-                    inp_nm.append('&' + j + '_output')
-        else:
-            if inp in self.model_inputs or 'timeslice' in inp:
-                inp_nm = inp + '_input'
+
+        # --- Format inputs ---
+        for j in in_list:
+            if j in self.model_inputs or "timeslice" in j:
+                inp_nm.append(j + "_input")
                 is_model_input = True
             else:
-                inp_nm = '&' + inp + '_output'
-        if isinstance(outp, list):
-            outp_nm = []
-            for o in outp:
-                if o in self.model_outputs or 'timeslice' in o:
-                    outp_nm.append(o + '_output')
-                    is_model_output = True
-                else:
-                    outp_nm.append('&' + outp + '_output')
-        else:
-            if outp in self.model_outputs or 'timeslice' in outp:
-                outp_nm = outp + '_output'
+                inp_nm.append("&" + j + "_output")
+
+        # --- Format outputs ---
+        for o in out_list:
+            if o in self.model_outputs or "timeslice" in o:
+                outp_nm.append(o + "_output")
                 is_model_output = True
             else:
-                outp_nm = '&' + outp + '_output'
+                outp_nm.append("&" + o + "_output")
+
+        # Collapse back to string if only one element (for consistency with old behavior)
+        if len(inp_nm) == 1:
+            inp_nm = inp_nm[0]
+        if len(outp_nm) == 1:
+            outp_nm = outp_nm[0]
+
         if model_io:
             return nm, pnm, inp_nm, outp_nm, is_model_input, is_model_output
         else:
@@ -126,25 +213,24 @@ def _write_layer_TimeDistributed(self, layer, inputs, outputs, i):
         self.layers += '\n } \n'
 
     def _write_layer_Bidirectional(self, layer, inputs, outputs, i):
-        subname = layer.layer.name
-        method = getattr(self, '_write_layer_' + layer_type(layer.layer))
-        method(layer.forward_layer, inputs,
-               'forward_' + subname, i)
-        method(layer.backward_layer, inputs,
-               'backward_' + subname, i)
+        subname = layer.forward_layer.name
+        method = getattr(self, '_write_layer_' + layer_type(layer.forward_layer))
+        method(layer.forward_layer, inputs, subname, i)
+        subname = layer.backward_layer.name
+        method = getattr(self, '_write_layer_' + layer_type(layer.backward_layer))
+        method(layer.backward_layer, inputs, subname, i)
         mode = layer.merge_mode
-        inputs = ['forward_' + subname,
-                  'backward_' + subname]
-        if layer.layer.return_sequences:
-            self.layers += 'k2c_flip(&backward_' + subname + '_output,0); \n'
+        inputs = [layer.forward_layer.name, layer.backward_layer.name]
+        if layer.return_sequences:
+            self.layers += 'k2c_flip(&' + subname + '_output,0); \n'
         if mode == 'sum':
-            self._write_layer_Merge(layer, inputs, outputs, i, 'Add')
+            self._write_layer_Merge(layer, inputs, outputs, 0, 'Add')
         elif mode == 'mul':
-            self._write_layer_Merge(layer, inputs, outputs, i, 'Multiply')
+            self._write_layer_Merge(layer, inputs, outputs, 0, 'Multiply')
         elif mode == 'ave':
-            self._write_layer_Merge(layer, inputs, outputs, i, 'Average')
+            self._write_layer_Merge(layer, inputs, outputs, 0, 'Average')
         elif mode == 'concat':
-            self._write_layer_Concatenate(layer, inputs, outputs, i)
+            self._write_layer_Concatenate(layer, inputs, outputs, 0)
 
     def _write_layer_LSTM(self, layer, inputs, outputs, i):
         nm, pnm, inputs, outputs = self._format_io_names(
@@ -154,14 +240,14 @@ def _write_layer_LSTM(self, layer, inputs, outputs, i):
                        '_recurrent_kernel,' + pnm + '_bias,' + nm + \
                        '_fwork, \n\t' + nm + '_go_backwards,' + nm + \
                        '_return_sequences, \n\t' + \
-                       'k2c_' + layer.get_config()['recurrent_activation'] + \
+                       'k2c_' + _normalize_activation(layer.get_config()['recurrent_activation']) + \
                        ',' + 'k2c_' + \
-            layer.get_config()['activation'] + '); \n'
+            _normalize_activation(layer.get_config()['activation']) + '); \n'
 
     def _write_layer_Dense(self, layer, inputs, outputs, i):
         nm, pnm, inputs, outputs = self._format_io_names(
             layer, inputs, outputs)
-        activation = 'k2c_' + layer.get_config()['activation']
+        activation = 'k2c_' + _normalize_activation(layer.get_config()['activation'])
 
         self.layers += 'k2c_dense(' + outputs + ',' + inputs + ',' + pnm + \
             '_kernel, \n\t' + pnm + '_bias,' + activation + ',' + \
@@ -170,7 +256,7 @@ def _write_layer_Dense(self, layer, inputs, outputs, i):
     def _write_layer_Conv(self, layer, inputs, outputs, i):
         nm, pnm, inputs, outputs = self._format_io_names(
             layer, inputs, outputs)
-        activation = 'k2c_' + layer.get_config()['activation']
+        activation = 'k2c_' + _normalize_activation(layer.get_config()['activation'])
         if layer_type(layer)[-2:] == '1D':
             fname = 'k2c_conv1d('
         elif layer_type(layer)[-2:] == '2D':
@@ -186,8 +272,7 @@ def _write_layer_Conv(self, layer, inputs, outputs, i):
                                       '_padded_input', i)
             self.layers += fname + outputs + ',' + pnm + \
                 '_padded_input,' + pnm + '_kernel, \n\t' + \
-                pnm + '_bias,' + nm + '_stride,' + nm + \
-                '_dilation,' + activation + '); \n'
+                pnm + '_bias,' + nm + '_stride,' + nm + '_dilation,' + activation + '); \n'
 
     def _write_layer_Conv1D(self, layer, inputs, outputs, i):
         self._write_layer_Conv(layer, inputs, outputs, i)
@@ -198,6 +283,16 @@ def _write_layer_Conv2D(self, layer, inputs, outputs, i):
     def _write_layer_Conv3D(self, layer, inputs, outputs, i):
         self._write_layer_Conv(layer, inputs, outputs, i)
 
+    def _write_layer_Conv1DTranspose(self, layer, inputs, outputs, i):
+        nm, pnm, inputs, outputs = self._format_io_names(
+            layer, inputs, outputs)
+        activation = 'k2c_' + _normalize_activation(layer.get_config()['activation'])
+
+        # Write the conv1d_transpose layer
+        self.layers += 'k2c_conv1d_transpose(' + outputs + ',' + inputs + ',' + \
+            pnm + '_kernel, \n\t' + pnm + '_bias,' + nm + '_stride,' + \
+            nm + '_start_crop,' + activation + '); \n'
+
     def _write_layer_MaxPooling1D(self, layer, inputs, outputs, i):
         self._write_layer_Pooling(layer, inputs, outputs, i)
 
@@ -309,8 +404,8 @@ def _write_layer_GRU(self, layer, inputs, outputs, i):
             pnm + '_recurrent_kernel,' + pnm + '_bias,' + \
             nm + '_fwork, \n\t' + nm + '_reset_after,' + \
             nm + '_go_backwards,' + nm + '_return_sequences, \n\t' + \
-            'k2c_' + layer.get_config()['recurrent_activation'] + \
-            ',' + 'k2c_' + layer.get_config()['activation'] + '); \n'
+            'k2c_' + _normalize_activation(layer.get_config()['recurrent_activation']) + \
+            ',' + 'k2c_' + _normalize_activation(layer.get_config()['activation']) + '); \n'
 
     def _write_layer_SimpleRNN(self, layer, inputs, outputs, i):
         nm, pnm, inputs, outputs = self._format_io_names(
@@ -320,17 +415,28 @@ def _write_layer_SimpleRNN(self, layer, inputs, outputs, i):
             pnm + '_recurrent_kernel,' + pnm + '_bias,' + \
             nm + '_fwork, \n\t' + nm + '_go_backwards,' + \
             nm + '_return_sequences,' + 'k2c_' + \
-            layer.get_config()['activation'] + '); \n'
+            _normalize_activation(layer.get_config()['activation']) + '); \n'
 
     def _write_layer_Activation(self, layer, inputs, outputs, i):
         _, _, inputs, outputs, is_model_input, is_model_output = self._format_io_names(
             layer, inputs, outputs, True)
-        activation = 'k2c_' + layer.get_config()['activation']
+        activation_name = _normalize_activation(layer.get_config()['activation'])
+        activation = 'k2c_' + activation_name
         if is_model_input:
             inp = inputs + '->'
         else:
             inp = inputs[1:] + '.'
-        self.layers += activation + '(' + inp + 'array,' + inp + 'numel); \n'
+        # softmax / log_softmax in Keras operate over the last axis, not the
+        # whole flattened tensor — emit a per-row loop for multi-dim inputs.
+        if activation_name in ('softmax', 'log_softmax'):
+            self.layers += '{ \n'
+            self.layers += 'size_t k2c_last_dim = ' + inp + 'shape[' + inp + 'ndim - 1]; \n'
+            self.layers += 'for (size_t k2c_off = 0; k2c_off < ' + inp + 'numel; k2c_off += k2c_last_dim) { \n'
+            self.layers += activation + '(&' + inp + 'array[k2c_off], k2c_last_dim); \n'
+            self.layers += '} \n'
+            self.layers += '} \n'
+        else:
+            self.layers += activation + '(' + inp + 'array,' + inp + 'numel); \n'
         self._write_dummy_layer(layer, inputs, outputs, i,
                                 is_model_input, is_model_output)
 
@@ -355,7 +461,7 @@ def _write_layer_AdvancedActivation(self, layer, inputs, outputs, i):
         if is_model_input:
             inp = inputs + '->'
         else:
-            inp = inputs + '.'
+            inp = inputs[1:] + '.'
 
         if layer_type(layer) == 'LeakyReLU':
             self.layers += 'k2c_LeakyReLU(' + inp + 'array,' + \
@@ -573,3 +679,30 @@ def _write_layer_Input(self, layer, inputs, outputs, i):
 
     def _write_layer_InputLayer(self, layer, inputs, outputs, i):
         self.layers += ''
+
+    def _write_layer_TFOpLambda(self, layer, inputs, outputs, i):
+        self._write_layer_TensorFlowOpLayer(layer, inputs, outputs, i)
+
+    def _write_layer_TensorFlowOpLayer(self, layer, inputs, outputs, i):
+        if 'split' in layer.name:
+            self._write_split(layer, inputs, outputs, i)
+        else:
+            raise AssertionError('Unsupported TensorFlowOpLayer: ' + layer.name + '\n'
+                                 + 'Currently only split operation is supported.')
+
+    def _write_layer_Split(self, layer, inputs, outputs, i):
+        self._write_split(layer, inputs, outputs, i)
+
+    def _write_split(self, layer, inputs, outputs, i):
+        _, _, inputs, outputs = self._format_io_names(
+            layer, inputs, outputs)
+        node = layer._inbound_nodes[i]
+        out_tensors = getattr(node, 'output_tensors', None)
+        offset = 0
+        if isinstance(outputs, list):
+            for j, outp in enumerate(outputs):
+                self.layers += 'k2c_split(' + outp + ',' + inputs + ',' + str(offset) + '); \n'
+                if out_tensors is not None and isinstance(out_tensors, (list, tuple)):
+                    offset += int(out_tensors[j].shape[-1])
+                else:
+                    offset += layer.get_output_at(i)[j].shape[-1]
diff --git a/keras2c/make_test_suite.py b/keras2c/make_test_suite.py
index 9b6904e..627dd92 100644
--- a/keras2c/make_test_suite.py
+++ b/keras2c/make_test_suite.py
@@ -1,7 +1,7 @@
 """make_test_suite.py
 This file is part of keras2c
 Copyright 2020 Rory Conlin
-Licensed under MIT License
+Licensed under LGPLv3 License
 https://github.com/f0uriest/keras2c
 
 Generates automatic test suite for converted code
@@ -11,15 +11,19 @@
 import numpy as np
 from keras2c.io_parsing import get_model_io_names
 from keras2c.weights2c import Weights2C
-import tensorflow as tf
 import subprocess
-tf.compat.v1.disable_eager_execution()
 
-__author__ = "Rory Conlin"
-__copyright__ = "Copyright 2020, Rory Conlin"
-__license__ = "MIT"
-__maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
-__email__ = "wconlin@princeton.edu"
+
+# Original author
+# __author__ = "Rory Conlin"
+# __copyright__ = "Copyright 2020, Rory Conlin"
+# __license__ = "MIT"
+# __maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
+# __email__ = "wconlin@princeton.edu"
+
+# Modified by
+__author__ = "Anchal Gupta"
+__email__ = "guptaa@fusion.gat.com"
 
 
 def make_test_suite(model, function_name, malloc_vars, num_tests=10, stateful=False, verbose=True, tol=1e-5):
@@ -74,8 +78,10 @@ def make_test_suite(model, function_name, malloc_vars, num_tests=10, stateful=Fa
     s += 'struct timeval GetTimeStamp(); \n \n'
     file.write(s)
     for i in range(num_tests):
-        if i == num_tests//2 and stateful:
-            model.reset_states()
+        if i == num_tests // 2 and stateful:
+            for layer in model.layers:
+                if hasattr(layer, 'reset_states'):
+                    layer.reset_states()
         # generate random input and write to file
         ct = 0
         while True:
@@ -87,7 +93,14 @@ def make_test_suite(model, function_name, malloc_vars, num_tests=10, stateful=Fa
                 rand_inputs.insert(j, rand_input)
                 # make predictions
             outputs = model.predict(rand_inputs)
-            if np.isfinite(outputs).all():
+            if isinstance(outputs, list):
+                good = True
+                for outp in outputs:
+                    if not np.isfinite(outp).all():
+                        good = False
+                if good:
+                    break
+            elif np.isfinite(outputs).all():
                 break
             else:
                 ct += 1
@@ -109,7 +122,7 @@ def make_test_suite(model, function_name, malloc_vars, num_tests=10, stateful=Fa
                                          model_outputs[j] + '_test' + str(i+1)))
     s = 'int main(){\n'
     file.write(s)
-    
+
     s = ' float errors[' + str(num_tests*num_outputs) + '];\n'
     s += ' size_t num_tests = ' + str(num_tests) + '; \n'
     s += 'size_t num_outputs = ' + str(num_outputs) + '; \n'
diff --git a/keras2c/weights2c.py b/keras2c/weights2c.py
index a4b3e47..df0f8c2 100644
--- a/keras2c/weights2c.py
+++ b/keras2c/weights2c.py
@@ -1,7 +1,7 @@
 """weights2c.py
 This file is part of keras2c
 Copyright 2020 Rory Conlin
-Licensed under MIT License
+Licensed under LGPLv3 License
 https://github.com/f0uriest/keras2c
 
 Gets weights and other parameters from each layer and writes to C file
@@ -9,18 +9,22 @@
 
 # imports
 import numpy as np
-from keras2c.io_parsing import layer_type, get_layer_io_names, get_model_io_names
-from tensorflow.keras import backend as K
+from keras2c.io_parsing import layer_type, get_layer_io_names, get_model_io_names, get_model_layers, get_real_tensor_names
 import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
+
+
 maxndim = 5
 
+# Original author
+# __author__ = "Rory Conlin"
+# __copyright__ = "Copyright 2020, Rory Conlin"
+# __license__ = "MIT"
+# __maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
+# __email__ = "wconlin@princeton.edu"
 
-__author__ = "Rory Conlin"
-__copyright__ = "Copyright 2020, Rory Conlin"
-__license__ = "MIT"
-__maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
-__email__ = "wconlin@princeton.edu"
+# Modified by
+__author__ = "Anchal Gupta"
+__email__ = "guptaa@fusion.gat.com"
 
 
 class Weights2C():
@@ -39,8 +43,10 @@ def __init__(self, model, function_name, malloc=False):
         self.model_io = get_model_io_names(self.model)
         self.malloc = malloc
         self.stack_vars = ''
+        self.file_scope_vars = ''
         self.malloc_vars = {}
         self.static_vars = {}
+        self.valid_tensors = get_real_tensor_names(self.model)
 
     @staticmethod
     def array2c(array, name, malloc=False):
@@ -91,41 +97,99 @@ def array2c(array, name, malloc=False):
                     1:-1] + '}}; \n'
             return s
 
+    @staticmethod
+    def array2c_static(array, name):
+        """Generates C code with static storage for a k2c_tensor.
+
+        Constant weight arrays become static const (in .rodata).
+        Zero-initialized scratch arrays become static (allocated once).
+
+        Returns:
+            tuple of (file_scope_code, function_body_code)
+        """
+        temp = array.flatten(order='C')
+        size = array.size
+        shp = array.shape
+        ndim = len(shp)
+        shp = np.concatenate((shp, np.ones(maxndim-ndim)))
+        shp_str = np.array2string(shp.astype(int), separator=',')[1:-1]
+
+        is_zero = np.max(np.abs(temp)) < 1e-16
+        file_scope = ''
+        stack = ''
+
+        if is_zero:
+            # Mutable scratch buffer — give each calling thread its own copy
+            # so concurrent inference calls don't trample each other.
+            file_scope += 'static K2C_THREAD_LOCAL float ' + name + '_array[' + str(size) + ']; \n'
+            stack += 'k2c_tensor ' + name + ' = {&' + name + \
+                '_array[0],' + str(int(ndim)) + ',' + str(int(size)) + ',{' + \
+                shp_str + '}}; \n'
+        else:
+            count = 0
+            file_scope += 'static const float ' + name + '_array[' + str(size) + '] = {\n'
+            for i in range(size):
+                if temp[i] == np.inf:
+                    file_scope += "HUGE_VALF,"
+                elif temp[i] == -np.inf:
+                    file_scope += "-HUGE_VALF,"
+                else:
+                    file_scope += "{:+.8e}f".format(temp[i]) + ','
+                count += 1
+                if (count) % 5 == 0:
+                    file_scope += '\n'
+            file_scope += '}; \n'
+            file_scope += 'static const k2c_tensor ' + name + ' = {(float*)&' + name + \
+                '_array[0],' + str(int(ndim)) + ',' + str(int(size)) + ',{' + \
+                shp_str + '}}; \n'
+
+        return file_scope, stack
+
     def _write_weights_array2c(self, array, name):
-        temp = self.array2c(array, name, self.malloc)
         if self.malloc:
+            temp = self.array2c(array, name, self.malloc)
             self.stack_vars += temp[0]
             self.malloc_vars.update(temp[1])
         else:
-            self.stack_vars += temp
+            file_scope, stack = self.array2c_static(array, name)
+            self.file_scope_vars += file_scope
+            self.stack_vars += stack
 
-    def _write_weights_layer(self, layer):
+    def _write_weights_layer(self, layer, **kwargs):
         method = getattr(self, '_write_weights_' + layer_type(layer))
-        return method(layer)
+        return method(layer, **kwargs)
 
-    def write_weights(self, verbose=True):
+    def write_weights(self, verbose=True, skip_layers=None):
         """Parses and generates code for model weights and other parameters
 
         Args:
             verbose (bool): whether to print progress
+            skip_layers (set): set of layer names to skip (just write outputs)
 
         Returns:
             (tuple): tuple containing
 
                 - **stack_vars** (*str*): code for variables allocated on the stack
-                - **malloc_vars** (*dict*): dictionary of name,value pairs for arrays to be 
+                - **malloc_vars** (*dict*): dictionary of name,value pairs for arrays to be
                     allocated on the heap
                 - **static_vars** (*str*): code fora C struct containing static variables
                     (eg, states of a stateful RNN)
         """
-        for layer in self.model.layers:
+        if skip_layers is None:
+            skip_layers = set()
+        for layer in get_model_layers(self.model):
+            if layer.name in skip_layers:
+                self._write_outputs(layer)
+                continue
             method = getattr(self, '_write_weights_' + layer_type(layer))
             method(layer)
-        return self.stack_vars, self.malloc_vars, self._write_static_vars()
+        return self.stack_vars, self.malloc_vars, self._write_static_vars(), self.file_scope_vars
 
     def _write_static_vars(self):
         if len(self.static_vars) > 0:
-            s = 'static struct ' + self.function_name + '_static_vars \n'
+            # Stateful RNN cell state is mutable shared storage — make it
+            # thread-local so concurrent callers each get their own state.
+            s = 'static K2C_THREAD_LOCAL struct ' + self.function_name + '_static_vars \n'
             s += '{ \n'
             for k, v in self.static_vars.items():
                 s += 'float ' + k + '[' + str(v) + ']; \n'
@@ -135,13 +199,20 @@ def _write_static_vars(self):
         return s
 
     def _write_outputs(self, layer):
-        _, outputs = get_layer_io_names(layer)
-        if len(outputs) > 1:
+        _, outputs = get_layer_io_names(layer, self.valid_tensors)
+        if isinstance(outputs, list):
             for i, outp in enumerate(outputs):
-                outshp = layer.get_output_at(i).shape[1:]
-                if outp not in self.model_io[1]:
-                    self._write_weights_array2c(
-                        np.zeros(outshp), outp + '_output')
+                if isinstance(outp, list):
+                    for j, outpp in enumerate(outp):
+                        outshp = layer.get_output_at(i)[j].shape[1:]
+                        if outpp not in self.model_io[1]:
+                            self._write_weights_array2c(
+                                np.zeros(outshp), outpp + '_output')
+                else:
+                    outshp = layer.output.shape[1:]
+                    if outp not in self.model_io[1]:
+                        self._write_weights_array2c(
+                            np.zeros(outshp), outp + '_output')
         else:
             outshp = layer.output_shape[1:]
             if outputs[0] not in self.model_io[1]:
@@ -152,18 +223,40 @@ def _write_outputs(self, layer):
 
     def _write_weights_Bidirectional(self, layer):
         try:
-            foo = layer.forward_layer.input_shape
-            foo = layer.backward_layer.input_shape
+            forward_shapes = [
+                t.shape
+                for node in getattr(layer.forward_layer, "_inbound_nodes", [])
+                for t in node.input_tensors
+            ]
+            backward_shapes = [
+                t.shape
+                for node in getattr(layer.backward_layer, "_inbound_nodes", [])
+                for t in node.input_tensors
+            ]
+
+            # If there are shapes, just return them
+            if forward_shapes:
+                forward_output = layer.forward_layer.output
+            else:
+                raise AttributeError
+
+            if backward_shapes:
+                backward_output = layer.backward_layer.output
+            else:
+                raise AttributeError
         except:
-            temp_input = tf.keras.layers.Input(
-                layer.input_shape[2:])
-            foo = layer.layer.__call__(temp_input)
-            foo = layer.forward_layer.__call__(temp_input)
-            foo = layer.backward_layer.__call__(temp_input)
-        self._write_weights_layer(layer.backward_layer)
-        self._write_weights_layer(layer.forward_layer)
-        if layer.merge_mode:
+            temp_input = tf.keras.layers.Input(shape=layer.input.shape[1:])
+            forward_output = layer.forward_layer(temp_input)
+            backward_output = layer.backward_layer(temp_input)
+
+        self._write_weights_layer(layer.backward_layer, skip_outputs=True)
+        outshp = layer.backward_layer.output.shape[1:]
+        self._write_weights_array2c(np.zeros(outshp), layer.backward_layer.name + '_output')
 
+        self._write_weights_layer(layer.forward_layer, skip_outputs=True)
+        outshp = layer.forward_layer.output.shape[1:]
+        self._write_weights_array2c(np.zeros(outshp), layer.forward_layer.name + '_output')
+        if layer.merge_mode:
             self._write_outputs(layer)
             self.stack_vars += 'size_t ' + layer.name + '_num_tensors' + str(0) + \
                 ' = ' + str(2) + '; \n'
@@ -176,36 +269,38 @@ def _write_weights_Bidirectional(self, layer):
                     str(ax) + '; \n'
 
         else:
-            output_names = get_layer_io_names(layer)[1][0]
-            subname = layer.layer.name
+            output_names = get_layer_io_names(layer, self.valid_tensors)[1][0]
+            subname = layer.forward_layer.name
             self.stack_vars += 'k2c_tensor * ' + \
-                output_names[0] + ' = forward_' + subname + '_output; \n'
+                output_names[0] + ' = ' + subname + '_output; \n'
+            subname = layer.backward_layer.name
             self.stack_vars += 'k2c_tensor * ' + \
-                output_names[1] + ' = backward_' + subname + '_output; \n'
+                output_names[1] + ' = ' + subname + '_output; \n'
 
-    def _write_weights_TimeDistributed(self, layer):
-        self._write_outputs(layer)
+    def _write_weights_TimeDistributed(self, layer, skip_outputs=False):
+        if not skip_outputs:
+            self._write_outputs(layer)
         try:
-            foo = layer.layer.input_shape
+            foo = layer.layer.input.shape
         except:
             temp_input = tf.keras.layers.Input(
-                layer.input_shape[2:], batch_size=1)
+                layer.input.shape[2:], batch_size=1)
             foo = layer.layer.__call__(temp_input)
         self._write_weights_layer(layer.layer)
-        timeslice_input = np.squeeze(np.zeros(layer.layer.input_shape[1:]))
-        timeslice_output = np.squeeze(np.zeros(layer.layer.output_shape[1:]))
+        timeslice_input = np.squeeze(np.zeros(layer.layer.input.shape[1:]))
+        timeslice_output = np.squeeze(np.zeros(layer.layer.output.shape[1:]))
         self._write_weights_array2c(
             timeslice_input, layer.layer.name + '_timeslice_input')
         self._write_weights_array2c(
             timeslice_output, layer.layer.name + '_timeslice_output')
         self.stack_vars += 'const size_t ' + layer.name +\
-                           '_timesteps = ' + str(layer.input_shape[1]) + '; \n'
+                           '_timesteps = ' + str(layer.input.shape[1]) + '; \n'
         self.stack_vars += 'const size_t ' + layer.name +\
                            '_in_offset = ' + \
-            str(np.prod(layer.input_shape[2:])) + '; \n'
+            str(np.prod(layer.input.shape[2:])) + '; \n'
         self.stack_vars += 'const size_t ' + layer.name +\
                            '_out_offset = ' + \
-            str(np.prod(layer.output_shape[2:])) + '; \n'
+            str(np.prod(layer.output.shape[2:])) + '; \n'
 
     def _write_weights_Input(self, layer):
         self.stack_vars += ''
@@ -213,15 +308,24 @@ def _write_weights_Input(self, layer):
     def _write_weights_InputLayer(self, layer):
         self.stack_vars += ''
 
-    def _write_weights_BatchNormalization(self, layer):
-        center = layer.get_config()['center']
-        scale = layer.get_config()['scale']
-        if isinstance(layer.get_config()['axis'], (list, tuple, np.ndarray)):
-            axis = layer.get_config()['axis'][0]-1
+    def _write_weights_BatchNormalization(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        center = cfg['center']
+        scale = cfg['scale']
+        ndim = len(layer.output.shape)  # includes batch dim
+        if isinstance(cfg['axis'], (list, tuple, np.ndarray)):
+            axis_cfg = cfg['axis'][0]
         else:
-            axis = layer.get_config()['axis']-1
+            axis_cfg = cfg['axis']
+        if isinstance(layer.input, (list, tuple)):
+            ndim = len(layer.input[0].shape)
+        else:
+            ndim = len(layer.input.shape)
+        if axis_cfg < 0:
+            axis_cfg = ndim + axis_cfg
+        axis = axis_cfg - 1
 
-        epsilon = layer.get_config()['epsilon']
+        epsilon = cfg['epsilon']
 
         if center and scale:
             gamma = layer.get_weights()[0]
@@ -245,7 +349,8 @@ def _write_weights_BatchNormalization(self, layer):
             gamma = np.ones(mean.shape)
 
         stdev = np.sqrt(variance + epsilon)
-        self._write_outputs(layer)
+        if not skip_outputs:
+            self._write_outputs(layer)
         self.stack_vars += 'size_t ' + layer.name + \
             '_axis = ' + str(axis) + '; \n'
         self._write_weights_array2c(mean, layer.name + '_mean')
@@ -254,16 +359,18 @@ def _write_weights_BatchNormalization(self, layer):
         self._write_weights_array2c(beta, layer.name + '_beta')
         self.stack_vars += '\n\n'
 
-    def _write_weights_LSTM(self, layer):
-        units = layer.get_config()['units']
-        self._write_outputs(layer)
+    def _write_weights_LSTM(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        units = cfg['units']
+        if not skip_outputs:
+            self._write_outputs(layer)
         self.stack_vars += 'float ' + layer.name + \
                            '_fwork[' + str(8*units) + '] = {0}; \n'
         self.stack_vars += 'int ' + layer.name + '_go_backwards = ' + \
-            str(int(layer.get_config()['go_backwards'])) + ';\n'
+            str(int(cfg['go_backwards'])) + ';\n'
         self.stack_vars += 'int ' + layer.name + '_return_sequences = ' + \
-            str(int(layer.get_config()['return_sequences'])) + ';\n'
-        if layer.get_config()['stateful']:
+            str(int(cfg['return_sequences'])) + ';\n'
+        if cfg['stateful']:
             self.static_vars.update({layer.name + '_state': 2*units})
             self.stack_vars += 'float * ' + layer.name + '_state = ' + \
                 self.function_name + '_states.' + \
@@ -275,7 +382,7 @@ def _write_weights_LSTM(self, layer):
         weights = layer.get_weights()
         kernel = weights[0]
         recurrent_kernel = weights[1]
-        if layer.get_config()['use_bias']:
+        if cfg['use_bias']:
             bias = weights[2]
         else:
             bias = np.zeros(4*units)
@@ -288,18 +395,20 @@ def _write_weights_LSTM(self, layer):
         self._write_weights_array2c(bias, layer.name + '_bias')
         self.stack_vars += '\n \n'
 
-    def _write_weights_GRU(self, layer):
-        units = layer.get_config()['units']
-        self._write_outputs(layer)
+    def _write_weights_GRU(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        units = cfg['units']
+        if not skip_outputs:
+            self._write_outputs(layer)
         self.stack_vars += 'float ' + layer.name + \
             '_fwork[' + str(6*units) + '] = {0}; \n'
         self.stack_vars += 'int ' + layer.name + '_reset_after = ' + \
-            str(int(layer.get_config()['reset_after'])) + ';\n'
+            str(int(cfg['reset_after'])) + ';\n'
         self.stack_vars += 'int ' + layer.name + '_go_backwards = ' + \
-            str(int(layer.get_config()['go_backwards'])) + ';\n'
+            str(int(cfg['go_backwards'])) + ';\n'
         self.stack_vars += 'int ' + layer.name + '_return_sequences = ' + \
-            str(int(layer.get_config()['return_sequences'])) + ';\n'
-        if layer.get_config()['stateful']:
+            str(int(cfg['return_sequences'])) + ';\n'
+        if cfg['stateful']:
             self.static_vars.update({layer.name + '_state': units})
             self.stack_vars += 'float * ' + layer.name + '_state = ' + \
                 self.function_name + '_states.' + \
@@ -311,9 +420,9 @@ def _write_weights_GRU(self, layer):
         weights = layer.get_weights()
         kernel = weights[0]
         recurrent_kernel = weights[1]
-        if layer.get_config()['use_bias']:
+        if cfg['use_bias']:
             bias = weights[2]
-            if layer.get_config()['reset_after']:
+            if cfg['reset_after']:
                 rbias = bias[1]
                 bias = bias[0]
             else:
@@ -332,16 +441,18 @@ def _write_weights_GRU(self, layer):
         self._write_weights_array2c(cbias, layer.name + '_bias')
         self.stack_vars += '\n \n'
 
-    def _write_weights_SimpleRNN(self, layer):
-        units = layer.get_config()['units']
-        self._write_outputs(layer)
+    def _write_weights_SimpleRNN(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        units = cfg['units']
+        if not skip_outputs:
+            self._write_outputs(layer)
         self.stack_vars += 'int ' + layer.name + '_go_backwards = ' + \
-            str(int(layer.get_config()['go_backwards'])) + ';\n'
+            str(int(cfg['go_backwards'])) + ';\n'
         self.stack_vars += 'int ' + layer.name + '_return_sequences = ' + \
-            str(int(layer.get_config()['return_sequences'])) + ';\n'
+            str(int(cfg['return_sequences'])) + ';\n'
         self.stack_vars += 'float ' + layer.name + \
             '_fwork[' + str(2*units) + '] = {0}; \n'
-        if layer.get_config()['stateful']:
+        if cfg['stateful']:
             self.static_vars.update({layer.name + '_state': units})
             self.stack_vars += 'float * ' + layer.name + '_state = ' + \
                 self.function_name + '_states.' + \
@@ -353,7 +464,7 @@ def _write_weights_SimpleRNN(self, layer):
         weights = layer.get_weights()
         kernel = weights[0]
         recurrent_kernel = weights[1]
-        if layer.get_config()['use_bias']:
+        if cfg['use_bias']:
             bias = weights[2]
         else:
             bias = np.zeros(units)
@@ -363,11 +474,13 @@ def _write_weights_SimpleRNN(self, layer):
         self._write_weights_array2c(bias, layer.name + '_bias')
         self.stack_vars += '\n \n'
 
-    def _write_weights_Dense(self, layer):
-        self._write_outputs(layer)
+    def _write_weights_Dense(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        if not skip_outputs:
+            self._write_outputs(layer)
         weights = layer.get_weights()
         A = weights[0]
-        if layer.get_config()['use_bias']:
+        if cfg['use_bias']:
             b = weights[1]
         else:
             b = np.zeros(A.shape[1])
@@ -375,21 +488,23 @@ def _write_weights_Dense(self, layer):
         self._write_weights_array2c(A, layer.name + '_kernel')
         self._write_weights_array2c(b, layer.name + '_bias')
         self.stack_vars += 'float ' + layer.name + \
-            '_fwork[' + str(np.prod(layer.input_shape[1:]) +
+            '_fwork[' + str(np.prod(layer.input.shape[1:]) +
                             np.prod(A.shape)) + '] = {0}; \n'
         self.stack_vars += '\n \n'
 
-    def _write_weights_Conv1D(self, layer):
-        padding = layer.get_config()['padding']
-        stride = layer.get_config()['strides'][0]
-        dilation = layer.get_config()['dilation_rate'][0]
-        kernel_size = layer.get_config()['kernel_size'][0]
+    def _write_weights_Conv1D(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        padding = cfg['padding']
+        stride = cfg['strides'][0]
+        dilation = cfg['dilation_rate'][0]
+        kernel_size = cfg['kernel_size'][0]
         self.stack_vars += 'size_t ' + layer.name + \
             '_stride = ' + str(stride) + '; \n'
         self.stack_vars += 'size_t ' + layer.name + \
             '_dilation = ' + str(dilation) + '; \n'
-        self._write_outputs(layer)
-        inshp = layer.get_input_at(0).shape[1:]
+        if not skip_outputs:
+            self._write_outputs(layer)
+        inshp = layer.input.shape[1:]
         if padding == 'causal':
             pad_along_height = dilation*(kernel_size-1)
             pad_top = pad_along_height
@@ -407,11 +522,11 @@ def _write_weights_Conv1D(self, layer):
                                         layer.name + '_padded_input')
             self.stack_vars += 'size_t ' + layer.name + '_pad[2] = {' + str(pad_top) + ','\
                 + str(pad_bottom) + '}; \n'
-            self.stack_vars += 'float ' + layer.name + '_fill = 0.0f; \n'
+            self.stack_vars += "float " + layer.name + "_fill = 0.0f; \n"
 
         weights = layer.get_weights()
         kernel = weights[0]
-        if layer.get_config()['use_bias']:
+        if cfg['use_bias']:
             bias = weights[1]
         else:
             bias = np.zeros(kernel.shape[2])
@@ -419,19 +534,21 @@ def _write_weights_Conv1D(self, layer):
         self._write_weights_array2c(bias, layer.name + '_bias')
         self.stack_vars += '\n \n'
 
-    def _write_weights_Conv2D(self, layer):
-        padding = layer.get_config()['padding']
-        stride = layer.get_config()['strides']
-        dilation = layer.get_config()['dilation_rate']
-        kernel_size = layer.get_config()['kernel_size']
+    def _write_weights_Conv2D(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        padding = cfg['padding']
+        stride = cfg['strides']
+        dilation = cfg['dilation_rate']
+        kernel_size = cfg['kernel_size']
         self.stack_vars += 'size_t ' + layer.name + \
             '_stride[2] = {' + ','.join([str(i) for i in stride]) + '}; \n'
         self.stack_vars += 'size_t ' + layer.name + \
             '_dilation[2] = {' + ','.join([str(i)
                                            for i in dilation]) + '}; \n'
-        self._write_outputs(layer)
+        if not skip_outputs:
+            self._write_outputs(layer)
         if padding == 'same':
-            inshp = layer.get_input_at(0).shape[1:]
+            inshp = layer.input.shape[1:]
             pad_along_height = dilation[0]*(kernel_size[0]-1)
             pad_top = int(pad_along_height // 2)
             pad_bottom = int(pad_along_height - pad_top)
@@ -449,7 +566,7 @@ def _write_weights_Conv2D(self, layer):
 
         weights = layer.get_weights()
         kernel = weights[0]
-        if layer.get_config()['use_bias']:
+        if cfg['use_bias']:
             bias = weights[1]
         else:
             bias = np.zeros(kernel.shape[3])
@@ -457,19 +574,21 @@ def _write_weights_Conv2D(self, layer):
         self._write_weights_array2c(bias, layer.name + '_bias')
         self.stack_vars += '\n \n'
 
-    def _write_weights_Conv3D(self, layer):
-        padding = layer.get_config()['padding']
-        stride = layer.get_config()['strides']
-        dilation = layer.get_config()['dilation_rate']
-        kernel_size = layer.get_config()['kernel_size']
+    def _write_weights_Conv3D(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        padding = cfg['padding']
+        stride = cfg['strides']
+        dilation = cfg['dilation_rate']
+        kernel_size = cfg['kernel_size']
         self.stack_vars += 'size_t ' + layer.name + \
             '_stride[3] = {' + ','.join([str(i) for i in stride]) + '}; \n'
         self.stack_vars += 'size_t ' + layer.name + \
             '_dilation[3] = {' + ','.join([str(i)
                                            for i in dilation]) + '}; \n'
-        self._write_outputs(layer)
+        if not skip_outputs:
+            self._write_outputs(layer)
         if padding == 'same':
-            inshp = layer.get_input_at(0).shape[1:]
+            inshp = layer.input.shape[1:]
             pad_along_height = dilation[0]*(kernel_size[0]-1)
             pad_top = int(pad_along_height // 2)
             pad_bottom = int(pad_along_height - pad_top)
@@ -493,7 +612,7 @@ def _write_weights_Conv3D(self, layer):
 
         weights = layer.get_weights()
         kernel = weights[0]
-        if layer.get_config()['use_bias']:
+        if cfg['use_bias']:
             bias = weights[1]
         else:
             bias = np.zeros(kernel.shape[3])
@@ -501,23 +620,63 @@ def _write_weights_Conv3D(self, layer):
         self._write_weights_array2c(bias, layer.name + '_bias')
         self.stack_vars += '\n \n'
 
-    def _write_weights_MaxPooling1D(self, layer):
-        return self._write_weights_Pooling1D(layer)
+    def _write_weights_Conv1DTranspose(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        padding = cfg['padding']
+        stride = cfg['strides'][0]
+        dilation = cfg['dilation_rate'][0]
+        if dilation != 1:
+            raise ValueError('Dilation not supported for Conv1DTranspose')
+        kernel_size = cfg['kernel_size'][0]
 
-    def _write_weights_AveragePooling1D(self, layer):
-        return self._write_weights_Pooling1D(layer)
+        # Write stride to C
+        self.stack_vars += 'size_t ' + layer.name + \
+            '_stride = ' + str(stride) + '; \n'
 
-    def _write_weights_Pooling1D(self, layer):
-        pad = layer.get_config()['padding']
-        stride = layer.get_config()['strides'][0]
-        pool_size = layer.get_config()['pool_size'][0]
+        if padding == 'valid':
+            start_crop = 0
+        elif padding == 'same':
+            start_crop = (kernel_size - stride) // 2
+        else:
+            raise ValueError('Only same and valid padding supported for Conv1DTranspose')
+        # Write start_crop to C
+        self.stack_vars += 'size_t ' + layer.name + \
+            '_start_crop = ' + str(start_crop) + '; \n'
+
+        # Initialize layer.name + '_output'
+        if not skip_outputs:
+            self._write_outputs(layer)
+
+        # Write kernel and bias to C
+        weights = layer.get_weights()
+        kernel = weights[0]
+        if cfg['use_bias']:
+            bias = weights[1]
+        else:
+            bias = np.zeros(kernel.shape[1])
+        self._write_weights_array2c(kernel, layer.name + '_kernel')
+        self._write_weights_array2c(bias, layer.name + '_bias')
+        self.stack_vars += '\n \n'
+
+    def _write_weights_MaxPooling1D(self, layer, **kwargs):
+        return self._write_weights_Pooling1D(layer, **kwargs)
+
+    def _write_weights_AveragePooling1D(self, layer, **kwargs):
+        return self._write_weights_Pooling1D(layer, **kwargs)
+
+    def _write_weights_Pooling1D(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        pad = cfg['padding']
+        stride = cfg['strides'][0]
+        pool_size = cfg['pool_size'][0]
         self.stack_vars += 'size_t ' + layer.name + \
             '_stride = ' + str(stride) + '; \n'
         self.stack_vars += 'size_t ' + layer.name + \
             '_pool_size = ' + str(pool_size) + '; \n'
-        self._write_outputs(layer)
-        inshp = layer.get_input_at(0).shape[1:]
-        outshp = layer.get_output_at(0).shape[1:]
+        if not skip_outputs:
+            self._write_outputs(layer)
+        inshp = layer.input.shape[1:]
+        outshp = layer.output.shape[1:]
         if pad == 'same':
             pad_along_height = max((outshp[0] - 1) * stride +
                                    pool_size - inshp[0], 0)
@@ -527,28 +686,30 @@ def _write_weights_Pooling1D(self, layer):
                                         layer.name + '_padded_input')
             self.stack_vars += 'size_t ' + layer.name + '_pad[2] = {' + str(pad_top) + ','\
                 + str(pad_bottom) + '}; \n'
-            self.stack_vars += 'float ' + layer.name + '_fill = -HUGE_VALF; \n'
+            self.stack_vars += 'float ' + layer.name + '_fill = -3.4e+38f; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_MaxPooling2D(self, layer):
-        return self._write_weights_Pooling2D(layer)
+    def _write_weights_MaxPooling2D(self, layer, **kwargs):
+        return self._write_weights_Pooling2D(layer, **kwargs)
 
-    def _write_weights_AveragePooling2D(self, layer):
-        return self._write_weights_Pooling2D(layer)
+    def _write_weights_AveragePooling2D(self, layer, **kwargs):
+        return self._write_weights_Pooling2D(layer, **kwargs)
 
-    def _write_weights_Pooling2D(self, layer):
-        padding = layer.get_config()['padding']
-        stride = layer.get_config()['strides']
-        pool_size = layer.get_config()['pool_size']
+    def _write_weights_Pooling2D(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        padding = cfg['padding']
+        stride = cfg['strides']
+        pool_size = cfg['pool_size']
         self.stack_vars += 'size_t ' + layer.name + \
             '_stride[2] = {' + ','.join([str(i) for i in stride]) + '}; \n'
         self.stack_vars += 'size_t ' + layer.name + \
             '_pool_size[2] = {' + ','.join([str(i)
                                             for i in pool_size]) + '}; \n'
-        self._write_outputs(layer)
+        if not skip_outputs:
+            self._write_outputs(layer)
         if padding == 'same':
-            inshp = layer.get_input_at(0).shape[1:]
-            outshp = layer.get_output_at(0).shape[1:]
+            inshp = layer.input.shape[1:]
+            outshp = layer.output.shape[1:]
             pad_along_height = max((outshp[0] - 1) * stride[0] +
                                    pool_size[0] - inshp[0], 0)
             pad_top = int(pad_along_height // 2)
@@ -564,68 +725,71 @@ def _write_weights_Pooling2D(self, layer):
                                         '_padded_input')
             self.stack_vars += 'size_t ' + layer.name + \
                 '_pad[4] = {' + ','.join([str(i) for i in pad]) + '}; \n'
-            self.stack_vars += 'float ' + layer.name + '_fill = -HUGE_VALF; \n'
+            self.stack_vars += 'float ' + layer.name + '_fill = -3.4e+38f; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_GlobalMaxPooling1D(self, layer):
-        return self._write_weights_GlobalPooling(layer)
+    def _write_weights_GlobalMaxPooling1D(self, layer, **kwargs):
+        return self._write_weights_GlobalPooling(layer, **kwargs)
 
-    def _write_weights_GlobalMaxPooling2D(self, layer):
-        return self._write_weights_GlobalPooling(layer)
+    def _write_weights_GlobalMaxPooling2D(self, layer, **kwargs):
+        return self._write_weights_GlobalPooling(layer, **kwargs)
 
-    def _write_weights_GlobalMaxPooling3D(self, layer):
-        return self._write_weights_GlobalPooling(layer)
+    def _write_weights_GlobalMaxPooling3D(self, layer, **kwargs):
+        return self._write_weights_GlobalPooling(layer, **kwargs)
 
-    def _write_weights_GlobalAveragePooling1D(self, layer):
-        return self._write_weights_GlobalPooling(layer)
+    def _write_weights_GlobalAveragePooling1D(self, layer, **kwargs):
+        return self._write_weights_GlobalPooling(layer, **kwargs)
 
-    def _write_weights_GlobalAveragePooling2D(self, layer):
-        return self._write_weights_GlobalPooling(layer)
+    def _write_weights_GlobalAveragePooling2D(self, layer, **kwargs):
+        return self._write_weights_GlobalPooling(layer, **kwargs)
 
-    def _write_weights_GlobalAveragePooling3D(self, layer):
-        return self._write_weights_GlobalPooling(layer)
+    def _write_weights_GlobalAveragePooling3D(self, layer, **kwargs):
+        return self._write_weights_GlobalPooling(layer, **kwargs)
 
-    def _write_weights_GlobalPooling(self, layer):
-        self._write_outputs(layer)
+    def _write_weights_GlobalPooling(self, layer, skip_outputs=False):
+        if not skip_outputs:
+            self._write_outputs(layer)
         self.stack_vars += '\n\n'
 
-    def _write_weights_Add(self, layer):
-        return self._write_weights_Merge(layer)
+    def _write_weights_Add(self, layer, **kwargs):
+        return self._write_weights_Merge(layer, **kwargs)
 
-    def _write_weights_Subtract(self, layer):
-        return self._write_weights_Merge(layer)
+    def _write_weights_Subtract(self, layer, **kwargs):
+        return self._write_weights_Merge(layer, **kwargs)
 
-    def _write_weights_Multiply(self, layer):
-        return self._write_weights_Merge(layer)
+    def _write_weights_Multiply(self, layer, **kwargs):
+        return self._write_weights_Merge(layer, **kwargs)
 
-    def _write_weights_Average(self, layer):
-        return self._write_weights_Merge(layer)
+    def _write_weights_Average(self, layer, **kwargs):
+        return self._write_weights_Merge(layer, **kwargs)
 
-    def _write_weights_Maximum(self, layer):
-        return self._write_weights_Merge(layer)
+    def _write_weights_Maximum(self, layer, **kwargs):
+        return self._write_weights_Merge(layer, **kwargs)
 
-    def _write_weights_Minimum(self, layer):
-        return self._write_weights_Merge(layer)
+    def _write_weights_Minimum(self, layer, **kwargs):
+        return self._write_weights_Merge(layer, **kwargs)
 
-    def _write_weights_Merge(self, layer):
-        self._write_outputs(layer)
-        inputs, outputs = get_layer_io_names(layer)
+    def _write_weights_Merge(self, layer, skip_outputs=False):
+        if not skip_outputs:
+            self._write_outputs(layer)
+        inputs, outputs = get_layer_io_names(layer, self.valid_tensors)
         for i, (inp, outp) in enumerate(zip(inputs, outputs)):
             num_tensors = len(inp)
             self.stack_vars += 'size_t ' + layer.name + '_num_tensors' + str(i) + \
                 ' = ' + str(num_tensors) + '; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_Concatenate(self, layer):
-        inputs, outputs = get_layer_io_names(layer)
+    def _write_weights_Concatenate(self, layer, ):
+        cfg = layer.get_config()
+        inputs, outputs = get_layer_io_names(layer, self.valid_tensors)
         for i, (inp, outp) in enumerate(zip(inputs, outputs)):
-            outshp = layer.get_output_at(i).shape[1:]
+            outshp = layer.output.shape[1:]
             num_tensors = len(inp)
             self.stack_vars += 'size_t ' + layer.name + '_num_tensors' + str(i) + \
                 ' = ' + str(num_tensors) + '; \n'
-            ax = layer.get_config()['axis']
+            ax = cfg['axis']
             if ax < 0:
-                ax += len(layer.get_input_at(i)[0].shape)
+                ax += len(layer.input[0].shape)
             self.stack_vars += 'size_t ' + layer.name + '_axis = ' +\
                 str(ax-1) + '; \n'
         if outp not in self.model_io[1]:
@@ -634,27 +798,34 @@ def _write_weights_Concatenate(self, layer):
         self.stack_vars += '\n\n'
 
     def _write_weights_ELU(self, layer):
-        alpha = layer.get_config()['alpha']
+        cfg = layer.get_config()
+        alpha = cfg['alpha']
         self.stack_vars += 'float ' + layer.name + \
             '_alpha = ' + str(alpha) + '; \n'
         self.stack_vars += '\n\n'
 
     def _write_weights_LeakyReLU(self, layer):
-        alpha = layer.get_config()['alpha']
+        cfg = layer.get_config()
+        try:
+            alpha = cfg['alpha']
+        except KeyError:
+            alpha = cfg['negative_slope']
         self.stack_vars += 'float ' + layer.name + \
             '_alpha = ' + str(alpha) + '; \n'
         self.stack_vars += '\n\n'
 
     def _write_weights_ThresholdedReLU(self, layer):
-        theta = layer.get_config()['theta']
+        cfg = layer.get_config()
+        theta = cfg['theta']
         self.stack_vars = 'float ' + layer.name + \
             '_theta = ' + str(theta) + '; \n'
         self.stack_vars += '\n\n'
 
     def _write_weights_ReLU(self, layer):
-        max_value = layer.get_config()['max_value']
-        negative_slope = layer.get_config()['negative_slope']
-        threshold = layer.get_config()['threshold']
+        cfg = layer.get_config()
+        max_value = cfg['max_value']
+        negative_slope = cfg['negative_slope']
+        threshold = cfg['threshold']
         if max_value is None:
             max_value = 'HUGE_VALF'
         self.stack_vars += 'float ' + layer.name + \
@@ -670,10 +841,12 @@ def _write_weights_PReLU(self, layer):
             layer.get_weights()[0], layer.name + '_alpha')
         self.stack_vars += '\n\n'
 
-    def _write_weights_Reshape(self, layer):
+    def _write_weights_Reshape(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
         nm = layer.name
-        self._write_outputs(layer)
-        newshp = layer.get_config()['target_shape']
+        if not skip_outputs:
+            self._write_outputs(layer)
+        newshp = cfg['target_shape']
         newndim = len(newshp)
         newshp = np.concatenate((newshp, np.ones(maxndim-newndim)))
         self.stack_vars += 'size_t ' + nm + \
@@ -683,26 +856,32 @@ def _write_weights_Reshape(self, layer):
                                 separator=',')[1:-1]) + '}; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_Permute(self, layer):
-        self._write_outputs(layer)
-        permute = np.array(layer.get_config()['dims']).astype(int) - 1
+    def _write_weights_Permute(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        if not skip_outputs:
+            self._write_outputs(layer)
+        permute = np.array(cfg['dims']).astype(int) - 1
         self.stack_vars += 'size_t ' + layer.name + '_permute[' + str(permute.size) + '] = {' +\
             str(np.array2string(permute.astype(int),
                                 separator=',')[1:-1]) + '}; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_RepeatVector(self, layer):
-        self._write_outputs(layer)
-        n = layer.get_config()['n']
+    def _write_weights_RepeatVector(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
+        if not skip_outputs:
+            self._write_outputs(layer)
+        n = cfg['n']
         self.stack_vars += 'size_t ' + layer.name + '_n = ' + str(n) + '; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_Dot(self, layer):
+    def _write_weights_Dot(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
         nm = layer.name
-        self._write_outputs(layer)
+        if not skip_outputs:
+            self._write_outputs(layer)
         work_size = np.prod(layer.input[0].shape[1:]) + \
             np.prod(layer.input[1].shape[1:])
-        axes = np.array(layer.get_config()['axes']) - 1
+        axes = np.array(cfg['axes']) - 1
         self.stack_vars += 'size_t ' + nm + \
             '_axesA[1] = {' + str(axes[0]) + '}; \n'
         self.stack_vars += 'size_t ' + nm + \
@@ -711,106 +890,125 @@ def _write_weights_Dot(self, layer):
         self.stack_vars += 'float ' + nm + \
             '_fwork[' + str(work_size) + '] = {0}; \n'
         self.stack_vars += 'int ' + nm + '_normalize = ' + \
-            str(int(layer.get_config()['normalize'])) + '; \n'
+            str(int(cfg['normalize'])) + '; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_Embedding(self, layer):
+    def _write_weights_Embedding(self, layer, skip_outputs=False):
         nm = layer.name
-        self._write_outputs(layer)
+        if not skip_outputs:
+            self._write_outputs(layer)
         kernel = layer.get_weights()[0]
         self._write_weights_array2c(kernel, nm+'_kernel')
         self.stack_vars += '\n\n'
 
-    def _write_weights_UpSampling1D(self, layer):
+    def _write_weights_UpSampling1D(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
         nm = layer.name
-        self._write_outputs(layer)
-        size = layer.get_config()['size']
+        if not skip_outputs:
+            self._write_outputs(layer)
+        size = cfg['size']
         self.stack_vars += 'size_t ' + nm + '_size = ' + str(size) + '; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_UpSampling2D(self, layer):
+    def _write_weights_UpSampling2D(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
         nm = layer.name
-        self._write_outputs(layer)
-        size = layer.get_config()['size']
+        if not skip_outputs:
+            self._write_outputs(layer)
+        size = cfg['size']
         self.stack_vars += 'size_t ' + nm + '_size[2] = {' + str(size[0]) + \
             ',' + str(size[1]) + '}; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_UpSampling3D(self, layer):
+    def _write_weights_UpSampling3D(self, layer, skip_outputs=False):
+        cfg = layer.get_config()
         nm = layer.name
-        self._write_outputs(layer)
-        size = layer.get_config()['size']
+        if not skip_outputs:
+            self._write_outputs(layer)
+        size = cfg['size']
         self.stack_vars += 'size_t ' + nm + '_size[3] = {' + str(size[0]) + \
             ',' + str(size[1]) + ',' + str(size[2]) + '}; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_Cropping1D(self, layer):
+    def _write_weights_Cropping1D(self, layer, skip_outputs=False):
         nm = layer.name
-        self._write_outputs(layer)
-        crop_top = layer.get_config()['cropping'][0]
-        crop_bottom = layer.get_config()['cropping'][1]
+        if not skip_outputs:
+            self._write_outputs(layer)
+        cfg = layer.get_config()
+        crop_top = cfg['cropping'][0]
+        crop_bottom = cfg['cropping'][1]
         self.stack_vars += 'size_t ' + nm + '_crop[2] = {' + str(crop_top) + ','\
             + str(crop_bottom) + '}; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_Cropping2D(self, layer):
+    def _write_weights_Cropping2D(self, layer, skip_outputs=False):
         nm = layer.name
-        self._write_outputs(layer)
-        crop_top = layer.get_config()['cropping'][0][0]
-        crop_bottom = layer.get_config()['cropping'][0][1]
-        crop_left = layer.get_config()['cropping'][1][0]
-        crop_right = layer.get_config()['cropping'][1][1]
+        if not skip_outputs:
+            self._write_outputs(layer)
+        cfg = layer.get_config()
+        crop_top = cfg['cropping'][0][0]
+        crop_bottom = cfg['cropping'][0][1]
+        crop_left = cfg['cropping'][1][0]
+        crop_right = cfg['cropping'][1][1]
         self.stack_vars += 'size_t ' + nm + '_crop[4] = {' + str(crop_top) + ','\
             + str(crop_bottom) + ',' + str(crop_left) + \
             ',' + str(crop_right) + '}; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_Cropping3D(self, layer):
+    def _write_weights_Cropping3D(self, layer, skip_outputs=False):
         nm = layer.name
-        self._write_outputs(layer)
-        crop0 = layer.get_config()['cropping'][0][0]
-        crop1 = layer.get_config()['cropping'][0][1]
-        crop2 = layer.get_config()['cropping'][1][0]
-        crop3 = layer.get_config()['cropping'][1][1]
-        crop4 = layer.get_config()['cropping'][2][0]
-        crop5 = layer.get_config()['cropping'][2][1]
+        if not skip_outputs:
+            self._write_outputs(layer)
+        cfg = layer.get_config()
+        crop0 = cfg['cropping'][0][0]
+        crop1 = cfg['cropping'][0][1]
+        crop2 = cfg['cropping'][1][0]
+        crop3 = cfg['cropping'][1][1]
+        crop4 = cfg['cropping'][2][0]
+        crop5 = cfg['cropping'][2][1]
         self.stack_vars += 'size_t ' + nm + '_crop[6] = {' + str(crop0) + ','\
             + str(crop1) + ',' + str(crop2) + ',' + str(crop3) + \
             ',' + str(crop4) + ',' + str(crop5) + '}; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_ZeroPadding1D(self, layer):
+    def _write_weights_ZeroPadding1D(self, layer, skip_outputs=False):
         nm = layer.name
-        self._write_outputs(layer)
-        pad_top = layer.get_config()['padding'][0]
-        pad_bottom = layer.get_config()['padding'][1]
+        if not skip_outputs:
+            self._write_outputs(layer)
+        cfg = layer.get_config()
+        pad_top = cfg['padding'][0]
+        pad_bottom = cfg['padding'][1]
         self.stack_vars += 'size_t ' + nm + '_pad[2] = {' + str(pad_top) + ','\
             + str(pad_bottom) + '}; \n'
         self.stack_vars += 'float ' + nm + '_fill = 0.0f; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_ZeroPadding2D(self, layer):
+    def _write_weights_ZeroPadding2D(self, layer, skip_outputs=False):
         nm = layer.name
-        self._write_outputs(layer)
-        pad_top = layer.get_config()['padding'][0][0]
-        pad_bottom = layer.get_config()['padding'][0][1]
-        pad_left = layer.get_config()['padding'][1][0]
-        pad_right = layer.get_config()['padding'][1][1]
+        if not skip_outputs:
+            self._write_outputs(layer)
+        cfg = layer.get_config()
+        pad_top = cfg['padding'][0][0]
+        pad_bottom = cfg['padding'][0][1]
+        pad_left = cfg['padding'][1][0]
+        pad_right = cfg['padding'][1][1]
         self.stack_vars += 'size_t ' + nm + '_pad[4] = {' + str(pad_top) + ','\
             + str(pad_bottom) + ',' + str(pad_left) + \
             ',' + str(pad_right) + '}; \n'
         self.stack_vars += 'float ' + nm + '_fill = 0.0f; \n'
         self.stack_vars += '\n\n'
 
-    def _write_weights_ZeroPadding3D(self, layer):
+    def _write_weights_ZeroPadding3D(self, layer, skip_outputs=False):
         nm = layer.name
-        self._write_outputs(layer)
-        pad0 = layer.get_config()['padding'][0][0]
-        pad1 = layer.get_config()['padding'][0][1]
-        pad2 = layer.get_config()['padding'][1][0]
-        pad3 = layer.get_config()['padding'][1][1]
-        pad4 = layer.get_config()['padding'][2][0]
-        pad5 = layer.get_config()['padding'][2][1]
+        if not skip_outputs:
+            self._write_outputs(layer)
+        cfg = layer.get_config()
+        pad0 = cfg['padding'][0][0]
+        pad1 = cfg['padding'][0][1]
+        pad2 = cfg['padding'][1][0]
+        pad3 = cfg['padding'][1][1]
+        pad4 = cfg['padding'][2][0]
+        pad5 = cfg['padding'][2][1]
         self.stack_vars += 'size_t ' + nm + '_pad[6] = {' + str(pad0) + ','\
             + str(pad1) + ',' + str(pad2) + ',' + str(pad3) + \
             ',' + str(pad4) + ',' + str(pad5) + '}; \n'
@@ -834,9 +1032,9 @@ def _write_weights_SpatialDropout3D(self, layer):
         pass
 
     def _write_weights_Flatten(self, layer):
-        _, outputs = get_layer_io_names(layer)
+        _, outputs = get_layer_io_names(layer, self.valid_tensors)
         for i, outp in enumerate(outputs):
-            inshp = layer.get_input_at(i).shape[1:]
+            inshp = layer.input.shape[1:]
             if outp not in self.model_io[1]:
                 self._write_weights_array2c(
                     np.zeros(inshp).flatten(), outp + '_output')
@@ -848,3 +1046,34 @@ def _write_weights_Activation(self, layer):
     def _write_weights_Dropout(self, layer):
         # no weights needed
         pass
+
+    def _write_weights_TFOpLambda(self, layer):
+        self._write_weights_TensorFlowOpLayer(layer)
+
+    def _write_weights_TensorFlowOpLayer(self, layer, skip_outputs=False):
+        # Special case when tf.split is used
+        # no weights needed
+        if 'split' in layer.name:
+            if not skip_outputs:
+                self._write_outputs(layer)
+        else:
+            raise AssertionError('Unsupported TensorFlowOpLayer: ' + layer.name + '\n'
+                                 + 'Currently only split operation is supported.')
+
+    def _write_weights_Split(self, layer, skip_outputs=False):
+        # Split layer (Keras 3 keras.ops.split) - no weights needed
+        if not skip_outputs:
+            _, outputs = get_layer_io_names(layer, self.valid_tensors)
+            if isinstance(outputs, list):
+                for i, outp in enumerate(outputs):
+                    if isinstance(outp, list):
+                        # Multi-output node: get shapes from inbound node tensors
+                        node = layer._inbound_nodes[i]
+                        out_tensors = getattr(node, 'output_tensors', [])
+                        if not isinstance(out_tensors, (list, tuple)):
+                            out_tensors = [out_tensors]
+                        for j, outpp in enumerate(outp):
+                            outshp = out_tensors[j].shape[1:]
+                            if outpp not in self.model_io[1]:
+                                self._write_weights_array2c(
+                                    np.zeros(outshp), outpp + '_output')
diff --git a/requirements.txt b/requirements.txt
index 07d79fa..54e4728 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,2 @@
-# This is an implicit value, here for clarity
---index-url https://pypi.python.org/simple/
-
-numpy >= 1.13.0
-tensorflow >= 2.0
-
+tensorflow
+keras >= 3.0.0
diff --git a/tests/numpy_conv1d.py b/tests/numpy_conv1d.py
new file mode 100644
index 0000000..0a2e8a9
--- /dev/null
+++ b/tests/numpy_conv1d.py
@@ -0,0 +1,587 @@
+"""
+Numpy implementation of 1D convolution and transposed convolution of keras.Conv1D
+and keras.Conv1DTranspose
+
+The idea is to obtain the same output as keras.Conv1D and keras.Conv1DTranspose in
+simple numpy code so that it can be implemented in C code.
+
+Inspired by the following code:
+https://github.com/rmwkwok/transposed_convolution_in_numpy/tree/main
+"""
+
+import numpy as np
+import keras
+import traceback
+import time
+
+
+def out_shape(input_shape: tuple[int, int],
+              kernel: np.ndarray,
+              strides: int = 1,
+              padding: str = "valid",
+              mode: str = "normal"
+              ) -> tuple[int, int]:
+    """
+    Compute the output shape of the convolution operation.
+
+    Parameters
+    ----------
+    input_shape : tuple[int, int]
+        Shape of the input tensor without batch size. (height, channels)
+    kernel: numpy.ndarray
+        Kernel of the convolution operation.
+        Returned by model.layer[i].get_weights()[0] for a keras model.
+    strides : int
+        Strides of the convolution operation
+    padding : str
+        Padding of the convolution operation. 'valid' or 'same' are supported.
+    mode : str
+    Mode of the convolution operation.
+    'normal' means normal convolution and 'transposed' means transposed convolution.
+
+    Returns
+    -------
+    Tuple[int, int]
+    Shape of the output tensor without batch size.
+    (no of convolutions, no of filters)
+    """
+    ih, ic = input_shape
+    if mode == "normal":
+        ksize, kc, n_filters = kernel.shape
+        assert ic == kc
+        if padding == "valid":
+            p_2 = 0
+        elif padding == "same":
+            p_2 = ksize - 1
+        nconv = (ih + p_2 - ksize) // strides + 1
+    elif mode == "transposed":
+        ksize, n_filters, kc = kernel.shape
+        assert ic == kc
+        if padding == "valid":
+            p_2 = 0
+        elif padding == "same":
+            p_2 = ksize - strides
+        nconv = ih - p_2 + ksize - 1 + ((ih - 1) * (strides - 1))
+        # if padding == 'valid':
+        #     nconv = ih + ksize - 1 + ((ih - 1) * (strides - 1))
+        # elif padding == 'same':
+        #     nconv = ih + (ih * (strides - 1))
+    return nconv, n_filters
+
+
+def padding1d(inputs : np.ndarray,
+              kernel: np.ndarray,
+              strides: int = 1,
+              padding: str = "valid",
+              mode: str = "normal"
+              ) -> np.ndarray:
+    """
+    Compute the padded input tensor for the convolution operation and crops the output
+    tensor for the transposed convolution operation.
+
+    Parameters
+    ----------
+    inputs : numpy.ndarray
+        Input tensor with batch size. (batch size, height, channels)
+    kernel : numpy.ndarray
+        Kernel of the convolution operation.
+        Returned by model.layer[i].get_weights()[0] for a keras model.
+    strides : int
+        Strides of the convolution operation
+    padding : str
+        Padding of the convolution operation. 'valid' or 'same' are supported.
+    mode : str
+        Mode of the convolution operation.
+        'normal' means normal convolution and 'transposed' means transposed convolution.
+
+    Returns
+    -------
+    numpy.ndarray
+    Padded input tensor with batch size.
+    (batch size, height, channels) or cropped output tensor with batch size.
+    (batch size, convolutions, filters)
+    """
+    ksize = kernel.shape[0]
+    if padding == "valid":
+        return inputs
+    elif padding == "same":
+        if mode == "normal":
+            p_2 = ksize - 1
+        elif mode == "transposed":
+            p_2 = ksize - strides
+    if p_2 % 2 == 0:
+        before = after = p_2 // 2
+    else:
+        before = (p_2 - 1) // 2
+        after = before + 1
+    if mode == "normal":
+        outputs = np.zeros((inputs.shape[0], inputs.shape[1] + p_2, inputs.shape[2]))
+        outputs[:, before : before + inputs.shape[1], :] = inputs
+    elif mode == "transposed":
+        outputs = inputs[:, before : inputs.shape[1] - after, :]
+    return outputs
+
+
+def stride1d(inputs: np.ndarray,
+             kernel: np.ndarray,
+             strides: int = 1,
+             padding: str = "valid",
+             mode: str = "normal"
+             ) -> np.ndarray:
+    """
+    Compute the strided output tensor for the convolution operation
+    or zeros inserted (unstrided) input tensor for the transposed convolution operation.
+
+    Parameters
+    ----------
+    inputs : numpy.ndarray
+        Input tensor with batch size. (batch size, height, channels)
+    kernel : numpy.ndarray
+        Kernel of the convolution operation.
+        Returned by model.layer[i].get_weights()[0] for a keras model.
+    strides  : int
+        Strides of the convolution operation
+    padding : str
+        Padding of the convolution operation. 'valid' or 'same' are supported.
+    mode : str
+        Mode of the convolution operation.
+        'normal' means normal convolution and 'transposed' means transposed convolution.
+
+    Returns
+    -------
+    numpy.ndarray
+        Strided output tensor with batch size. (batch size, convolutions, filters)
+        or unstrided input tensor with batch size. (batch size, height, channels).
+    """
+    # if strides == 1:
+    #     return inputs
+    ksize = kernel.shape[0]
+    if mode == "normal":
+        if padding == "valid":
+            outputs = inputs[:, ::strides, :]
+        elif padding == "same":
+            total_skip = (inputs.shape[1] - 1) % strides
+            if total_skip % 2 == 0:
+                start_ind = total_skip // 2
+            else:
+                start_ind = total_skip // 2 + (ksize % 2)
+            outputs = inputs[:, start_ind::strides, :]
+    elif mode == "transposed":
+        outputs = np.zeros(
+            (
+                inputs.shape[0],
+                inputs.shape[1] + (inputs.shape[1] - 1) * (strides - 1),
+                inputs.shape[2],
+            )
+        )
+        outputs[:, ::strides, :] = inputs
+    return outputs
+
+
+def unroll_kernel1d(input_shape: tuple[int, int],
+                    kernel: np.ndarray,
+                    strides: int = 1,
+                    padding: str = "valid",
+                    mode: str = "normal"
+                    ) -> np.ndarray:
+    """
+    Get the unrolled kernel for the convolution or transposed convolution operations.
+
+    Parameters
+    ----------
+    input_shape : tuple[int, int]
+        Shape of the input tensor without batch size. (height, channels)
+    kernel : numpy.ndarray
+        Kernel of the convolution operation.
+        Returned by model.layer[i].get_weights()[0] for a keras model.
+    strides : int
+        Strides of the convolution operation
+    padding : str
+        Padding of the convolution operation. 'valid' or 'same' are supported.
+    mode : str
+        Mode of the convolution operation. 'normal' means normal convolution and
+        'transposed' means transposed convolution.
+
+    Returns
+    -------
+    numpy.npdarray
+        In normal mode: unrolled kernel for the convolution operation.
+            (channels, no of convolutions, filters, padded input size)
+        In transposed mode: unrolled kernel for the transposed convolution operation.
+            (channels, zero inserted input size + kernel size - 1, filters,
+             zero inserted input size)
+
+    Note
+    ----
+    In current implementation, the unrolled kernel in normal mode would only work for
+    strides=1 case and it is assumed that the striding is done after the convolution
+    operation.
+
+    In current implementation, the unrolled kernel in transposed mode would only work
+    for padding='valid' case and it is assumed that any cropping for padding='same'
+    is done after the transposed convolution operation.
+
+    """
+    nconv, nfilters = out_shape(
+        input_shape, kernel, strides=strides, padding=padding, mode=mode
+    )
+    nh, nc = input_shape
+    ksize = kernel.shape[0]
+    if mode == "normal":
+        if padding == "valid":
+            p_2 = 0
+        elif padding == "same":
+            p_2 = ksize - 1
+        total_inp_size = nh + p_2
+    elif mode == "transposed":
+        total_inp_size = nh + (nh - 1) * (strides - 1)
+    unrolled_kernel = np.zeros((nc, nconv, nfilters, total_inp_size), dtype=np.float32)
+    if mode == "normal":
+        if padding == "valid":
+            start_ind = 0
+        elif padding == "same":
+            # nconv_stride1 = ih + p_2 - ksize + 1 = total_inp_size - ksize + 1
+            # total_skip = (nconv_stride1 - 1) % strides
+            total_skip = (total_inp_size - ksize) % strides
+            if total_skip % 2 == 0:
+                start_ind = total_skip // 2
+            else:
+                start_ind = total_skip // 2 + (ksize % 2)
+        for ch in range(nc):
+            for i in range(nconv):
+                # Go through only those convs which are not removed in stride1d later
+                i_s = start_ind + i * strides
+                for f in range(nfilters):
+                    if mode == "normal":
+                        unrolled_kernel[ch, i, f, i_s : i_s + ksize] += kernel[:, ch, f]
+        return unrolled_kernel
+    elif mode == "transposed":
+        # if cropping was done later, start_ind is the start point in cropping
+        if padding == "valid":
+            start_ind = 0
+        elif padding == "same":
+            out_crop = ksize - strides
+            start_ind = out_crop // 2
+        for ch in range(nc):
+            for f in range(nfilters):
+                for i in range(total_inp_size):
+                    cropped_start = max(0, i - start_ind)
+                    cropped_end = min(nconv, i + ksize - start_ind)
+                    ker_start = cropped_start - (i - start_ind)
+                    ker_end = ker_start + cropped_end - cropped_start
+                    unrolled_kernel[ch, cropped_start:cropped_end, f, i] += kernel[
+                        ker_start:ker_end, f, ch
+                    ]
+    return unrolled_kernel
+
+
+def conv1d(inputs: np.ndarray,
+           kernel: np.ndarray,
+           strides: int = 1,
+           padding: str = "valid"
+           ) -> np.ndarray:
+    """
+    Compute the output tensor of the convolution operation.
+
+    Parameters
+    ----------
+    inputs : numpy.ndarray
+        Input tensor with batch size. (batch size, height, channels)
+    kernel : numpy.ndarray
+        Kernel of the convolution operation.
+        Returned by model.layer[i].get_weights()[0] for a keras model.
+    strides : int
+        Strides of the convolution operation
+    padding : str
+        Padding of the convolution operation. 'valid' or 'same' are supported.
+
+    Returns
+    -------
+    numpy.ndarray
+        Output tensor with batch size. (batch size, convolutions, filters)
+    """
+    n_batches, n_height, n_channels = inputs.shape
+    outputs = np.zeros(
+        (n_batches, *out_shape(
+            (n_height, n_channels),
+             kernel,
+             strides=strides,
+             padding=padding,
+             mode="normal"),
+         )
+    )
+    padded_input = padding1d(inputs, kernel, strides=strides, padding=padding,
+                             mode="normal")
+    unrolled_kernel = unroll_kernel1d((n_height, n_channels), kernel,
+                                      strides=strides, padding=padding, mode="normal")
+    _, n_convs, _, n_padded_inp = unrolled_kernel.shape
+    assert n_padded_inp == padded_input.shape[1]
+    for batch in range(n_batches):
+        for ch in range(n_channels):
+            flat_inp = np.zeros((n_padded_inp, 1))
+            flat_inp[:, 0] = padded_input[batch, :, ch]
+            for i in range(n_convs):
+                outputs[batch, i, :] += np.matmul(
+                    unrolled_kernel[ch, i, :, :], flat_inp
+                ).flatten()
+    return outputs
+    # return stride1d(outputs, kernel, strides=strides, padding=padding, mode='normal')
+    # Needed when unrolled kernel is first calculated with strides=1
+
+
+def conv1d_transpose(inputs: np.ndarray,
+                     kernel: np.ndarray,
+                     strides: int = 1,
+                     padding: str = "valid"
+                     ) -> np.ndarray:
+    """
+    Compute the output tensor of the transposed convolution operation.
+
+    Parameters
+    ----------
+    inputs : numpy.ndarray
+        Input tensor with batch size. (batch size, height, channels)
+    kernel : numpy.ndarray
+        Kernel of the transposed convolution operation.
+        Returned by model.layer[i].get_weights()[0] for a keras model.
+    strides : int
+        Strides of the transposed convolution operation
+    padding : str
+        Padding of the transposed convolution operation.
+        'valid' or 'same' are supported.
+
+    Returns
+    -------
+    numpy.ndarray:
+        Output tensor with batch size. (batch size, height, channels)
+    """
+    n_batches, n_height, n_channels = inputs.shape
+    outputs = np.zeros(
+        (
+            n_batches,
+            *out_shape(
+                (n_height, n_channels),
+                kernel,
+                strides=strides,
+                padding=padding,
+                mode="transposed",
+            ),
+        )
+    )
+    strided_input = stride1d(
+        inputs, kernel, strides=strides, padding=padding, mode="transposed"
+    )
+    unrolled_kernel = unroll_kernel1d(
+        (n_height, n_channels),
+        kernel,
+        strides=strides,
+        padding=padding,
+        mode="transposed",
+    )
+    _, _, n_filters, n_strided_inp = unrolled_kernel.shape
+    assert n_strided_inp == strided_input.shape[1]
+    for batch in range(n_batches):
+        for ch in range(n_channels):
+            flat_inp = np.zeros((n_strided_inp, 1))
+            flat_inp[:, 0] = strided_input[batch, :, ch]
+            for f in range(n_filters):
+                outputs[batch, :, f] += np.matmul(
+                    unrolled_kernel[ch, :, f, :], flat_inp
+                ).flatten()
+    return outputs
+    # padding1d(outputs, kernel, strides=strides, padding=padding, mode='transposed')
+    # Needed when unrolled kernel is first calculated with padding='valid'
+
+
+def conv1d_transpose_direct(inputs: np.ndarray,
+                            kernel: np.ndarray,
+                            strides: int = 1,
+                            padding: str = "valid"
+                            ) -> np.ndarray:
+    """
+    True flat array based direct implementation of transposed convolution operation.
+    This function can be converted to C code directly.
+
+    Parameters
+    ----------
+    inputs :numpy.ndarray
+        Input tensor with batch size. (batch size, height, channels)
+    kernel : numpy.ndarray
+        Kernel of the transposed convolution operation.
+        Returned by model.layer[i].get_weights()[0] for a keras model.
+    strides : int
+        Strides of the transposed convolution operation
+    padding : str
+        Padding of the transposed convolution operation.
+        'valid' or 'same' are supported.
+
+    Returns
+    -------
+    numpy.ndarray
+        Output tensor with batch size.
+        (batch size, number of convolutions, number of filters)
+    """
+    n_batches, n_height, n_channels = inputs.shape
+    ksize, n_filters, kc = kernel.shape
+    outputs = np.zeros(
+        (
+            n_batches,
+            *out_shape(
+                (n_height, n_channels),
+                kernel,
+                strides=strides,
+                padding=padding,
+                mode="transposed",
+            ),
+        )
+    )
+    _, n_conv, _ = outputs.shape
+    if padding == "valid":
+        p_2 = 0
+    elif padding == "same":
+        p_2 = ksize - strides
+    si = p_2 // 2
+    output_flat = np.zeros((n_batches * n_conv * n_filters))
+    k_flat = kernel.flatten(order="C")
+    i_flat = inputs.flatten(order="C")
+    for b in range(n_batches):
+        for f in range(n_filters):
+            for ch in range(n_channels):
+                for t in range(n_height):
+                    if t * strides > si:
+                        cs = t * strides - si
+                    else:
+                        cs = 0
+                    if t * strides + ksize - si > n_conv:
+                        ce = n_conv
+                    else:
+                        ce = t * strides + ksize - si
+                    ks = cs - (t * strides - si)
+                    for i in range(0, ce - cs):
+                        output_flat[
+                            b * n_conv * n_filters + (cs + i) * n_filters + f
+                        ] += (
+                            k_flat[(i + ks) * n_filters * kc + f * kc + ch]
+                            * i_flat[b * n_height * n_channels + t * n_channels + ch]
+                        )
+    return output_flat.reshape((n_batches, n_conv, n_filters))
+    # return outputs
+
+
+if __name__ == "__main__":
+    mn = 0
+    mt = 0
+    pv = 0
+    ps = 0
+    for _ in range(50):
+        nb = np.random.randint(1, 10)
+        nh = np.random.randint(2, 50)
+        nc = np.random.randint(1, 50)
+        nf = np.random.randint(1, 50)
+        nk = np.random.randint(1, nh)
+        strides = np.random.randint(1, max(nk, 2))
+        if np.random.randint(2):
+            padding = "valid"
+            pv += 1
+        else:
+            padding = "same"
+            ps += 1
+
+        if np.random.randint(2):
+            mode = "normal"
+            mn += 1
+        else:
+            mode = "transposed"
+            mt += 1
+
+        t_a = keras.layers.Input((nh, nc))
+        if mode == "normal":
+            t_b = keras.layers.Conv1D(
+                filters=nf, kernel_size=nk, padding=padding, strides=strides
+            )(t_a)
+        elif mode == "transposed":
+            t_b = keras.layers.Conv1DTranspose(
+                filters=nf, kernel_size=nk, padding=padding, strides=strides
+            )(t_a)
+        mod = keras.models.Model(inputs=t_a, outputs=t_b)
+        ker = mod.layers[1].get_weights()[0]
+        inp = np.random.random((nb, nh, nc))
+        out = mod.predict(inp)
+        try:
+            calc_out_shape = out_shape(
+                (nh, nc), ker, padding=padding, strides=strides, mode=mode
+            )
+        except BaseException:
+            print("-" * 50)
+            print("Python Error in calculating output shape")
+            traceback.print_exc()
+            error = True
+
+        error = False
+        if calc_out_shape != out.shape[1:]:
+            print("-" * 50)
+            print("Error in calculating output shape")
+            error = True
+
+        try:
+            t_start = time.time()
+            if mode == "normal":
+                out2 = conv1d(inp, ker, strides=strides, padding=padding)
+            elif mode == "transposed":
+                out2 = conv1d_transpose(inp, ker, strides=strides, padding=padding)
+            t_end = time.time()
+            print(
+                "Numpy version time taken:                  {:.0f} ms".format(
+                    (t_end - t_start) * 1e3
+                )
+            )
+            if mode == "transposed":
+                t_d_start = time.time()
+                out3 = conv1d_transpose_direct(
+                    inp, ker, strides=strides, padding=padding
+                )
+                t_d_end = time.time()
+                print(
+                    "Direct version time taken:                {:.0f} ms".format(
+                        (t_d_end - t_d_start) * 1e3
+                    )
+                )
+
+        except BaseException:
+            print("-" * 50)
+            print("Python Error in calculating output")
+            traceback.print_exc()
+            error = True
+
+        if out.shape != out2.shape:
+            print("-" * 50)
+            print("Error in returned output shape")
+            error = True
+        elif np.abs(out - out2).max() > 3e-6:
+            print("-" * 50)
+            print("Error in output values")
+            error = True
+        elif mode == "transposed":
+            if np.abs(out - out3).max() > 3e-6:
+                print("-" * 50)
+                print("Error in direct output values")
+                error = True
+
+        if error:
+            print("Batch size:", nb)
+            print("Input shape:", (nh, nc))
+            print("Number of filters:", nf)
+            print("Kernel size:", nk)
+            print("Strides:", strides)
+            print("Padding:", padding)
+            print("Mode:", mode)
+            print("Expected output shape:", out.shape[1:])
+            print("Calculated output shape:", calc_out_shape)
+            print("Returned output shape:", out2.shape[1:])
+            print('Output mismatch error:', np.abs(out - out2).max())
+            if mode == 'transposed':
+                print('Direct output mismatch error:', np.abs(out - out3).max(), np.abs(out2 - out3).max())
+            break
+    if not error:
+        print("Tested with {} normal, {} transposed, {} valid and {} same padding cases".format(mn, mt, pv, ps))
+        print('All tests passed!')
diff --git a/tests/test_advanced_activation_layers.py b/tests/test_advanced_activation_layers.py
index c9e81aa..cb36c6e 100644
--- a/tests/test_advanced_activation_layers.py
+++ b/tests/test_advanced_activation_layers.py
@@ -6,14 +6,20 @@
 #!/usr/bin/env python3
 
 import unittest
-import tensorflow.keras as keras
+import keras
 from keras2c import keras2c_main
-import subprocess
 import time
-import os
 from test_core_layers import build_and_run
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
+
+
+def _has_activation(name):
+    """True if the installed Keras recognizes the given activation identifier."""
+    try:
+        keras.activations.get(name)
+        return True
+    except (ValueError, TypeError):
+        return False
+
 
 __author__ = "Rory Conlin"
 __copyright__ = "Copyright 2020, Rory Conlin"
@@ -25,6 +31,16 @@
 class TestAdvancedActivation(unittest.TestCase):
     """tests for advanced activation layers"""
 
+    def test_swish(self):
+        inshp = (9, 7, 6, 3)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('swish')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___swish' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
     def test_LeakyReLU(self):
         inshp = (9, 7, 6, 3)
         alpha = 0.5
@@ -61,7 +77,7 @@ def test_ThresholdedReLU(self):
         inshp = (3, 6, 19, 11)
         theta = 0.3
         a = keras.layers.Input(inshp)
-        b = keras.layers.ThresholdedReLU(theta=theta)(a)
+        b = keras.layers.ReLU(threshold=theta)(a)
         model = keras.models.Model(inputs=a, outputs=b)
         name = 'test___ThresholdedReLU' + str(int(time.time()))
         keras2c_main.k2c(model, name)
@@ -82,3 +98,162 @@ def test_ReLU(self):
         keras2c_main.k2c(model, name)
         rcode = build_and_run(name)
         self.assertEqual(rcode, 0)
+
+    def test_ReLU_after_Dense(self):
+        # Regression: advanced-activation layer fed by a non-input layer must
+        # emit `tensor.array`, not `&tensor.array`, in the generated C call.
+        a = keras.layers.Input((4,))
+        b = keras.layers.Dense(8)(a)
+        c = keras.layers.ReLU()(b)
+        model = keras.models.Model(inputs=a, outputs=c)
+        name = 'test___ReLU_after_Dense' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    def test_selu(self):
+        inshp = (8, 6, 5)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('selu')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___SILU' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    def test_elu_activation(self):
+        inshp = (10, 7, 4)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Dense(12, activation='elu')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___elu_act' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    def test_gelu(self):
+        inshp = (7, 11, 3)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('gelu')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___gelu' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    @unittest.skipUnless(_has_activation('hard_silu'), "Keras < 3.x lacks 'hard_silu'")
+    def test_hard_silu(self):
+        inshp = (6, 9, 4)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('hard_silu')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___hard_silu' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    def test_mish(self):
+        inshp = (5, 8, 7)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('mish')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___mish' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    def test_relu6(self):
+        inshp = (10, 6, 3)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('relu6')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___relu6' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    def test_log_softmax(self):
+        inshp = (8, 12)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('log_softmax')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___log_softmax' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    def test_leaky_relu_activation(self):
+        inshp = (7, 5, 9)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Dense(8, activation='leaky_relu')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___leaky_relu_act' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    @unittest.skipUnless(_has_activation('celu'), "Keras version lacks 'celu'")
+    def test_celu(self):
+        inshp = (6, 10, 4)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('celu')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___celu' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    @unittest.skipUnless(_has_activation('hard_tanh'), "Keras version lacks 'hard_tanh'")
+    def test_hard_tanh(self):
+        inshp = (9, 7, 5)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('hard_tanh')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___hard_tanh' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    @unittest.skipUnless(_has_activation('hard_shrink'), "Keras version lacks 'hard_shrink'")
+    def test_hard_shrink(self):
+        inshp = (8, 6, 3)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('hard_shrink')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___hard_shrink' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    @unittest.skipUnless(_has_activation('soft_shrink'), "Keras version lacks 'soft_shrink'")
+    def test_soft_shrink(self):
+        inshp = (7, 5, 4)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('soft_shrink')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___soft_shrink' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    @unittest.skipUnless(_has_activation('squareplus'), "Keras version lacks 'squareplus'")
+    def test_squareplus(self):
+        inshp = (5, 9, 3)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('squareplus')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___squareplus' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    @unittest.skipUnless(_has_activation('sparse_plus'), "Keras version lacks 'sparse_plus'")
+    def test_sparse_plus(self):
+        inshp = (6, 8, 4)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.Activation('sparse_plus')(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___sparse_plus' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
diff --git a/tests/test_checks.py b/tests/test_checks.py
index a5f9108..6af8424 100644
--- a/tests/test_checks.py
+++ b/tests/test_checks.py
@@ -6,13 +6,10 @@
 #!/usr/bin/env python3
 
 import unittest
-import tensorflow.keras as keras
+import keras
 from keras2c import keras2c_main
-import subprocess
-import time
 import numpy as np
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
+
 
 __author__ = "Rory Conlin"
 __copyright__ = "Copyright 2020, Rory Conlin"
@@ -32,7 +29,7 @@ def test_is_model(self):
     def test_is_valid_cname(self):
         inshp = (10, 8)
         name = 'foobar'
-        a = keras.layers.Input(inshp, name='f/oo')
+        a = keras.layers.Input(inshp, name='1foo')
         b = keras.layers.Dense(10)(a)
         model = keras.models.Model(inputs=a, outputs=b)
         self.assertRaises(AssertionError, keras2c_main.k2c, model, name)
diff --git a/tests/test_convolution_layers.py b/tests/test_convolution_layers.py
index 2952125..e5b8c18 100644
--- a/tests/test_convolution_layers.py
+++ b/tests/test_convolution_layers.py
@@ -6,21 +6,22 @@
 #!/usr/bin/env python3
 
 import unittest
-import tensorflow.keras as keras
+import keras
 from keras2c import keras2c_main
-import subprocess
 import time
-import os
 from test_core_layers import build_and_run
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
 
-__author__ = "Rory Conlin"
-__copyright__ = "Copyright 2020, Rory Conlin"
-__license__ = "MIT"
-__maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
-__email__ = "wconlin@princeton.edu"
 
+# Original author
+# __author__ = "Rory Conlin"
+# __copyright__ = "Copyright 2020, Rory Conlin"
+# __license__ = "MIT"
+# __maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
+# __email__ = "wconlin@princeton.edu"
+
+# Modified by
+__author__ = "Anchal Gupta"
+__email__ = "guptaa@fusion.gat.com"
 
 class TestConvolutionLayers(unittest.TestCase):
     """tests for convolution layers"""
@@ -222,7 +223,10 @@ def test_ZeroPad3D(self):
         pad_back = 4
         a = keras.layers.Input(inshp)
         b = keras.layers.ZeroPadding3D(
-            padding=((pad_top, pad_bottom), (pad_left, pad_right), (pad_front, pad_back)))(a)
+            padding=((pad_top, pad_bottom),
+                     (pad_left, pad_right),
+                     (pad_front, pad_back))
+        )(a)
         model = keras.models.Model(inputs=a, outputs=b)
         name = 'test___ZeroPad3D' + str(int(time.time()))
         keras2c_main.k2c(model, name)
@@ -269,7 +273,10 @@ def test_Cropping3D(self):
         crop_back = 0
         a = keras.layers.Input(inshp)
         b = keras.layers.Cropping3D(
-            cropping=((crop_top, crop_bottom), (crop_left, crop_right), (crop_front, crop_back)))(a)
+            cropping=((crop_top, crop_bottom),
+                      (crop_left, crop_right),
+                      (crop_front, crop_back))
+        )(a)
         model = keras.models.Model(inputs=a, outputs=b)
         name = 'test___Cropping3D' + str(int(time.time()))
         keras2c_main.k2c(model, name)
diff --git a/tests/test_convtranspose_layers.py b/tests/test_convtranspose_layers.py
new file mode 100644
index 0000000..197a098
--- /dev/null
+++ b/tests/test_convtranspose_layers.py
@@ -0,0 +1,52 @@
+"""test_convolution_layers.py
+This file is part of the test suite for keras2c
+Implements tests for convolution layers
+"""
+
+#!/usr/bin/env python3
+
+import unittest
+import keras
+from keras2c import keras2c_main
+import time
+from test_core_layers import build_and_run
+
+import numpy as np
+
+__author__ = "Anchal Gupta"
+__copyright__ = "Copyright 2024, Anchal Gupta"
+__license__ = "MIT"
+__maintainer__ = "Anchal Gupta, https://github.com/anchal-physics/keras2c"
+__email__ = "guptaa@fusion.gat.com"
+
+
+class TestConvolutionTransposeLayers(unittest.TestCase):
+    """tests for convolution layers"""
+
+    def test_Conv1DTranspose1(self):
+        for tno in range (10):
+            nh = np.random.randint(2, 50)
+            nc = np.random.randint(1, 50)
+            nf = np.random.randint(1, 50)
+            nk = np.random.randint(1, nh)
+            strides = np.random.randint(1, max(nk, 2))
+            inshp = (nh, nc)
+            if tno % 2 == 0:
+                padding = 'valid'
+            else:
+                padding = 'same'
+            dilation_rate = 1
+            activation = None # 'relu'
+            a = keras.layers.Input(inshp)
+            b = keras.layers.Conv1DTranspose(filters=nf,
+                                             kernel_size=nk,
+                                             strides=strides,
+                                             padding=padding,
+                                             dilation_rate=dilation_rate,
+                                             activation=activation,
+                                             use_bias=False)(a)
+            model = keras.models.Model(inputs=a, outputs=b)
+            name = 'test___Conv1DTranspose1' + str(int(time.time()))
+            keras2c_main.k2c(model, name)
+            rcode = build_and_run(name)
+            self.assertEqual(rcode, 0)
diff --git a/tests/test_core_layers.py b/tests/test_core_layers.py
index 0bfb930..edd6b25 100644
--- a/tests/test_core_layers.py
+++ b/tests/test_core_layers.py
@@ -6,49 +6,82 @@
 #!/usr/bin/env python3
 
 import unittest
-import tensorflow.keras as keras
+import keras
 from keras2c import keras2c_main
 import subprocess
 import time
 import os
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
+import shutil
+import platform
 
-__author__ = "Rory Conlin"
-__copyright__ = "Copyright 2020, Rory Conlin"
-__license__ = "MIT"
-__maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
-__email__ = "wconlin@princeton.edu"
 
+# Original author
+# __author__ = "Rory Conlin"
+# __copyright__ = "Copyright 2020, Rory Conlin"
+# __license__ = "MIT"
+# __maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
+# __email__ = "wconlin@princeton.edu"
 
-CC = 'gcc'
-
+# Modified by
+__author__ = "Anchal Gupta"
+__email__ = "guptaa@fusion.gat.com"
 
 def build_and_run(name, return_output=False):
 
-    cwd = os.getcwd()
-    os.chdir(os.path.abspath('./include/'))
-    lib_code = subprocess.run(['make']).returncode
-    os.chdir(os.path.abspath(cwd))
-    if lib_code != 0:
-        return 'lib build failed'
+    CC = 'gcc'
+
+    repo_path = os.path.abspath(
+        os.path.join(os.path.dirname(os.path.abspath(__file__)), '../')
+    )
+    include_path = os.path.join(repo_path, './include/')
 
     if os.environ.get('CI'):
-        ccflags = '-g -Og -std=c99 --coverage -I./include/'
+        ccflags = '-g -Og -std=c99 --coverage '
     else:
-        ccflags = '-Ofast -std=c99 -I./include/'
+        ccflags = '-Ofast -std=c99 '
 
-    cc = CC + ' ' + ccflags + ' -o ' + name + ' ' + name + '.c ' + \
+    cwd = os.getcwd()
+    in_repo_root = cwd == repo_path
+    if not in_repo_root:
+        shutil.copytree(include_path, './include')
+        include_path = os.path.abspath('./include/')
+
+    if platform.system() == 'Linux':
+        cwd = os.getcwd()
+        os.chdir('./include')
+        lib_code = subprocess.run(['make']).returncode
+        os.chdir(os.path.abspath(cwd))
+        if lib_code != 0:
+            return 'lib build failed'
+
+        cc = CC + ' ' + ccflags + ' -o ' + name + ' ' + name + '.c ' + \
         name + '_test_suite.c -L./include/ -l:libkeras2c.a -lm'
+    elif platform.system() == 'Darwin':
+        inc_files = ' '.join(
+            [os.path.join(include_path, f)
+             for f in os.listdir(include_path) if f.endswith('.c')])
+        cc = CC + ' ' + ccflags + ' -o ' + name + ' ' + name + '.c ' + \
+            name + '_test_suite.c ' + inc_files
+
+    elif platform.system() == 'Windows':
+        if shutil.which('gcc'):
+            exe_name = f"{name}.exe"
+            inc_files = ' '.join(
+                os.path.join(include_path, f)
+                for f in os.listdir(include_path) if f.endswith('.c'))
+            cc = f"{CC} {ccflags} -o {exe_name} {name}.c {name}_test_suite.c {inc_files}"
+        else:
+            return 'gcc not found'
     build_code = subprocess.run(cc.split()).returncode
     if build_code != 0:
         return 'build failed'
     proc_output = subprocess.run(['./' + name])
     rcode = proc_output.returncode
     if rcode == 0:
-        if not os.environ.get('CI'):
-            subprocess.run('rm ' + name + '*', shell=True)
-            return (rcode, proc_output.stdout) if return_output else rcode
+        subprocess.run('rm ' + name + '*', shell=True)
+        if not in_repo_root:
+            shutil.rmtree('./include')
+        return (rcode, proc_output.stdout) if return_output else rcode
     return rcode
 
 
@@ -218,6 +251,40 @@ def test_BatchNorm4(self):
         rcode = build_and_run(name)
         self.assertEqual(rcode, 0)
 
+    def test_BatchNorm_negative_axis_3D(self):
+        inshp = (10, 11, 12)
+        axis = -1
+        init = keras.initializers.RandomUniform(minval=0.1, maxval=1.0)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.BatchNormalization(axis=axis,
+                                            beta_initializer=init,
+                                            gamma_initializer=init,
+                                            moving_mean_initializer=init,
+                                            moving_variance_initializer=init,
+                                            scale=True, center=True)(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___BatchNorm_negax3D' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
+    def test_BatchNorm_negative_axis_4D(self):
+        inshp = (10, 11, 12, 13)
+        axis = -1
+        init = keras.initializers.RandomUniform(minval=0.1, maxval=1.0)
+        a = keras.layers.Input(inshp)
+        b = keras.layers.BatchNormalization(axis=axis,
+                                            beta_initializer=init,
+                                            gamma_initializer=init,
+                                            moving_mean_initializer=init,
+                                            moving_variance_initializer=init,
+                                            scale=True, center=True)(a)
+        model = keras.models.Model(inputs=a, outputs=b)
+        name = 'test___BatchNorm_negax4D' + str(int(time.time()))
+        keras2c_main.k2c(model, name)
+        rcode = build_and_run(name)
+        self.assertEqual(rcode, 0)
+
 
 class TestSharedLayers(unittest.TestCase):
     """tests for shared layers"""
@@ -237,7 +304,3 @@ def test_SharedLayer1(self):
         keras2c_main.k2c(model, name)
         rcode = build_and_run(name)
         self.assertEqual(rcode, 0)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test_malloc.py b/tests/test_malloc.py
index 9813555..338a765 100644
--- a/tests/test_malloc.py
+++ b/tests/test_malloc.py
@@ -6,14 +6,11 @@
 #!/usr/bin/env python3
 
 import unittest
-import tensorflow.keras as keras
+import keras
 from keras2c import keras2c_main
-import subprocess
 import time
-import os
 from test_core_layers import build_and_run
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
+
 
 __author__ = "Rory Conlin"
 __copyright__ = "Copyright 2020, Rory Conlin"
@@ -61,7 +58,3 @@ def test_Malloc2(self):
         keras2c_main.k2c(model, name, malloc=True)
         rcode = build_and_run(name)
         self.assertEqual(rcode, 0)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test_merge_layers.py b/tests/test_merge_layers.py
index 7ab2e8b..4c34ba0 100644
--- a/tests/test_merge_layers.py
+++ b/tests/test_merge_layers.py
@@ -8,12 +8,9 @@
 import unittest
 import tensorflow.keras as keras
 from keras2c import keras2c_main
-import subprocess
 import time
-import os
 from test_core_layers import build_and_run
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
+
 
 __author__ = "Rory Conlin"
 __copyright__ = "Copyright 2020, Rory Conlin"
@@ -184,7 +181,3 @@ def test_Concatenate3(self):
         keras2c_main.k2c(model, name)
         rcode = build_and_run(name)
         self.assertEqual(rcode, 0)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test_models.py b/tests/test_models.py
index 6b68ff8..66e6f4d 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -6,19 +6,14 @@
 #!/usr/bin/env python3
 
 import unittest
-import tensorflow.keras as keras
-from tensorflow.keras import models
-from tensorflow.keras import layers
-from tensorflow.keras.layers import Input, Dense, LSTM, Conv1D, Conv2D, ConvLSTM2D, Dot, Add, Multiply, Concatenate, Reshape, Permute, ZeroPadding1D, Cropping1D
-from tensorflow.keras.models import Model
-import numpy as np
+import keras
+from keras import layers
+from keras.layers import Input, Dense, LSTM, Conv2D, Add, Concatenate, Reshape, Permute
+from keras.models import Model
 from keras2c import keras2c_main
-import subprocess
 import time
-import os
 from test_core_layers import build_and_run
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
+
 
 __author__ = "Rory Conlin"
 __copyright__ = "Copyright 2020, Rory Conlin"
@@ -151,30 +146,39 @@ def test_ProfilePredictorConv1D(self):
         prof_act = []
         for i in range(num_targets):
 
-            current_profiles_processed_1 = layers.Conv1D(filters=8, kernel_size=2,
-                                                         padding='same',
-                                                         activation='relu')(current_profiles_processed_0)
-            current_profiles_processed_2 = layers.Conv1D(filters=8, kernel_size=4,
-                                                         padding='same',
-                                                         activation='relu')(current_profiles_processed_1)
-            current_profiles_processed_3 = layers.Conv1D(filters=8, kernel_size=8,
-                                                         padding='same',
-                                                         activation='relu')(current_profiles_processed_2)
+            current_profiles_processed_1 = (
+                layers.Conv1D(
+                    filters=8, kernel_size=2, padding='same', activation='relu'
+                )(current_profiles_processed_0)
+            )
+            current_profiles_processed_2 = layers.Conv1D(
+                filters=8, kernel_size=4, padding='same', activation='relu'
+            )(current_profiles_processed_1)
+            current_profiles_processed_3 = layers.Conv1D(
+                filters=8, kernel_size=8, padding='same', activation='relu'
+            )(current_profiles_processed_2)
 
             final_output = layers.Concatenate()(
-                [current_profiles_processed_1, current_profiles_processed_2, current_profiles_processed_3])
-            final_output = layers.Conv1D(filters=10, kernel_size=4,
-                                         padding='same', activation='tanh')(final_output)
-            final_output = layers.Conv1D(filters=1, kernel_size=4,
-                                         padding='same', activation='linear')(final_output)
-            final_output = layers.Reshape(target_shape=(
-                profile_length,), name="target_"+target_profile_names[i])(final_output)
+                [current_profiles_processed_1,
+                 current_profiles_processed_2,
+                 current_profiles_processed_3]
+            )
+            final_output = layers.Conv1D(
+                filters=10, kernel_size=4, padding='same', activation='tanh'
+            )(final_output)
+            final_output = layers.Conv1D(
+                filters=1, kernel_size=4, padding='same', activation='linear'
+            )(final_output)
+            final_output = layers.Reshape(
+                target_shape=(profile_length,), name="target_"+target_profile_names[i]
+            )(final_output)
 
             prof_act.append(final_output)
         print(len(prof_act))
 
-        model = Model(inputs=profile_inputs + actuator_past_inputs +
-                      actuator_future_inputs, outputs=prof_act)
+        model = Model(
+            inputs=profile_inputs + actuator_past_inputs + actuator_future_inputs,
+            outputs=prof_act)
 
         name = 'test___ProfilePredictorConv1D' + str(int(time.time()))
         keras2c_main.k2c(model, name)
@@ -216,22 +220,27 @@ def test_ProfilePredictorConv2D(self):
         # shape = (lookback, length, channels=num_profiles)
         profiles = Conv2D(filters=int(num_profiles*max_channels/8),
                           kernel_size=(1, int(profile_length/12)),
-                          strides=(1, 1), padding='same', activation=std_activation)(profiles)
+                          strides=(1, 1), padding='same', activation=std_activation
+                          )(profiles)
         profiles = Conv2D(filters=int(num_profiles*max_channels/4),
                           kernel_size=(1, int(profile_length/8)),
-                          strides=(1, 1), padding='same', activation=std_activation)(profiles)
+                          strides=(1, 1), padding='same', activation=std_activation
+                          )(profiles)
         profiles = Conv2D(filters=int(num_profiles*max_channels/2),
                           kernel_size=(1, int(profile_length/6)),
-                          strides=(1, 1), padding='same', activation=std_activation)(profiles)
+                          strides=(1, 1), padding='same', activation=std_activation
+                          )(profiles)
         profiles = Conv2D(filters=int(num_profiles*max_channels),
                           kernel_size=(1, int(profile_length/4)),
-                          strides=(1, 1), padding='same', activation=std_activation)(profiles)
+                          strides=(1, 1), padding='same', activation=std_activation
+                          )(profiles)
         # shape = (lookback, length, channels)
         if profile_lookback > 1:
-            profiles = Conv2D(filters=int(num_profiles*max_channels), kernel_size=(profile_lookback, 1),
-                              strides=(1, 1), padding='valid', activation=std_activation)(profiles)
-        profiles = Reshape((profile_length, int(
-            num_profiles*max_channels)))(profiles)
+            profiles = Conv2D(filters=int(num_profiles*max_channels),
+                              kernel_size=(profile_lookback, 1),
+                              strides=(1, 1), padding='valid', activation=std_activation
+                              )(profiles)
+        profiles = Reshape((profile_length, int(num_profiles*max_channels)))(profiles)
         # shape = (length, channels)
 
         actuator_future_inputs = []
@@ -239,9 +248,11 @@ def test_ProfilePredictorConv2D(self):
         actuators = []
         for i in range(num_actuators):
             actuator_future_inputs.append(
-                Input(future_actuator_inshape, name='input_future_' + actuator_names[i]))
+                Input(future_actuator_inshape,
+                      name='input_future_' + actuator_names[i]))
             actuator_past_inputs.append(
-                Input(past_actuator_inshape, name='input_past_' + actuator_names[i]))
+                Input(past_actuator_inshape,
+                      name='input_past_' + actuator_names[i]))
             actuators.append(Concatenate(
                 axis=-1)([actuator_past_inputs[i], actuator_future_inputs[i]]))
             actuators[i] = Reshape(
@@ -250,16 +261,22 @@ def test_ProfilePredictorConv2D(self):
         # shaoe = (time, num_actuators)
         actuators = Dense(units=int(num_profiles*max_channels/8),
                           activation=std_activation)(actuators)
-        # actuators = Conv1D(filters=int(num_profiles*max_channels/8), kernel_size=3, strides=1,
-        #                    padding='causal', activation=std_activation)(actuators)
+        # actuators = Conv1D(filters=int(num_profiles*max_channels/8),
+        #                    kernel_size=3, strides=1,padding='causal',
+        #                    activation=std_activation
+        #                    )(actuators)
         actuators = Dense(units=int(num_profiles*max_channels/4),
                           activation=std_activation)(actuators)
-        # actuators = Conv1D(filters=int(num_profiles*max_channels/4), kernel_size=3, strides=1,
-        #                    padding='causal', activation=std_activation)(actuators)
+        # actuators = Conv1D(filters=int(num_profiles*max_channels/4),
+        #                    kernel_size=3, strides=1, padding='causal',
+        #                    activation=std_activation
+        #                    )(actuators)
         actuators = Dense(units=int(num_profiles*max_channels/2),
                           activation=std_activation)(actuators)
-        actuators = LSTM(units=int(num_profiles*max_channels), activation=std_activation,
-                         recurrent_activation='hard_sigmoid')(actuators)
+        actuators = LSTM(units=int(num_profiles*max_channels),
+                         activation=std_activation,
+                         recurrent_activation='hard_sigmoid'
+                         )(actuators)
         actuators = Reshape((int(num_profiles*max_channels), 1))(actuators)
         # shape = (channels, 1)
         actuators = Dense(units=int(profile_length/4),
@@ -278,22 +295,35 @@ def test_ProfilePredictorConv2D(self):
 
         prof_act = []
         for i in range(num_targets):
-            prof_act.append(Conv2D(filters=max_channels, kernel_size=(1, int(profile_length/4)), strides=(1, 1),
-                                   padding='same', activation=std_activation)(merged))
+            prof_act.append(
+                Conv2D(filters=max_channels, kernel_size=(1, int(profile_length/4)),
+                       strides=(1, 1), padding='same', activation=std_activation
+                       )(merged))
             # shape = (1,length,max_channels)
-            prof_act[i] = Conv2D(filters=int(max_channels/2), kernel_size=(1, int(profile_length/8)),
-                                 strides=(1, 1), padding='same', activation=std_activation)(prof_act[i])
-            prof_act[i] = Conv2D(filters=int(max_channels/4), kernel_size=(1, int(profile_length/6)),
-                                 strides=(1, 1), padding='same', activation=std_activation)(prof_act[i])
-            prof_act[i] = Conv2D(filters=int(max_channels/8), kernel_size=(1, int(profile_length/4)),
-                                 strides=(1, 1), padding='same', activation=std_activation)(prof_act[i])
-            prof_act[i] = Conv2D(filters=1, kernel_size=(1, int(profile_length/4)), strides=(1, 1),
-                                 padding='same', activation=None)(prof_act[i])
+            prof_act[i] = Conv2D(
+                filters=int(max_channels/2), kernel_size=(1, int(profile_length/8)),
+                strides=(1, 1), padding='same', activation=std_activation
+            )(prof_act[i])
+            prof_act[i] = Conv2D(
+                filters=int(max_channels/4), kernel_size=(1, int(profile_length/6)),
+                strides=(1, 1), padding='same', activation=std_activation
+            )(prof_act[i])
+            prof_act[i] = Conv2D(
+                filters=int(max_channels/8), kernel_size=(1, int(profile_length/4)),
+                strides=(1, 1), padding='same', activation=std_activation
+            )(prof_act[i])
+            prof_act[i] = Conv2D(
+                filters=1, kernel_size=(1, int(profile_length/4)), strides=(1, 1),
+                padding='same', activation=None
+            )(prof_act[i])
             # shape = (1,length,1)
-            prof_act[i] = Reshape((profile_length,), name='target_' +
-                                  target_profile_names[i])(prof_act[i])
-        model = Model(inputs=profile_inputs + actuator_past_inputs +
-                      actuator_future_inputs, outputs=prof_act)
+            prof_act[i] = Reshape(
+                (profile_length,), name='target_' + target_profile_names[i]
+            )(prof_act[i])
+        model = Model(
+            inputs=profile_inputs + actuator_past_inputs + actuator_future_inputs,
+            outputs=prof_act
+        )
         name = 'test___ProfilePredictorConv2D' + str(int(time.time()))
         keras2c_main.k2c(model, name)
         rcode = build_and_run(name)
@@ -337,5 +367,3 @@ def test_ProfilePredictorConv2D(self):
     #     keras2c_main.k2c(model, name)
     #     rcode = build_and_run(name)
     #     self.assertEqual(rcode, 0)
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test_optimizations.py b/tests/test_optimizations.py
new file mode 100644
index 0000000..5c82740
--- /dev/null
+++ b/tests/test_optimizations.py
@@ -0,0 +1,209 @@
+"""test_optimizations.py
+This file is part of the test suite for keras2c
+Tests the codegen-time optimizations introduced in pcs_optimizations_static:
+  - BatchNormalization folding into Dense / Conv1D
+  - Non-mutation of the caller's model across a k2c() call
+"""
+
+#!/usr/bin/env python3
+
+import os
+import time
+import unittest
+
+import keras
+import numpy as np
+from keras import layers
+
+from keras2c import keras2c_main
+from keras2c.keras2c_main import fold_batch_norms
+
+
+def _bn(scale=True, center=True, zero_offset=False):
+    """BatchNormalization with reproducible non-trivial weights.
+
+    zero_offset=True forces beta=0 and moving_mean=0 so that bn_offset==0,
+    which is the regime where folding is mathematically safe even when the
+    downstream conv pads its inputs with zeros.
+    """
+    init = keras.initializers.RandomUniform(minval=0.1, maxval=1.0)
+    return layers.BatchNormalization(
+        scale=scale,
+        center=center,
+        beta_initializer='zeros' if zero_offset else init,
+        gamma_initializer=init,
+        moving_mean_initializer='zeros' if zero_offset else init,
+        moving_variance_initializer=init,
+    )
+
+
+def _build_bn_dense(input_dim=8, units=16, use_bias=True, scale=True, center=True,
+                    zero_offset=False):
+    inp = keras.Input((input_dim,))
+    x = _bn(scale=scale, center=center, zero_offset=zero_offset)(inp)
+    out = layers.Dense(units, use_bias=use_bias,
+                       kernel_initializer='glorot_uniform',
+                       bias_initializer='zeros')(x)
+    return keras.Model(inp, out)
+
+
+def _build_bn_conv1d(time=20, in_channels=4, filters=8, kernel_size=3,
+                     padding='valid', use_bias=True, scale=True, center=True,
+                     zero_offset=False):
+    inp = keras.Input((time, in_channels))
+    x = _bn(scale=scale, center=center, zero_offset=zero_offset)(inp)
+    out = layers.Conv1D(filters=filters, kernel_size=kernel_size,
+                        padding=padding, use_bias=use_bias,
+                        kernel_initializer='glorot_uniform',
+                        bias_initializer='zeros')(x)
+    return keras.Model(inp, out)
+
+
+def _clone(model):
+    c = keras.models.clone_model(model)
+    c.set_weights(model.get_weights())
+    return c
+
+
+class TestBatchNormFolding(unittest.TestCase):
+    """Folding logic in keras2c_main.fold_batch_norms — Python-only, no C build."""
+
+    def _assert_outputs_match(self, ref, folded, x, atol=1e-5):
+        y1 = ref.predict(x, verbose=0)
+        y2 = folded.predict(x, verbose=0)
+        diff = float(np.max(np.abs(y1 - y2)))
+        self.assertLess(diff, atol,
+                        msg=f'output diverges after fold: max abs diff={diff:.3e}')
+
+    def test_fold_bn_into_dense(self):
+        model = _build_bn_dense()
+        ref = _clone(model)
+        folded = fold_batch_norms(model, verbose=False)
+        self.assertEqual(len(folded), 1, 'BN before Dense should fold')
+        x = np.random.randn(4, 8).astype(np.float32)
+        self._assert_outputs_match(ref, model, x)
+
+    def test_fold_bn_into_conv1d_valid(self):
+        model = _build_bn_conv1d(padding='valid')
+        ref = _clone(model)
+        folded = fold_batch_norms(model, verbose=False)
+        self.assertEqual(len(folded), 1, "BN before Conv1D padding='valid' should fold")
+        x = np.random.randn(2, 20, 4).astype(np.float32)
+        self._assert_outputs_match(ref, model, x)
+
+    def test_fold_bn_into_conv1d_same_is_safe(self):
+        # padding='same' pads edges with zeros, so a constant bn_offset does
+        # not propagate uniformly through the kernel — folding is mathematically
+        # unsafe at edge outputs. fold_batch_norms must skip this case so the
+        # generated output stays exact.
+        model = _build_bn_conv1d(padding='same')
+        ref = _clone(model)
+        folded = fold_batch_norms(model, verbose=False)
+        self.assertEqual(len(folded), 0,
+                         "BN before Conv1D padding='same' must not be folded")
+        x = np.random.randn(2, 20, 4).astype(np.float32)
+        self._assert_outputs_match(ref, model, x, atol=1e-6)
+
+    def test_fold_bn_into_conv1d_causal_is_safe(self):
+        model = _build_bn_conv1d(padding='causal')
+        ref = _clone(model)
+        folded = fold_batch_norms(model, verbose=False)
+        self.assertEqual(len(folded), 0,
+                         "BN before Conv1D padding='causal' must not be folded")
+        x = np.random.randn(2, 20, 4).astype(np.float32)
+        self._assert_outputs_match(ref, model, x, atol=1e-6)
+
+    def test_fold_bn_into_conv1d_same_kernel1_ok(self):
+        # kernel_size=1 has no real padding so 'same' folds correctly.
+        model = _build_bn_conv1d(padding='same', kernel_size=1)
+        ref = _clone(model)
+        folded = fold_batch_norms(model, verbose=False)
+        self.assertEqual(len(folded), 1)
+        x = np.random.randn(2, 20, 4).astype(np.float32)
+        self._assert_outputs_match(ref, model, x)
+
+    def test_fold_bn_into_conv1d_same_no_offset_ok(self):
+        # With beta=0 and moving_mean=0 the BN offset is zero, so the 'same'
+        # padding edge issue disappears and folding is safe.
+        model = _build_bn_conv1d(padding='same', zero_offset=True)
+        ref = _clone(model)
+        folded = fold_batch_norms(model, verbose=False)
+        self.assertEqual(len(folded), 1,
+                         'BN with no offset should fold even with same padding')
+        x = np.random.randn(2, 20, 4).astype(np.float32)
+        self._assert_outputs_match(ref, model, x)
+
+    def test_fold_bn_into_dense_no_bias(self):
+        # Dense has no bias; folding only applies when the produced offset is
+        # zero. zero_offset=True (beta=0, mean=0) gives bn_offset=0.
+        model = _build_bn_dense(use_bias=False, zero_offset=True)
+        ref = _clone(model)
+        folded = fold_batch_norms(model, verbose=False)
+        self.assertEqual(len(folded), 1)
+        x = np.random.randn(4, 8).astype(np.float32)
+        self._assert_outputs_match(ref, model, x)
+
+    def test_fold_skipped_for_dense_no_bias_with_offset(self):
+        # No-bias Dense with non-zero BN offset has nowhere to absorb the
+        # constant — fold_batch_norms must skip it.
+        model = _build_bn_dense(use_bias=False)  # default initialisers => non-zero mean
+        ref = _clone(model)
+        folded = fold_batch_norms(model, verbose=False)
+        self.assertEqual(len(folded), 0,
+                         'no-bias Dense with non-zero offset must not be folded')
+        x = np.random.randn(4, 8).astype(np.float32)
+        self._assert_outputs_match(ref, model, x, atol=1e-6)
+
+    def test_fold_skipped_with_parallel_consumers(self):
+        # BN output feeds two Dense layers in parallel — folding must be skipped
+        # because absorbing BN into one consumer would break the other.
+        inp = keras.Input((8,))
+        x = _bn()(inp)
+        a = layers.Dense(4, name='dense_a')(x)
+        b = layers.Dense(4, name='dense_b')(x)
+        out = layers.Concatenate()([a, b])
+        model = keras.Model(inp, out)
+
+        ref = _clone(model)
+        folded = fold_batch_norms(model, verbose=False)
+        self.assertEqual(len(folded), 0,
+                         'BN with multiple consumers must not be folded')
+        # And the model's behaviour must be identical (no weight changes).
+        x_in = np.random.randn(4, 8).astype(np.float32)
+        self._assert_outputs_match(ref, model, x_in, atol=1e-6)
+
+
+class TestK2CDoesNotMutateModel(unittest.TestCase):
+    """k2c must not change the caller's model — folding works on a clone."""
+
+    def _cleanup(self, name):
+        for ext in ('.c', '.h'):
+            p = name + ext
+            if os.path.exists(p):
+                os.remove(p)
+
+    def test_weights_and_outputs_unchanged_after_k2c(self):
+        model = _build_bn_dense()
+        x = np.random.randn(4, 8).astype(np.float32)
+        y_before = model.predict(x, verbose=0)
+        weights_before = [w.copy() for w in model.get_weights()]
+
+        name = 'test___k2c_no_mutate' + str(int(time.time()))
+        try:
+            keras2c_main.k2c(model, name, num_tests=0, verbose=False)
+            weights_after = model.get_weights()
+            self.assertEqual(len(weights_before), len(weights_after))
+            for wb, wa in zip(weights_before, weights_after):
+                self.assertTrue(
+                    np.array_equal(wb, wa),
+                    msg=f'k2c mutated a weight (max abs diff={np.max(np.abs(wb-wa)):.3e})',
+                )
+            y_after = model.predict(x, verbose=0)
+            self.assertTrue(np.array_equal(y_before, y_after),
+                            msg='k2c changed model output')
+        finally:
+            self._cleanup(name)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_pooling_layers.py b/tests/test_pooling_layers.py
index fbe7776..e4ffda0 100644
--- a/tests/test_pooling_layers.py
+++ b/tests/test_pooling_layers.py
@@ -6,14 +6,11 @@
 #!/usr/bin/env python3
 
 import unittest
-import tensorflow.keras as keras
+import keras
 from keras2c import keras2c_main
-import subprocess
 import time
-import os
 from test_core_layers import build_and_run
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
+
 
 __author__ = "Rory Conlin"
 __copyright__ = "Copyright 2020, Rory Conlin"
@@ -82,7 +79,7 @@ def test_AveragePooling1D2(self):
         model = keras.models.Model(inputs=a, outputs=b)
         name = 'test___AveragePooling1D2' + str(int(time.time()))
         keras2c_main.k2c(model, name)
-        rcode = build_and_run(name)
+        rcode = build_and_run(name)  # TODO gives an average error of infinity
         self.assertEqual(rcode, 0)
 
     def test_MaxPooling2D1(self):
@@ -142,7 +139,7 @@ def test_AveragePooling2D2(self):
         model = keras.models.Model(inputs=a, outputs=b)
         name = 'test___AveragePooling2D2' + str(int(time.time()))
         keras2c_main.k2c(model, name)
-        rcode = build_and_run(name)
+        rcode = build_and_run(name)  # TODO gives an average error of infinity
         self.assertEqual(rcode, 0)
 
     def test_GlobalAveragePooling1D(self):
diff --git a/tests/test_recurrent_layers.py b/tests/test_recurrent_layers.py
index c07cab2..0a11370 100644
--- a/tests/test_recurrent_layers.py
+++ b/tests/test_recurrent_layers.py
@@ -6,14 +6,11 @@
 #!/usr/bin/env python3
 
 import unittest
-import tensorflow.keras as keras
+import keras
 from keras2c import keras2c_main
-import subprocess
 import time
-import os
 from test_core_layers import build_and_run
-import tensorflow as tf
-tf.compat.v1.disable_eager_execution()
+
 
 __author__ = "Rory Conlin"
 __copyright__ = "Copyright 2020, Rory Conlin"
@@ -152,7 +149,3 @@ def test_GRU3(self):
         keras2c_main.k2c(model, name)
         rcode = build_and_run(name)
         self.assertEqual(rcode, 0)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test_split_layers.py b/tests/test_split_layers.py
new file mode 100644
index 0000000..a1fd00c
--- /dev/null
+++ b/tests/test_split_layers.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+import unittest
+import keras
+from keras2c import keras2c_main
+import time
+import numpy as np
+from test_core_layers import build_and_run
+import tensorflow as tf
+
+
+class TestSplitLayers(unittest.TestCase):
+    """tests for split layers"""
+
+    def test_split_layers(self):
+        for tno in range(10):
+            n_splits = np.random.randint(2, 10)
+            in_dim = np.random.randint(1 * n_splits, 10 * n_splits)
+
+            # Method 1: Random distribution that guarantees validity
+            split_sizes = [1] * n_splits  # Start with minimum size of 1 for each split
+            remaining = in_dim - n_splits  # Distribute the remaining dimensions
+
+            # Randomly distribute the remaining dimensions
+            for _ in range(remaining):
+                idx = np.random.randint(0, n_splits)
+                split_sizes[idx] += 1
+
+            # Convert to list of integers (sometimes numpy types cause issues)
+            split_sizes = [int(s) for s in split_sizes]
+
+            # Create model with these pre-computed split sizes
+            a = keras.layers.Input((in_dim,))
+
+            cumsum = np.cumsum(split_sizes[:-1]).tolist()  # Don't include the last one
+            b = keras.ops.split(a, indices_or_sections=cumsum, axis=1)
+
+            model = keras.models.Model(inputs=a, outputs=b)
+            name = 'test___SplitLayer' + str(int(time.time()))
+            keras2c_main.k2c(model, name)
+            rcode = build_and_run(name)
+            self.assertEqual(rcode, 0)
diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py
index 289518e..2565d71 100644
--- a/tests/test_wrappers.py
+++ b/tests/test_wrappers.py
@@ -6,32 +6,35 @@
 #!/usr/bin/env python3
 
 from test_core_layers import build_and_run
-import numpy as np
-import os
 import time
-import subprocess
 from keras2c import keras2c_main
 import unittest
-import tensorflow.keras as keras
-from tensorflow.python.framework.ops import disable_eager_execution
-disable_eager_execution()
+import keras
 
-__author__ = "Rory Conlin"
-__copyright__ = "Copyright 2020, Rory Conlin"
-__license__ = "MIT"
-__maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
-__email__ = "wconlin@princeton.edu"
 
+# Original author
+# __author__ = "Rory Conlin"
+# __copyright__ = "Copyright 2020, Rory Conlin"
+# __license__ = "MIT"
+# __maintainer__ = "Rory Conlin, https://github.com/f0uriest/keras2c"
+# __email__ = "wconlin@princeton.edu"
+
+# Modified by
+__author__ = "Anchal Gupta"
+__email__ = "guptaa@fusion.gat.com"
 
 class TestWrappers(unittest.TestCase):
     """tests for layer wrappers"""
 
     def test_Bidirectional1(self):
         model = keras.models.Sequential()
-        model.add(keras.layers.Bidirectional(keras.layers.LSTM(10, return_sequences=True),
-                                             input_shape=(5, 10), merge_mode='concat'))
-        model.add(keras.layers.Bidirectional(keras.layers.LSTM(10, return_sequences=True),
-                                             merge_mode='mul'))
+        model.add(keras.layers.Bidirectional(
+            keras.layers.LSTM(10, return_sequences=True),
+            input_shape=(5, 10), merge_mode='concat')
+        )
+        model.add(keras.layers.Bidirectional(
+            keras.layers.LSTM(10, return_sequences=True), merge_mode='mul')
+        )
         model.add(keras.layers.Dense(5))
         model.build()
         name = 'test___Bidirectional1' + str(int(time.time()))
@@ -42,10 +45,12 @@ def test_Bidirectional1(self):
     def test_Bidirectional2(self):
         model = keras.models.Sequential()
         model.add(keras.layers.Dense(5, input_shape=(5, 10)))
-        model.add(keras.layers.Bidirectional(keras.layers.LSTM(10, return_sequences=True),
-                                             merge_mode='ave'))
-        model.add(keras.layers.Bidirectional(keras.layers.LSTM(10, return_sequences=True),
-                                             merge_mode='concat'))
+        model.add(keras.layers.Bidirectional(
+            keras.layers.LSTM(10, return_sequences=True), merge_mode='ave')
+        )
+        model.add(keras.layers.Bidirectional(
+            keras.layers.LSTM(10, return_sequences=True), merge_mode='concat')
+        )
         model.add(keras.layers.Dense(5))
         model.build()
         name = 'test___Bidirectional2' + str(int(time.time()))
@@ -54,7 +59,6 @@ def test_Bidirectional2(self):
         self.assertEqual(rcode, 0)
 
     def test_TimeDistributed1(self):
-
         inputs = keras.layers.Input(shape=(8, 5))
         outputs = keras.layers.TimeDistributed(
             keras.layers.Dense(7, use_bias=True,))(inputs)