tflite micro integrate repo

2020-12-09 15:06:27 +08:00
parent 200c0ff460
commit d80d0af1a6
406 changed files with 58235 additions and 1908 deletions
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/common.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/common.h
@@ -0,0 +1,956 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_
+
+#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
+#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#endif
+#endif
+
+#include <functional>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+constexpr int kReverseShift = -1;
+
+inline void GetActivationMinMax(FusedActivationFunctionType ac,
+                                float* output_activation_min,
+                                float* output_activation_max) {
+  switch (ac) {
+    case FusedActivationFunctionType::kNone:
+      *output_activation_min = std::numeric_limits<float>::lowest();
+      *output_activation_max = std::numeric_limits<float>::max();
+      break;
+    case FusedActivationFunctionType::kRelu:
+      *output_activation_min = 0.f;
+      *output_activation_max = std::numeric_limits<float>::max();
+      break;
+    case FusedActivationFunctionType::kRelu1:
+      *output_activation_min = -1.f;
+      *output_activation_max = 1.f;
+      break;
+    case FusedActivationFunctionType::kRelu6:
+      *output_activation_min = 0.f;
+      *output_activation_max = 6.f;
+      break;
+  }
+}
+
+template <typename T>
+inline T ActivationFunctionWithMinMax(T x, T output_activation_min,
+                                      T output_activation_max) {
+  using std::max;
+  using std::min;
+  return min(max(x, output_activation_min), output_activation_max);
+}
+
+// Legacy function, left for compatibility only.
+template <FusedActivationFunctionType Ac>
+float ActivationFunction(float x) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  return ActivationFunctionWithMinMax(x, output_activation_min,
+                                      output_activation_max);
+}
+
+inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size,
+                         const float* bias_data, int array_size,
+                         float* array_data) {
+  // Note: see b/132215220: in May 2019 we thought it would be OK to replace
+  // this with the Eigen one-liner:
+  //   return (array.colwise() + bias).cwiseMin(clamp_max).cwiseMin(clamp_max).
+  // This turned out to severely regress performance: +4ms (i.e. 8%) on
+  // MobileNet v2 / 1.0 / 224. So we keep custom NEON code for now.
+  TFLITE_DCHECK_EQ((array_size % bias_size), 0);
+#ifdef USE_NEON
+  float* array_ptr = array_data;
+  float* array_end_ptr = array_ptr + array_size;
+  const auto clamp_min_vec = vdupq_n_f32(clamp_min);
+  const auto clamp_max_vec = vdupq_n_f32(clamp_max);
+  for (; array_ptr != array_end_ptr; array_ptr += bias_size) {
+    int i = 0;
+    for (; i <= bias_size - 16; i += 16) {
+      auto b0 = vld1q_f32(bias_data + i);
+      auto b1 = vld1q_f32(bias_data + i + 4);
+      auto b2 = vld1q_f32(bias_data + i + 8);
+      auto b3 = vld1q_f32(bias_data + i + 12);
+      auto a0 = vld1q_f32(array_ptr + i);
+      auto a1 = vld1q_f32(array_ptr + i + 4);
+      auto a2 = vld1q_f32(array_ptr + i + 8);
+      auto a3 = vld1q_f32(array_ptr + i + 12);
+      auto x0 = vaddq_f32(a0, b0);
+      auto x1 = vaddq_f32(a1, b1);
+      auto x2 = vaddq_f32(a2, b2);
+      auto x3 = vaddq_f32(a3, b3);
+      x0 = vmaxq_f32(clamp_min_vec, x0);
+      x1 = vmaxq_f32(clamp_min_vec, x1);
+      x2 = vmaxq_f32(clamp_min_vec, x2);
+      x3 = vmaxq_f32(clamp_min_vec, x3);
+      x0 = vminq_f32(clamp_max_vec, x0);
+      x1 = vminq_f32(clamp_max_vec, x1);
+      x2 = vminq_f32(clamp_max_vec, x2);
+      x3 = vminq_f32(clamp_max_vec, x3);
+      vst1q_f32(array_ptr + i, x0);
+      vst1q_f32(array_ptr + i + 4, x1);
+      vst1q_f32(array_ptr + i + 8, x2);
+      vst1q_f32(array_ptr + i + 12, x3);
+    }
+    for (; i <= bias_size - 4; i += 4) {
+      auto b = vld1q_f32(bias_data + i);
+      auto a = vld1q_f32(array_ptr + i);
+      auto x = vaddq_f32(a, b);
+      x = vmaxq_f32(clamp_min_vec, x);
+      x = vminq_f32(clamp_max_vec, x);
+      vst1q_f32(array_ptr + i, x);
+    }
+    for (; i < bias_size; i++) {
+      array_ptr[i] = ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i],
+                                                  clamp_min, clamp_max);
+    }
+  }
+#else  // not NEON
+  for (int array_offset = 0; array_offset < array_size;
+       array_offset += bias_size) {
+    for (int i = 0; i < bias_size; i++) {
+      array_data[array_offset + i] = ActivationFunctionWithMinMax(
+          array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);
+    }
+  }
+#endif
+}
+
+inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(
+    int32_t x, int32_t quantized_multiplier, int left_shift) {
+  using gemmlowp::RoundingDivideByPOT;
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  return RoundingDivideByPOT(
+      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
+}
+
+inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
+    int32_t x, int32_t quantized_multiplier, int left_shift) {
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+                                           quantized_multiplier);
+}
+
+inline int32_t MultiplyByQuantizedMultiplier(int32_t x,
+                                             int32_t quantized_multiplier,
+                                             int shift) {
+  using gemmlowp::RoundingDivideByPOT;
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ? 0 : -shift;
+  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
+                                 x * (1 << left_shift), quantized_multiplier),
+                             right_shift);
+}
+
+inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
+                                             int32_t quantized_multiplier,
+                                             int shift) {
+  // Inputs:
+  // - quantized_multiplier has fixed point at bit 31
+  // - shift is -31 to +7 (negative for right shift)
+  //
+  // Assumptions: The following input ranges are assumed
+  // - quantize_scale>=0  (the usual range is (1<<30) to (1>>31)-1)
+  // - scaling is chosen so final scaled result fits in int32_t
+  // - input x is in the range -(1<<47) <= x < (1<<47)
+  assert(quantized_multiplier >= 0);
+  assert(shift >= -31 && shift < 8);
+
+  int32_t reduced_multiplier = (quantized_multiplier + (1 << 15)) >> 16;
+  int total_shift = 15 - shift;
+  x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1));
+  int32_t result = x >> total_shift;
+  return result;
+}
+
+template <typename T>
+int CountLeadingZeros(T integer_input) {
+  static_assert(std::is_unsigned<T>::value,
+                "Only unsigned integer types handled.");
+#if defined(__GNUC__)
+  return integer_input ? __builtin_clz(integer_input)
+                       : std::numeric_limits<T>::digits;
+#else
+  if (integer_input == 0) {
+    return std::numeric_limits<T>::digits;
+  }
+
+  const T one_in_leading_positive = static_cast<T>(1)
+                                    << (std::numeric_limits<T>::digits - 1);
+  int leading_zeros = 0;
+  while (integer_input < one_in_leading_positive) {
+    integer_input <<= 1;
+    ++leading_zeros;
+  }
+  return leading_zeros;
+#endif
+}
+
+template <typename T>
+inline int CountLeadingSignBits(T integer_input) {
+  static_assert(std::is_signed<T>::value, "Only signed integer types handled.");
+#if defined(__GNUC__) && !defined(__clang__)
+  return integer_input ? __builtin_clrsb(integer_input)
+                       : std::numeric_limits<T>::digits;
+#else
+  using U = typename std::make_unsigned<T>::type;
+  return integer_input >= 0
+             ? CountLeadingZeros(static_cast<U>(integer_input)) - 1
+         : integer_input != std::numeric_limits<T>::min()
+             ? CountLeadingZeros(2 * static_cast<U>(-integer_input) - 1)
+             : 0;
+#endif
+}
+
+// Use "count leading zeros" helper functions to do a fast Floor(log_2(x)).
+template <typename Integer>
+inline Integer FloorLog2(Integer n) {
+  static_assert(std::is_integral<Integer>::value, "");
+  static_assert(std::is_signed<Integer>::value, "");
+  static_assert(sizeof(Integer) == 4 || sizeof(Integer) == 8, "");
+  TFLITE_CHECK_GT(n, 0);
+  if (sizeof(Integer) == 4) {
+    return 30 - CountLeadingSignBits(n);
+  } else {
+    return 62 - CountLeadingSignBits(n);
+  }
+}
+
+// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
+// softmax
+inline void gen_lut(const std::function<double(double)>& func, double min,
+                    double max, int16_t* table, const int num) {
+  // size of table should equal to num + 1
+  // last element only for slope calculation
+  double step = (max - min) / (num - 1);
+  double half_step = step / 2.0;
+  for (int i = 0; i < num - 1; i++) {
+    double sample_val = TfLiteRound(func(min + i * step) * 32768.0);
+    double midpoint_interp_val =
+        TfLiteRound((func(min + (i + 1) * step) * 32768.0 +
+                     TfLiteRound(func(min + i * step) * 32768.0)) /
+                    2.0);
+    double midpoint_val =
+        TfLiteRound(func(min + i * step + half_step) * 32768.0);
+    double midpoint_err = midpoint_interp_val - midpoint_val;
+    double bias = TfLiteRound(midpoint_err / 2.0);
+    table[i] = std::min(std::max(sample_val - bias, -32768.0), 32767.0);
+  }
+  table[num - 1] =
+      std::min(std::max(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
+}
+
+// int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
+inline int16_t generic_int16_table_lookup(int16_t value, const int16_t* lut) {
+  // 512 base value, lut[513] only for calculate slope
+  uint16_t index = static_cast<uint16_t>(256 + (value >> 7));
+  assert(index < 512 && "LUT index out of range.");
+  int16_t offset = value & 0x7f;
+
+  // base and slope are Q0.15
+  int16_t base = lut[index];
+  int16_t slope = lut[index + 1] - lut[index];
+
+  // Q0.15 * Q0.7 = Q0.22
+  // Round and convert from Q0.22 to Q0.15
+  int32_t delta = (static_cast<int32_t>(slope) * offset + 64) >> 7;
+
+  // Q0.15 + Q0.15
+  return base + delta;
+}
+
+// Table of sigmoid(i/24) at 0.16 format - 256 elements.
+
+// We use combined sigmoid and tanh look-up table, since
+// tanh(x) = 2*sigmoid(2*x) -1.
+// Both functions are symmetric, so the LUT table is only needed
+// for the absolute value of the input.
+static const uint16_t sigmoid_table_uint16[256] = {
+    32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, 38180, 38841, 39498,
+    40149, 40794, 41432, 42064, 42688, 43304, 43912, 44511, 45102, 45683, 46255,
+    46817, 47369, 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, 51865,
+    52311, 52745, 53169, 53581, 53983, 54374, 54755, 55125, 55485, 55834, 56174,
+    56503, 56823, 57133, 57433, 57724, 58007, 58280, 58544, 58800, 59048, 59288,
+    59519, 59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110, 61279, 61441,
+    61599, 61750, 61896, 62036, 62172, 62302, 62428, 62549, 62666, 62778, 62886,
+    62990, 63090, 63186, 63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835,
+    63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308, 64357, 64405, 64450,
+    64494, 64536, 64576, 64614, 64652, 64687, 64721, 64754, 64786, 64816, 64845,
+    64873, 64900, 64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079, 65097,
+    65115, 65132, 65149, 65164, 65179, 65194, 65208, 65221, 65234, 65246, 65258,
+    65269, 65280, 65291, 65301, 65310, 65319, 65328, 65337, 65345, 65352, 65360,
+    65367, 65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415, 65420, 65425,
+    65429, 65433, 65438, 65442, 65445, 65449, 65453, 65456, 65459, 65462, 65465,
+    65468, 65471, 65474, 65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491,
+    65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504, 65505, 65507, 65508,
+    65509, 65510, 65511, 65512, 65513, 65514, 65515, 65516, 65517, 65517, 65518,
+    65519, 65520, 65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524, 65525,
+    65525, 65526, 65526, 65526, 65527, 65527, 65528, 65528, 65528, 65529, 65529,
+    65529, 65529, 65530, 65530, 65530, 65530, 65531, 65531, 65531, 65531, 65531,
+    65532, 65532, 65532, 65532, 65532, 65532, 65533, 65533, 65533, 65533, 65533,
+    65533, 65533, 65533, 65534, 65534, 65534, 65534, 65534, 65534, 65534, 65534,
+    65534, 65534, 65535};
+
+// TODO(b/77858996): Add these to gemmlowp.
+template <typename IntegerType>
+IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) {
+  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
+  return a;
+}
+
+template <>
+inline std::int32_t SaturatingAddNonGemmlowp(std::int32_t a, std::int32_t b) {
+  std::int64_t a64 = a;
+  std::int64_t b64 = b;
+  std::int64_t sum = a64 + b64;
+  return static_cast<std::int32_t>(std::min(
+      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
+      std::max(
+          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
+          sum)));
+}
+
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingAddNonGemmlowp(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingAddNonGemmlowp(a.raw(), b.raw()));
+}
+
+template <typename IntegerType>
+IntegerType SaturatingSub(IntegerType a, IntegerType b) {
+  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
+  return a;
+}
+
+template <>
+inline std::int16_t SaturatingSub(std::int16_t a, std::int16_t b) {
+  std::int32_t a32 = a;
+  std::int32_t b32 = b;
+  std::int32_t diff = a32 - b32;
+  return static_cast<std::int16_t>(
+      std::min(static_cast<int32_t>(32767),
+               std::max(static_cast<int32_t>(-32768), diff)));
+}
+
+template <>
+inline std::int32_t SaturatingSub(std::int32_t a, std::int32_t b) {
+  std::int64_t a64 = a;
+  std::int64_t b64 = b;
+  std::int64_t diff = a64 - b64;
+  return static_cast<std::int32_t>(std::min(
+      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
+      std::max(
+          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
+          diff)));
+}
+
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingSub(a.raw(), b.raw()));
+}
+// End section to be moved to gemmlowp.
+
+template <typename IntegerType>
+IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {
+  if (exponent == 0) {
+    return x;
+  }
+  using ScalarIntegerType =
+      typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType;
+  const IntegerType min =
+      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min());
+  const IntegerType max =
+      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max());
+  const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType);
+
+  const std::int32_t threshold =
+      ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1);
+  const IntegerType positive_mask =
+      gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold));
+  const IntegerType negative_mask =
+      gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold));
+
+  IntegerType result = gemmlowp::ShiftLeft(x, exponent);
+  result = gemmlowp::SelectUsingMask(positive_mask, max, result);
+  result = gemmlowp::SelectUsingMask(negative_mask, min, result);
+  return result;
+}
+
+// If we want to leave IntegerBits fixed, then multiplication
+// by a power of two has to be saturating/rounding, not exact anymore.
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits>
+SaturatingRoundingMultiplyByPOTParam(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
+}
+
+// Convert int32_t multiplier to int16_t with rounding.
+inline void DownScaleInt32ToInt16Multiplier(int32_t multiplier_int32_t,
+                                            int16_t* multiplier_int16_t) {
+  TFLITE_DCHECK_GE(multiplier_int32_t, 0);
+  static constexpr int32_t kRoundingOffset = 1 << 15;
+  if (multiplier_int32_t >=
+      std::numeric_limits<int32_t>::max() - kRoundingOffset) {
+    *multiplier_int16_t = std::numeric_limits<int16_t>::max();
+    return;
+  }
+  const int32_t result = (multiplier_int32_t + kRoundingOffset) >> 16;
+  TFLITE_DCHECK_LE(result << 16, multiplier_int32_t + kRoundingOffset);
+  TFLITE_DCHECK_GT(result << 16, multiplier_int32_t - kRoundingOffset);
+  *multiplier_int16_t = result;
+  TFLITE_DCHECK_EQ(*multiplier_int16_t, result);
+}
+
+// Minimum output bits to accommodate log of maximum input range.  It actually
+// does not matter if one considers, say, [-64,64] or [-64,64).
+//
+// For example, run this through Octave:
+// [0:127; ...
+//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
+//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
+constexpr int min_log_x_output_bits(int input_bits) {
+  return input_bits > 90   ? 7
+         : input_bits > 44 ? 6
+         : input_bits > 21 ? 5
+         : input_bits > 10 ? 4
+         : input_bits > 4  ? 3
+         : input_bits > 1  ? 2
+                           : 1;
+}
+
+// Although currently the name of this function says that it cannot handle
+// values less than 1, in practice it can handle as low as 1/x_max, where
+// x_max is the largest representable input.  In other words, the output range
+// is symmetric.
+template <int OutputIntegerBits, int InputIntegerBits>
+inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
+log_x_for_x_greater_than_or_equal_to_1_impl(
+    gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {
+  // assert(__builtin_clz(0u) >= std::numeric_limits<uint32_t>::digits - 1);
+  // assert(__builtin_clz(0u) <= std::numeric_limits<uint32_t>::digits);
+  using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
+  // The reason for accumulating the result with an extra bit of headroom is
+  // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
+  // recip_denom will otherwise introduce an error.
+  static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumIntegerBits>;
+
+  const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1488522236, std::log(2.0));
+  const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5)));
+  const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1518500250, std::sqrt(0.5));
+  const FixedPoint0 one_quarter =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0);
+
+  const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1057819769,
+      2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0)));
+
+  const FixedPointAccum shifted_quarter =
+      gemmlowp::Rescale<kAccumIntegerBits>(one_quarter);
+
+  // Reinterpret the input value as Q0.31, because we will figure out the
+  // required shift "ourselves" instead of using, say, Rescale.
+  FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
+  // z_a_pow_2 = input_integer_bits - z_a_headroom;
+  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32_t>(z_a.raw()));
+  FixedPoint0 r_a_tmp =
+      SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
+  const int32_t r_a_raw =
+      SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
+  // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
+  // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
+  //                   InputIntegerBits - z_b_headroom - 0.25);
+  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
+      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
+          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
+      shifted_quarter);
+
+  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
+  FixedPoint0 z_b = z_a * sqrt_half;
+  int z_b_headroom = CountLeadingZeros(static_cast<uint32_t>(z_b.raw())) - 1;
+  const int32_t r_b_raw =
+      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
+  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
+      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
+          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
+      shifted_quarter);
+
+  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));
+  const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw(
+      std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw()));
+
+  const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half);
+  FixedPoint0 q = r - sqrt_sqrt_half;
+  q = q + q;
+
+  const FixedPoint0 common_sq = q * q;
+  const FixedPoint0 num = q * r + q * common_sq * alpha_n;
+  const FixedPoint0 denom_minus_one_0 =
+      p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q;
+  const FixedPoint0 recip_denom =
+      one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0);
+
+  const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num);
+  return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 +
+                                              num_scaled * recip_denom);
+}
+
+template <int OutputIntegerBits, int InputIntegerBits>
+inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
+log_x_for_x_greater_than_or_equal_to_1(
+    gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {
+  static_assert(
+      OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
+      "Output integer bits must be sufficient to accommodate logs of inputs.");
+  return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
+                                                     InputIntegerBits>(
+      input_val);
+}
+
+inline int32_t GetReciprocal(int32_t x, int x_integer_digits,
+                             int* num_bits_over_unit) {
+  int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(x));
+  // This is the number of bits to the left of the binary point above 1.0.
+  // Consider x=1.25.  In that case shifted_scale=0.8 and
+  // no later adjustment will be needed.
+  *num_bits_over_unit = x_integer_digits - headroom_plus_one;
+  const int32_t shifted_sum_minus_one =
+      static_cast<int32_t>((static_cast<uint32_t>(x) << headroom_plus_one) -
+                           (static_cast<uint32_t>(1) << 31));
+
+  gemmlowp::FixedPoint<int32_t, 0> shifted_scale =
+      gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+          gemmlowp::FixedPoint<int32_t, 0>::FromRaw(shifted_sum_minus_one));
+  return shifted_scale.raw();
+}
+
+inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
+                                             int32_t* output_inv_sqrt,
+                                             int* output_shift) {
+  TFLITE_DCHECK_GE(input, 0);
+  if (input <= 1) {
+    // Handle the input value 1 separately to avoid overflow in that case
+    // in the general computation below (b/143972021). Also handle 0 as if it
+    // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid
+    // but rare/unrealistic input value. We can expect both to occur in some
+    // incompletely trained models, but probably not in fully trained models.
+    *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
+    *output_shift = 0;
+    return;
+  }
+  TFLITE_DCHECK_GT(input, 1);
+  *output_shift = 11;
+  while (input >= (1 << 29)) {
+    input /= 4;
+    ++*output_shift;
+  }
+  const unsigned max_left_shift_bits =
+      CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
+  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
+  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
+  *output_shift -= left_shift_bit_pairs;
+  input <<= 2 * left_shift_bit_pairs;
+  TFLITE_DCHECK_GE(input, (1 << 27));
+  TFLITE_DCHECK_LT(input, (1 << 29));
+  using gemmlowp::FixedPoint;
+  using gemmlowp::Rescale;
+  using gemmlowp::SaturatingRoundingMultiplyByPOT;
+  // Using 3 integer bits gives us enough room for the internal arithmetic in
+  // this Newton-Raphson iteration.
+  using F3 = FixedPoint<int32_t, 3>;
+  using F0 = FixedPoint<int32_t, 0>;
+  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
+  const F3 fixedpoint_half_input =
+      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
+  const F3 fixedpoint_half_three =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
+  // Newton-Raphson iteration
+  // Naive unoptimized starting guess: x = 1
+  F3 x = F3::One();
+  // Naive unoptimized number of iterations: 5
+  for (int i = 0; i < 5; i++) {
+    const F3 x3 = Rescale<3>(x * x * x);
+    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
+  }
+  const F0 fixedpoint_half_sqrt_2 =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
+  x = x * fixedpoint_half_sqrt_2;
+  *output_inv_sqrt = x.raw();
+  if (*output_shift < 0) {
+    *output_inv_sqrt <<= -*output_shift;
+    *output_shift = 0;
+  }
+  // Convert right shift (right is positive) to left shift.
+  *output_shift *= reverse_shift;
+}
+
+// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
+// rectangular array of numbers.
+//
+// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
+// However, as Dims<N> is to be deprecated, this class exists as an adaptor
+// to enable simple unoptimized implementations of element-wise broadcasting
+// operations.
+template <int N>
+struct NdArrayDesc {
+  // The "extent" of each dimension. Indices along dimension d must be in the
+  // half-open interval [0, extents[d]).
+  int extents[N];
+
+  // The number of *elements* (not bytes) between consecutive indices of each
+  // dimension.
+  int strides[N];
+};
+
+// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
+inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
+                            int i3) {
+  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
+  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
+  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
+  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
+  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
+         i3 * desc.strides[3];
+}
+
+inline int SubscriptToIndex(const NdArrayDesc<5>& desc, int indexes[5]) {
+  return indexes[0] * desc.strides[0] + indexes[1] * desc.strides[1] +
+         indexes[2] * desc.strides[2] + indexes[3] * desc.strides[3] +
+         indexes[4] * desc.strides[4];
+}
+
+// Given the dimensions of the operands for an element-wise binary broadcast,
+// adjusts them so that they can be directly iterated over with simple loops.
+// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
+// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
+//
+// This function assumes that the two input shapes are compatible up to
+// broadcasting and the shorter one has already been prepended with 1s to be the
+// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
+// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
+// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
+// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
+//
+// When two shapes are compatible up to broadcasting, for each dimension d,
+// the input extents are either equal, or one of them is 1.
+//
+// This function performs the following for each dimension d:
+// - If the extents are equal, then do nothing since the loop that walks over
+//   both of the input arrays is correct.
+// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
+//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
+//   array0 to be referenced *at any index* in dimension d and still access the
+//   same slice.
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
+                                                const Dims<N>& input1_dims,
+                                                NdArrayDesc<N>* desc0_out,
+                                                NdArrayDesc<N>* desc1_out) {
+  TFLITE_DCHECK(desc0_out != nullptr);
+  TFLITE_DCHECK(desc1_out != nullptr);
+
+  // Copy dims to desc.
+  for (int i = 0; i < N; ++i) {
+    desc0_out->extents[i] = input0_dims.sizes[i];
+    desc0_out->strides[i] = input0_dims.strides[i];
+    desc1_out->extents[i] = input1_dims.sizes[i];
+    desc1_out->strides[i] = input1_dims.strides[i];
+  }
+
+  // Walk over each dimension. If the extents are equal do nothing.
+  // Otherwise, set the desc with extent 1 to have extent equal to the other and
+  // stride 0.
+  for (int i = 0; i < N; ++i) {
+    const int extent0 = ArraySize(input0_dims, i);
+    const int extent1 = ArraySize(input1_dims, i);
+    if (extent0 != extent1) {
+      if (extent0 == 1) {
+        desc0_out->strides[i] = 0;
+        desc0_out->extents[i] = extent1;
+      } else {
+        TFLITE_DCHECK_EQ(extent1, 1);
+        desc1_out->strides[i] = 0;
+        desc1_out->extents[i] = extent0;
+      }
+    }
+  }
+}
+
+// Copies dims to desc, calculating strides.
+template <int N>
+inline void CopyDimsToDesc(const RuntimeShape& input_shape,
+                           NdArrayDesc<N>* desc_out) {
+  int desc_stride = 1;
+  for (int i = N - 1; i >= 0; --i) {
+    desc_out->extents[i] = input_shape.Dims(i);
+    desc_out->strides[i] = desc_stride;
+    desc_stride *= input_shape.Dims(i);
+  }
+}
+
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(
+    const RuntimeShape& input0_shape, const RuntimeShape& input1_shape,
+    NdArrayDesc<N>* desc0_out, NdArrayDesc<N>* desc1_out) {
+  TFLITE_DCHECK(desc0_out != nullptr);
+  TFLITE_DCHECK(desc1_out != nullptr);
+
+  auto extended_input0_shape = RuntimeShape::ExtendedShape(N, input0_shape);
+  auto extended_input1_shape = RuntimeShape::ExtendedShape(N, input1_shape);
+
+  // Copy dims to desc, calculating strides.
+  CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
+  CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
+
+  // Walk over each dimension. If the extents are equal do nothing.
+  // Otherwise, set the desc with extent 1 to have extent equal to the other and
+  // stride 0.
+  for (int i = 0; i < N; ++i) {
+    const int extent0 = extended_input0_shape.Dims(i);
+    const int extent1 = extended_input1_shape.Dims(i);
+    if (extent0 != extent1) {
+      if (extent0 == 1) {
+        desc0_out->strides[i] = 0;
+        desc0_out->extents[i] = extent1;
+      } else {
+        TFLITE_DCHECK_EQ(extent1, 1);
+        desc1_out->strides[i] = 0;
+        desc1_out->extents[i] = extent0;
+      }
+    }
+  }
+}
+
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(
+    const RuntimeShape& input0_shape, const RuntimeShape& input1_shape,
+    const RuntimeShape& input2_shape, NdArrayDesc<N>* desc0_out,
+    NdArrayDesc<N>* desc1_out, NdArrayDesc<N>* desc2_out) {
+  TFLITE_DCHECK(desc0_out != nullptr);
+  TFLITE_DCHECK(desc1_out != nullptr);
+  TFLITE_DCHECK(desc2_out != nullptr);
+
+  auto extended_input0_shape = RuntimeShape::ExtendedShape(N, input0_shape);
+  auto extended_input1_shape = RuntimeShape::ExtendedShape(N, input1_shape);
+  auto extended_input2_shape = RuntimeShape::ExtendedShape(N, input2_shape);
+
+  // Copy dims to desc, calculating strides.
+  CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
+  CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
+  CopyDimsToDesc<N>(extended_input2_shape, desc2_out);
+
+  // Walk over each dimension. If the extents are equal do nothing.
+  // Otherwise, set the desc with extent 1 to have extent equal to the other and
+  // stride 0.
+  for (int i = 0; i < N; ++i) {
+    const int extent0 = extended_input0_shape.Dims(i);
+    const int extent1 = extended_input1_shape.Dims(i);
+    const int extent2 = extended_input2_shape.Dims(i);
+
+    int extent = extent0;
+    if (extent1 != 1) extent = extent1;
+    if (extent2 != 1) extent = extent2;
+
+    TFLITE_DCHECK(extent0 == 1 || extent0 == extent);
+    TFLITE_DCHECK(extent1 == 1 || extent1 == extent);
+    TFLITE_DCHECK(extent2 == 1 || extent2 == extent);
+
+    if (!(extent0 == extent1 && extent1 == extent2)) {
+      if (extent0 == 1) {
+        desc0_out->strides[i] = 0;
+        desc0_out->extents[i] = extent;
+      }
+      if (extent1 == 1) {
+        desc1_out->strides[i] = 0;
+        desc1_out->extents[i] = extent;
+      }
+      if (extent2 == 1) {
+        desc2_out->strides[i] = 0;
+        desc2_out->extents[i] = extent;
+      }
+    }
+  }
+}
+
+// Detailed implementation of NDOpsHelper, the indexes must be a zero array.
+// This implementation is equivalent to N nested loops. Ex, if N=4, it can be
+// re-writen as:
+// for (int b = 0; b < output.extents[0]; ++b) {
+//   for (int y = 0; y < output.extents[1]; ++y) {
+//     for (int x = 0; x < output.extents[2]; ++x) {
+//       for (int c = 0; c < output.extents[3]; ++c) {
+//           calc({b,y,x,c});
+//       }
+//     }
+//   }
+// }
+template <int N, int DIM, typename Calc>
+typename std::enable_if<DIM != N - 1, void>::type NDOpsHelperImpl(
+    const NdArrayDesc<N>& output, const Calc& calc, int indexes[N]) {
+  for (indexes[DIM] = 0; indexes[DIM] < output.extents[DIM]; ++indexes[DIM]) {
+    NDOpsHelperImpl<N, DIM + 1, Calc>(output, calc, indexes);
+  }
+}
+
+template <int N, int DIM, typename Calc>
+typename std::enable_if<DIM == N - 1, void>::type NDOpsHelperImpl(
+    const NdArrayDesc<N>& output, const Calc& calc, int indexes[N]) {
+  for (indexes[DIM] = 0; indexes[DIM] < output.extents[DIM]; ++indexes[DIM]) {
+    calc(indexes);
+  }
+}
+
+// Execute the calc function in the innermost iteration based on the shape of
+// the output. The calc function should take a single argument of type int[N].
+template <int N, typename Calc>
+inline void NDOpsHelper(const NdArrayDesc<N>& output, const Calc& calc) {
+  int indexes[N] = {0};
+  NDOpsHelperImpl<N, 0, Calc>(output, calc, indexes);
+}
+// Copied from gemmlowp::RoundDown when we dropped direct dependency on
+// gemmlowp.
+//
+// Returns the runtime argument rounded down to the nearest multiple of
+// the fixed Modulus.
+template <unsigned Modulus, typename Integer>
+Integer RoundDown(Integer i) {
+  return i - (i % Modulus);
+}
+
+// Copied from gemmlowp::RoundUp when we dropped direct dependency on
+// gemmlowp.
+//
+// Returns the runtime argument rounded up to the nearest multiple of
+// the fixed Modulus.
+template <unsigned Modulus, typename Integer>
+Integer RoundUp(Integer i) {
+  return RoundDown<Modulus>(i + Modulus - 1);
+}
+
+// Copied from gemmlowp::CeilQuotient when we dropped direct dependency on
+// gemmlowp.
+//
+// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
+template <typename Integer>
+Integer CeilQuotient(Integer a, Integer b) {
+  return (a + b - 1) / b;
+}
+
+// This function is a copy of gemmlowp::HowManyThreads, copied when we dropped
+// the direct dependency of internal/optimized/ on gemmlowp.
+//
+// It computes a reasonable number of threads to use for a GEMM of shape
+// (rows, cols, depth).
+//
+// TODO(b/131910176): get rid of this function by switching each call site
+// to its own more sensible logic for its own workload.
+template <int KernelRows>
+inline int LegacyHowManyThreads(int max_num_threads, int rows, int cols,
+                                int depth) {
+  // Early-exit in the default case where multi-threading is disabled.
+  if (max_num_threads == 1) {
+    return 1;
+  }
+
+  // Ensure that each thread has KernelRows rows to process, if at all possible.
+  int thread_count = std::min(max_num_threads, rows / KernelRows);
+
+  // Limit the number of threads according to the overall size of the problem.
+  if (thread_count > 1) {
+    // Empirically determined value.
+    static constexpr std::uint64_t min_cubic_size_per_thread = 64 * 1024;
+
+    // We can only multiply two out of three sizes without risking overflow
+    const std::uint64_t cubic_size =
+        std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth);
+
+    thread_count = std::min(
+        thread_count, static_cast<int>(cubic_size / min_cubic_size_per_thread));
+  }
+
+  if (thread_count < 1) {
+    thread_count = 1;
+  }
+
+  assert(thread_count > 0 && thread_count <= max_num_threads);
+  return thread_count;
+}
+
+template <typename T>
+void optimized_ops_preload_l1_stream(const T* ptr) {
+#ifdef __GNUC__
+  // builtin offered by GCC-compatible compilers including clang
+  __builtin_prefetch(ptr, /* 0 means read */ 0, /* 0 means no locality */ 0);
+#else
+  (void)ptr;
+#endif
+}
+
+template <typename T>
+void optimized_ops_preload_l1_keep(const T* ptr) {
+#ifdef __GNUC__
+  // builtin offered by GCC-compatible compilers including clang
+  __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3);
+#else
+  (void)ptr;
+#endif
+}
+
+template <typename T>
+void optimized_ops_prefetch_write_l1_keep(const T* ptr) {
+#ifdef __GNUC__
+  // builtin offered by GCC-compatible compilers including clang
+  __builtin_prefetch(ptr, /* 1 means write */ 1, /* 3 means high locality */ 3);
+#else
+  (void)ptr;
+#endif
+}
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/compatibility.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/compatibility.h
@@ -0,0 +1,112 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/kernels/op_macros.h"
+
+#ifndef TFLITE_DCHECK
+#define TFLITE_DCHECK(condition) (condition) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_EQ
+#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_NE
+#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_GE
+#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_GT
+#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_LE
+#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_LT
+#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+// TODO(ahentz): Clean up: We should stick to the DCHECK versions.
+#ifndef TFLITE_CHECK
+#define TFLITE_CHECK(condition) (condition) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_EQ
+#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_NE
+#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_GE
+#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_GT
+#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_LE
+#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_LT
+#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TF_LITE_STATIC_MEMORY
+// TODO(b/162019032): Consider removing these type-aliases.
+using int8 = std::int8_t;
+using uint8 = std::uint8_t;
+using int16 = std::int16_t;
+using uint16 = std::uint16_t;
+using int32 = std::int32_t;
+using uint32 = std::uint32_t;
+#endif  // !defined(TF_LITE_STATIC_MEMORY)
+
+// TFLITE_DEPRECATED()
+//
+// Duplicated from absl/base/macros.h to avoid pulling in that library.
+// Marks a deprecated class, struct, enum, function, method and variable
+// declarations. The macro argument is used as a custom diagnostic message (e.g.
+// suggestion of a better alternative).
+//
+// Example:
+//
+//   class TFLITE_DEPRECATED("Use Bar instead") Foo {...};
+//   TFLITE_DEPRECATED("Use Baz instead") void Bar() {...}
+//
+// Every usage of a deprecated entity will trigger a warning when compiled with
+// clang's `-Wdeprecated-declarations` option. This option is turned off by
+// default, but the warnings will be reported by clang-tidy.
+#if defined(__clang__) && __cplusplus >= 201103L
+#define TFLITE_DEPRECATED(message) __attribute__((deprecated(message)))
+#endif
+
+#ifndef TFLITE_DEPRECATED
+#define TFLITE_DEPRECATED(message)
+#endif
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/cppmath.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/cppmath.h
@@ -0,0 +1,40 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_CPPMATH_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_CPPMATH_H_
+
+#include <cmath>
+
+namespace tflite {
+
+#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) ||                           \
+    (defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO) || \
+    defined(__ZEPHYR__)
+#define TF_LITE_GLOBAL_STD_PREFIX
+#else
+#define TF_LITE_GLOBAL_STD_PREFIX std
+#endif
+
+#define DECLARE_STD_GLOBAL_SWITCH1(tf_name, std_name) \
+  template <class T>                                  \
+  inline T tf_name(const T x) {                       \
+    return TF_LITE_GLOBAL_STD_PREFIX::std_name(x);    \
+  }
+
+DECLARE_STD_GLOBAL_SWITCH1(TfLiteRound, round);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_CPPMATH_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/max.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/max.h
@@ -0,0 +1,35 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
+
+#include <cmath>
+
+namespace tflite {
+
+#if defined(TF_LITE_USE_GLOBAL_MAX) || defined(__ZEPHYR__)
+inline float TfLiteMax(const float& x, const float& y) {
+  return std::max(x, y);
+}
+#else
+template <class T>
+inline T TfLiteMax(const T& x, const T& y) {
+  return std::fmax(x, y);
+}
+#endif
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/min.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/min.h
@@ -0,0 +1,35 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
+
+#include <cmath>
+
+namespace tflite {
+
+#if defined(TF_LITE_USE_GLOBAL_MIN) || defined(__ZEPHYR__)
+inline float TfLiteMin(const float& x, const float& y) {
+  return std::min(x, y);
+}
+#else
+template <class T>
+inline T TfLiteMin(const T& x, const T& y) {
+  return std::fmin(x, y);
+}
+#endif
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/optimized/neon_check.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/optimized/neon_check.h
@@ -0,0 +1,40 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON
+#define USE_NEON
+#include "NEON_2_SSE.h"
+#endif
+
+// NEON_OR_PORTABLE(SomeFunc, args) calls NeonSomeFunc(args) if USE_NEON is
+// defined, PortableSomeFunc(args) otherwise.
+#ifdef USE_NEON
+// Always use Neon code
+#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__)
+
+#else
+// No NEON available: Use Portable code
+#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__)
+
+#endif  // defined(USE_NEON)
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_CHECK_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/quantization_util.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/quantization_util.h
@@ -0,0 +1,292 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
+
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+// Given the min and max values of a float array, return
+// reasonable quantization parameters to use for this array.
+template <typename T>
+QuantizationParams ChooseQuantizationParams(double rmin, double rmax,
+                                            bool narrow_range) {
+  const T qmin = std::numeric_limits<T>::min() + (narrow_range ? 1 : 0);
+  const T qmax = std::numeric_limits<T>::max();
+  const double qmin_double = qmin;
+  const double qmax_double = qmax;
+  // 0 should always be a representable value. Let's assume that the initial
+  // min,max range contains 0.
+  TFLITE_CHECK_LE(rmin, 0.);
+  TFLITE_CHECK_GE(rmax, 0.);
+  if (rmin == rmax) {
+    // Special case where the min,max range is a point. Should be {0}.
+    TFLITE_CHECK_EQ(rmin, 0.);
+    TFLITE_CHECK_EQ(rmax, 0.);
+    QuantizationParams quantization_params;
+    quantization_params.zero_point = 0;
+    quantization_params.scale = 0.;
+    return quantization_params;
+  }
+
+  // General case.
+  //
+  // First determine the scale.
+  const double scale = (rmax - rmin) / (qmax_double - qmin_double);
+
+  // Zero-point computation.
+  // First the initial floating-point computation. The zero-point can be
+  // determined from solving an affine equation for any known pair
+  // (real value, corresponding quantized value).
+  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+  // The arithmetic error on the zero point computed from either pair
+  // will be roughly machine_epsilon * (sum of absolute values of terms)
+  // so we want to use the variant that adds the smaller terms.
+  const double zero_point_from_min = qmin_double - rmin / scale;
+  const double zero_point_from_max = qmax_double - rmax / scale;
+  const double zero_point_from_min_error =
+      std::abs(qmin_double) + std::abs(rmin / scale);
+  const double zero_point_from_max_error =
+      std::abs(qmax_double) + std::abs(rmax / scale);
+
+  const double zero_point_double =
+      zero_point_from_min_error < zero_point_from_max_error
+          ? zero_point_from_min
+          : zero_point_from_max;
+
+  // Now we need to nudge the zero point to be an integer
+  // (our zero points are integer, and this is motivated by the requirement
+  // to be able to represent the real value "0" exactly as a quantized value,
+  // which is required in multiple places, for example in Im2col with SAME
+  // padding).
+  T nudged_zero_point = 0;
+  if (zero_point_double < qmin_double) {
+    nudged_zero_point = qmin;
+  } else if (zero_point_double > qmax_double) {
+    nudged_zero_point = qmax;
+  } else {
+    nudged_zero_point = static_cast<T>(round(zero_point_double));
+  }
+  // The zero point should always be in the range of quantized value,
+  // [qmin, qmax].
+  TFLITE_CHECK_GE(nudged_zero_point, qmin);
+  TFLITE_CHECK_LE(nudged_zero_point, qmax);
+
+  // Finally, store the result nudged quantization params.
+  QuantizationParams quantization_params;
+  quantization_params.zero_point = nudged_zero_point;
+  quantization_params.scale = scale;
+  return quantization_params;
+}
+
+template <typename T>
+QuantizationParams ChooseQuantizationParams(double rmin, double rmax) {
+  return ChooseQuantizationParams<T>(rmin, rmax, false);
+}
+
+// Converts a floating-point number to an integer. For all inputs x where
+// static_cast<IntOut>(x) is legal according to the C++ standard, the result
+// is identical to that cast (i.e. the result is x with its fractional part
+// truncated whenever that is representable as IntOut).
+//
+// static_cast would cause undefined behavior for the following cases, which
+// have well-defined behavior for this function:
+//
+//  1. If x is NaN, the result is zero.
+//
+//  2. If the truncated form of x is above the representable range of IntOut,
+//     the result is std::numeric_limits<IntOut>::max().
+//
+//  3. If the truncated form of x is below the representable range of IntOut,
+//     the result is std::numeric_limits<IntOut>::min().
+//
+// Note that cases #2 and #3 cover infinities as well as finite numbers.
+//
+// The range of FloatIn must include the range of IntOut, otherwise
+// the results are undefined.
+// TODO(sfeuz): Replace by absl::SafeCast once available.
+template <class IntOut, class FloatIn>
+IntOut SafeCast(FloatIn x) {
+  static_assert(!std::numeric_limits<FloatIn>::is_integer,
+                "FloatIn is integer");
+  static_assert(std::numeric_limits<IntOut>::is_integer,
+                "IntOut is not integer");
+  static_assert(std::numeric_limits<IntOut>::radix == 2, "IntOut is base 2");
+
+  // Special case NaN, for which the logic below doesn't work.
+  if (std::isnan(x)) {
+    return 0;
+  }
+
+  // Negative values all clip to zero for unsigned results.
+  if (!std::numeric_limits<IntOut>::is_signed && x < 0) {
+    return 0;
+  }
+
+  // Handle infinities.
+  if (std::isinf(x)) {
+    return x < 0 ? std::numeric_limits<IntOut>::min()
+                 : std::numeric_limits<IntOut>::max();
+  }
+
+  // Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0),
+  // unless x is zero in which case exp == 0. Note that this implies that the
+  // magnitude of x is strictly less than 2^exp.
+  int exp = 0;
+  std::frexp(x, &exp);
+
+  // Let N be the number of non-sign bits in the representation of IntOut. If
+  // the magnitude of x is strictly less than 2^N, the truncated version of x
+  // is representable as IntOut. The only representable integer for which this
+  // is not the case is kMin for signed types (i.e. -2^N), but that is covered
+  // by the fall-through below.
+  if (exp <= std::numeric_limits<IntOut>::digits) {
+    return x;
+  }
+
+  // Handle numbers with magnitude >= 2^N.
+  return x < 0 ? std::numeric_limits<IntOut>::min()
+               : std::numeric_limits<IntOut>::max();
+}
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of NEGATIVE its exponent ---
+// this is intended as a RIGHT-shift.
+//
+// Restricted to the case where the multiplier < 1 (and non-negative).
+void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
+                                         int32_t* quantized_multiplier,
+                                         int* left_shift);
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of its exponent.
+//
+// Restricted to the case where the multiplier > 1.
+void QuantizeMultiplierGreaterThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int* left_shift);
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of its exponent.
+//
+// Handles an arbitrary positive multiplier. The 'shift' output-value is
+// basically the 'floating-point exponent' of the multiplier:
+// Negative for a right-shift (when the multiplier is <1), positive for a
+// left-shift (when the multiplier is >1)
+void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
+                        int* shift);
+
+// Splits a double input value into a returned fraction, and a shift value from
+// the exponent, using only bitwise and integer operations to support
+// microcontrollers and other environments without floating-point support.
+//
+// This is designed to be a replacement for how std::frexp() is used within the
+// QuantizeMultiplier() function, and so has a different signature than the
+// standard version, returning a 64-bit integer rather than a double. This
+// result has a maximum value of 1<<31, with the fraction expressed as a
+// proportion of that maximum.
+//
+// std::frexp() returns NaNs and infinities unmodified, but since we're
+// returning integers that can't represent those values, instead we return
+// a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64
+// result of 0 for NaNs, std:numeric_limits<int64_t>::max() for +INFINITY, and
+// std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will
+// result in return values that end up truncating some bits at the end,
+// reflecting the loss of precision inherent in denormalization.
+int64_t IntegerFrExp(double input, int* shift);
+
+// Converts an integer fraction in the format produced by IntegerFrExp (where
+// 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an
+// IEEE binary64 double format result. The implementation uses only integer and
+// bitwise operators, so no floating point hardware support or emulation is
+// needed. This is here so quantized operations can run non-time-critical
+// preparation calculations on microcontrollers and other platforms without
+// float support.
+double DoubleFromFractionAndShift(int64_t fraction, int shift);
+
+// Performs a multiplication of two numbers in double format, using only integer
+// and bitwise instructions. This is aimed at supporting housekeeping functions
+// for quantized operations on microcontrollers without floating-point hardware.
+double IntegerDoubleMultiply(double a, double b);
+
+// Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is
+// greater than b. It is implemented using only integer and logical instructions
+// so that it can be easily run on microcontrollers for quantized operations.
+int IntegerDoubleCompare(double a, double b);
+
+// This first creates a multiplier in a double equivalent of
+// Q(input_integer_bits).(31-input_integer_bits) representation, with extra
+// precision in the double's fractional bits.  It then splits the result into
+// significand and exponent.
+void PreprocessSoftmaxScaling(double beta, double input_scale,
+                              int input_integer_bits,
+                              int32_t* quantized_multiplier, int* left_shift);
+// Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated.
+void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
+                                    int input_integer_bits,
+                                    int32_t* quantized_multiplier,
+                                    int* left_shift,
+                                    int32_t* reverse_scaling_divisor,
+                                    int* reverse_scaling_left_shift);
+// Calculate the largest input that will result in a within-bounds intermediate
+// result within MultiplyByQuantizedMultiplierGreaterThanOne.  In other words,
+// it must not overflow before we reduce the value by multiplication by the
+// input multiplier.  The negative radius is used as the minimum difference in
+// Softmax.
+int CalculateInputRadius(int input_integer_bits, int input_left_shift,
+                         int total_signed_bits = 31);
+
+// Nudges a min/max quantization range to ensure zero is zero.
+// Gymnastics with nudged zero point is to ensure that real zero maps to
+// an integer, which is required for e.g. zero-padding in convolutional layers.
+// Outputs nudged_min, nudged_max, nudged_scale.
+void NudgeQuantizationRange(const float min, const float max,
+                            const int quant_min, const int quant_max,
+                            float* nudged_min, float* nudged_max,
+                            float* nudged_scale);
+
+// Fake quantizes (quantizes and dequantizes) input_data using the scale,
+// nudged_min, and nudged_max from NudgeQuantizationRange. This matches the code
+// in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor.
+void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
+                       const float nudged_max, const float* input_data,
+                       float* output_data, const float size);
+
+// If x is approximately a power of two (with any positive or negative
+// exponent), stores that exponent (i.e. log2(x)) in *log2_result, otherwise
+// returns false.
+bool CheckedLog2(const float x, int* log2_result);
+
+// Decomposes an array of double multipliers into a Q0.31 int32 representation
+// of its significand, and shift representation of its exponent.
+//
+// Handles an arbitrary multiplier. The 'shift' output-value is
+// basically the 'floating-point exponent' of the multiplier:
+// Negative for a right-shift (when the multiplier is <1), positive for a
+// left-shift (when the multiplier is >1)
+void QuantizeMultiplierArray(const double* effective_scales, size_t size,
+                             int32_t* effective_scale_significand,
+                             int* effective_shift);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/add.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/add.h
@@ -0,0 +1,454 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+template <typename T>
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const T* input1_data,
+                const RuntimeShape& input2_shape, const T* input2_data,
+                const RuntimeShape& output_shape, T* output_data) {
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] + input2_data[i], params.quantized_activation_min,
+        params.quantized_activation_max);
+  }
+}
+
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const float* input1_data,
+                const RuntimeShape& input2_shape, const float* input2_data,
+                const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; i++) {
+    auto x = input1_data[i] + input2_data[i];
+    output_data[i] = ActivationFunctionWithMinMax(
+        x, params.float_activation_min, params.float_activation_max);
+  }
+}
+
+// Element-wise add that can often be used for inner loop of broadcast add as
+// well as the non-broadcast add.
+
+// This function is used for 8-bit as well as for 16-bit, but the accumulator
+// is 32-bit for both cases. The overflow does not happen due to the
+// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
+template <typename T>
+inline void AddElementwise(int size, const ArithmeticParams& params,
+                           const T* input1_data, const T* input2_data,
+                           T* output_data) {
+  TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());
+
+  for (int i = 0; i < size; ++i) {
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+    const int32_t scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32_t raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sum, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[i] = static_cast<T>(clamped_output);
+  }
+}
+
+// Scalar-broadcast add that can be used for inner loop of more general
+// broadcast add, so that, for example, scalar-broadcast with batch will still
+// be fast.
+inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
+                               uint8_t input1_data, const uint8_t* input2_data,
+                               uint8_t* output_data) {
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+
+  const int32_t input1_val = params.input1_offset + input1_data;
+  const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+  const int32_t scaled_input1_val =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          shifted_input1_val, params.input1_multiplier, params.input1_shift);
+  for (int i = 0; i < size; ++i) {
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32_t raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sum, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[i] = static_cast<uint8_t>(clamped_output);
+  }
+}
+
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const uint8_t* input1_data,
+                const RuntimeShape& input2_shape, const uint8_t* input2_data,
+                const RuntimeShape& output_shape, uint8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void AddGeneralParamScale(const ArithmeticParams& params,
+                                 const RuntimeShape& input1_shape,
+                                 const int16_t* input1_data,
+                                 const RuntimeShape& input2_shape,
+                                 const int16_t* input2_data,
+                                 const RuntimeShape& output_shape,
+                                 int16_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  int max_value = std::numeric_limits<int16_t>::max();
+
+  TFLITE_DCHECK_GT(params.input1_offset, -max_value);
+  TFLITE_DCHECK_GT(params.input2_offset, -max_value);
+  TFLITE_DCHECK_LT(params.input1_offset, max_value);
+  TFLITE_DCHECK_LT(params.input2_offset, max_value);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int16_t* input1_data,
+                const RuntimeShape& input2_shape, const int16_t* input2_data,
+                const RuntimeShape& output_shape, int16_t* output_data,
+                bool pot_scale = true) {
+  if (!pot_scale) {
+    AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
+                         input2_data, output_shape, output_data);
+    return;
+  }
+
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+
+  const int input1_shift = params.input1_shift;
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  const int16_t output_activation_min = params.quantized_activation_min;
+  const int16_t output_activation_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+  TFLITE_DCHECK_LE(input1_shift, 0);
+  TFLITE_DCHECK_LE(params.input2_shift, 0);
+  const int16_t* not_shift_input =
+      input1_shift == 0 ? input1_data : input2_data;
+  const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data;
+  const int input_right_shift =
+      input1_shift == 0 ? -params.input2_shift : -input1_shift;
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+    F0 scaled_input = F0::FromRaw(
+        gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
+    F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
+    const int16_t raw_output = result.raw();
+    const int16_t clamped_output = std::min(
+        output_activation_max, std::max(output_activation_min, raw_output));
+    output_data[i] = clamped_output;
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const float* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const float* input2_data,
+                               const RuntimeShape& output_shape,
+                               float* output_data) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
+                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+                  params.float_activation_min, params.float_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int32_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int32_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int32_t* output_data) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
+                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+                  params.quantized_activation_min,
+                  params.quantized_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// This function is used for 8-bit as well as for 16-bit, but the accumulator
+// is 32-bit for both cases. The overflow does not happen due to the
+// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
+template <typename T>
+inline void BroadcastAdd4DSlow(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32_t input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32_t input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32_t shifted_input1_val =
+              input1_val * (1 << params.left_shift);
+          const int32_t shifted_input2_val =
+              input2_val * (1 << params.left_shift);
+          const int32_t scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, params.input1_multiplier,
+                  params.input1_shift);
+          const int32_t scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, params.input2_multiplier,
+                  params.input2_shift);
+          const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+          const int32_t raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  raw_sum, params.output_multiplier, params.output_shift) +
+              params.output_offset;
+          const int32_t clamped_output =
+              std::min(params.quantized_activation_max,
+                       std::max(params.quantized_activation_min, raw_output));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<T>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
+                                 const RuntimeShape& unswitched_input1_shape,
+                                 const uint8_t* unswitched_input1_data,
+                                 const RuntimeShape& unswitched_input2_shape,
+                                 const uint8_t* unswitched_input2_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8_t* output_data) {
+  ArithmeticParams switched_params = unswitched_params;
+  switched_params.input1_offset = unswitched_params.input2_offset;
+  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
+  switched_params.input1_shift = unswitched_params.input2_shift;
+  switched_params.input2_offset = unswitched_params.input1_offset;
+  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
+  switched_params.input2_shift = unswitched_params.input1_shift;
+
+  const bool use_unswitched =
+      unswitched_params.broadcast_category ==
+      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+  const ArithmeticParams& params =
+      use_unswitched ? unswitched_params : switched_params;
+  const uint8_t* input1_data =
+      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+  const uint8_t* input2_data =
+      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+  // Fivefold nested loops. The second input resets its position for each
+  // iteration of the second loop. The first input resets its position at the
+  // beginning of the fourth loop. The innermost loop is an elementwise add of
+  // sections of the arrays.
+  uint8_t* output_data_ptr = output_data;
+  const uint8_t* input1_data_ptr = input1_data;
+  const uint8_t* input2_data_reset = input2_data;
+  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
+  // between input shapes. y3 for input 1 is always broadcast, and so the
+  // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
+  // Put another way,
+  // input1.shape.FlatSize = y0 * y1 * y2 * y4,
+  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
+  int y0 = params.broadcast_shape[0];
+  int y1 = params.broadcast_shape[1];
+  int y2 = params.broadcast_shape[2];
+  int y3 = params.broadcast_shape[3];
+  int y4 = params.broadcast_shape[4];
+  if (y4 > 1) {
+    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
+    // dimension.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8_t* input2_data_ptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          for (int i3 = 0; i3 < y3; ++i3) {
+            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+                           output_data_ptr);
+            input2_data_ptr += y4;
+            output_data_ptr += y4;
+          }
+          // We have broadcast y4 of input1 data y3 times, and now move on.
+          input1_data_ptr += y4;
+        }
+      }
+      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
+      input2_data_reset = input2_data_ptr;
+    }
+  } else {
+    // Special case of y4 == 1, in which the innermost loop is a single element
+    // and can be combined with the next (y3) as an inner broadcast.
+    //
+    // Note that this handles the case of pure scalar broadcast when
+    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
+    // broadcast with batch (as y2 > 1).
+    //
+    // NOTE The process is the same as the above general case except simplified
+    // for y4 == 1 and the loop over y3 is contained within the
+    // AddScalarBroadcast function.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8_t* input2_data_ptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
+                             output_data_ptr);
+          input2_data_ptr += y3;
+          output_data_ptr += y3;
+          input1_data_ptr += 1;
+        }
+      }
+      input2_data_reset = input2_data_ptr;
+    }
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/arg_min_max.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/arg_min_max.h
@@ -0,0 +1,68 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
+
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+template <typename T1, typename T2, typename T3, typename Cmp>
+void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
+               const T3* input2_data, const RuntimeShape& output_shape,
+               T2* output_data, const Cmp& cmp) {
+  TFLITE_DCHECK_GT(input1_shape.DimensionsCount(), 0);
+  TFLITE_DCHECK_EQ(input1_shape.DimensionsCount() - 1,
+                   output_shape.DimensionsCount());
+  int axis = input2_data[0];
+  if (axis < 0) {
+    axis += input1_shape.DimensionsCount();
+  }
+  const int axis_size = input1_shape.Dims(axis);
+
+  int outer_size = 1;
+  for (int i = 0; i < axis; ++i) {
+    TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i));
+    outer_size *= input1_shape.Dims(i);
+  }
+
+  int inner_size = 1;
+  const int dims_count = input1_shape.DimensionsCount();
+  for (int i = axis + 1; i < dims_count; ++i) {
+    TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i - 1));
+    inner_size *= input1_shape.Dims(i);
+  }
+  for (int outer = 0; outer < outer_size; ++outer) {
+    for (int inner = 0; inner < inner_size; ++inner) {
+      auto min_max_value = input1_data[outer * axis_size * inner_size + inner];
+      T2 min_max_index = 0;
+      for (int i = 1; i < axis_size; ++i) {
+        const auto& curr_value =
+            input1_data[(outer * axis_size + i) * inner_size + inner];
+        if (cmp(curr_value, min_max_value)) {
+          min_max_value = curr_value;
+          min_max_index = static_cast<T2>(i);
+        }
+      }
+      output_data[outer * inner_size + inner] = min_max_index;
+    }
+  }
+}
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ARG_MIN_MAX_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/binary_function.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/binary_function.h
@@ -0,0 +1,84 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BINARY_FUNCTION_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BINARY_FUNCTION_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+// TODO(ycling): Refactoring. Remove BroadcastLogical and use the more
+// generalized and efficient BroadcastBinaryFunction.
+//
+// Also appears to duplicate MinimumMaximum.
+//
+// R: Result type. T1: Input 1 type. T2: Input 2 type.
+template <typename R, typename T1, typename T2>
+inline void BroadcastBinaryFunction4DSlow(
+    const RuntimeShape& unextended_input1_shape, const T1* input1_data,
+    const RuntimeShape& unextended_input2_shape, const T2* input2_data,
+    const RuntimeShape& unextended_output_shape, R* output_data,
+    R (*func)(T1, T2)) {
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
+                                      unextended_input2_shape, &desc1, &desc2);
+
+  for (int b = 0; b < output_shape.Dims(0); ++b) {
+    for (int y = 0; y < output_shape.Dims(1); ++y) {
+      for (int x = 0; x < output_shape.Dims(2); ++x) {
+        for (int c = 0; c < output_shape.Dims(3); ++c) {
+          auto out_idx = Offset(output_shape, b, y, x, c);
+          auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+          auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+          auto in1_val = input1_data[in1_idx];
+          auto in2_val = input2_data[in2_idx];
+          output_data[out_idx] = func(in1_val, in2_val);
+        }
+      }
+    }
+  }
+}
+
+// R: Result type. T1: Input 1 type. T2: Input 2 type.
+// TODO(renjieliu): Refactor other binary functions to use this one.
+template <typename R, typename T1, typename T2>
+inline void BinaryFunction(const RuntimeShape& input1_shape,
+                           const T1* input1_data,
+                           const RuntimeShape& input2_shape,
+                           const T2* input2_data,
+                           const RuntimeShape& output_shape, R* output_data,
+                           R (*func)(T1, T2)) {
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = func(input1_data[i], input2_data[i]);
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BINARY_FUNCTION_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/ceil.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/ceil.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CEIL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CEIL_H_
+
+#include <cmath>
+
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = std::ceil(input_data[i]);
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CEIL_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/comparisons.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/comparisons.h
@@ -0,0 +1,334 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_COMPARISONS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_COMPARISONS_H_
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/string_util.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+template <typename T>
+inline bool EqualFn(T lhs, T rhs) {
+  return lhs == rhs;
+}
+
+template <typename T>
+inline bool NotEqualFn(T lhs, T rhs) {
+  return lhs != rhs;
+}
+
+template <typename T>
+inline bool GreaterFn(T lhs, T rhs) {
+  return lhs > rhs;
+}
+template <typename T>
+inline bool GreaterEqualFn(T lhs, T rhs) {
+  return lhs >= rhs;
+}
+template <typename T>
+inline bool LessFn(T lhs, T rhs) {
+  return lhs < rhs;
+}
+template <typename T>
+inline bool LessEqualFn(T lhs, T rhs) {
+  return lhs <= rhs;
+}
+
+inline bool StringRefEqualFn(const StringRef& lhs, const StringRef& rhs) {
+  if (lhs.len != rhs.len) return false;
+  for (int i = 0; i < lhs.len; ++i) {
+    if (lhs.str[i] != rhs.str[i]) return false;
+  }
+  return true;
+}
+
+inline bool StringRefNotEqualFn(const StringRef& lhs, const StringRef& rhs) {
+  return !StringRefEqualFn(lhs, rhs);
+}
+
+template <typename T>
+using ComparisonFn = bool (*)(T, T);
+
+template <typename T, ComparisonFn<T> F>
+inline void ComparisonImpl(
+    const ComparisonParams& op_params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
+  const int64_t flatsize =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int64_t i = 0; i < flatsize; ++i) {
+    output_data[i] = F(input1_data[i], input2_data[i]);
+  }
+}
+
+inline void ComparisonStringImpl(bool (*F)(const StringRef&, const StringRef&),
+                                 const RuntimeShape& input1_shape,
+                                 const TfLiteTensor* input1,
+                                 const RuntimeShape& input2_shape,
+                                 const TfLiteTensor* input2,
+                                 const RuntimeShape& output_shape,
+                                 bool* output_data) {
+  const int64_t flatsize =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int64_t i = 0; i < flatsize; ++i) {
+    const auto lhs = GetString(input1, i);
+    const auto rhs = GetString(input2, i);
+    output_data[i] = F(lhs, rhs);
+  }
+}
+
+template <ComparisonFn<float> F>
+inline void Comparison(const ComparisonParams& op_params,
+                       const RuntimeShape& input1_shape,
+                       const float* input1_data,
+                       const RuntimeShape& input2_shape,
+                       const float* input2_data,
+                       const RuntimeShape& output_shape, bool* output_data) {
+  ComparisonImpl<float, F>(op_params, input1_shape, input1_data, input2_shape,
+                           input2_data, output_shape, output_data);
+}
+
+template <typename T, ComparisonFn<int32_t> F>
+inline void ComparisonWithScaling(
+    const ComparisonParams& op_params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
+  int left_shift = op_params.left_shift;
+  int32_t input1_offset = op_params.input1_offset;
+  int32_t input1_multiplier = op_params.input1_multiplier;
+  int input1_shift = op_params.input1_shift;
+  int32_t input2_offset = op_params.input2_offset;
+  int32_t input2_multiplier = op_params.input2_multiplier;
+  int input2_shift = op_params.input2_shift;
+
+  const int64_t flatsize =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int64_t i = 0; i < flatsize; ++i) {
+    const int32_t input1_val = input1_offset + input1_data[i];
+    const int32_t input2_val = input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << left_shift);
+    const int32_t scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, input1_multiplier, input1_shift);
+    const int32_t scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, input2_multiplier, input2_shift);
+    output_data[i] = F(scaled_input1_val, scaled_input2_val);
+  }
+}
+
+struct BroadcastComparison4DSlowCommon {
+  const RuntimeShape output_shape;
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+};
+
+inline BroadcastComparison4DSlowCommon BroadcastComparison4DSlowPreprocess(
+    const RuntimeShape& unextended_input1_shape,
+    const RuntimeShape& unextended_input2_shape,
+    const RuntimeShape& unextended_output_shape) {
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
+                                      unextended_input2_shape, &desc1, &desc2);
+  return {RuntimeShape::ExtendedShape(4, unextended_output_shape), desc1,
+          desc2};
+}
+
+template <typename T, ComparisonFn<T> F>
+inline void BroadcastComparison4DSlowImpl(
+    const ComparisonParams& op_params,
+    const RuntimeShape& unextended_input1_shape, const T* input1_data,
+    const RuntimeShape& unextended_input2_shape, const T* input2_data,
+    const RuntimeShape& unextended_output_shape, bool* output_data) {
+  const BroadcastComparison4DSlowCommon dims =
+      BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
+                                          unextended_input2_shape,
+                                          unextended_output_shape);
+
+  for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
+    for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
+      for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
+        for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
+          output_data[Offset(dims.output_shape, b, y, x, c)] =
+              F(input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)],
+                input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)]);
+        }
+      }
+    }
+  }
+}
+
+inline void BroadcastComparison4DSlowStringImpl(
+    bool (*F)(const StringRef&, const StringRef&),
+    const RuntimeShape& unextended_input1_shape, const TfLiteTensor* input1,
+    const RuntimeShape& unextended_input2_shape, const TfLiteTensor* input2,
+    const RuntimeShape& unextended_output_shape, bool* output_data) {
+  const BroadcastComparison4DSlowCommon dims =
+      BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
+                                          unextended_input2_shape,
+                                          unextended_output_shape);
+
+  for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
+    for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
+      for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
+        for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
+          const auto lhs =
+              GetString(input1, SubscriptToIndex(dims.desc1, b, y, x, c));
+          const auto rhs =
+              GetString(input2, SubscriptToIndex(dims.desc2, b, y, x, c));
+          output_data[Offset(dims.output_shape, b, y, x, c)] = F(lhs, rhs);
+        }
+      }
+    }
+  }
+}
+
+template <ComparisonFn<float> F>
+inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
+                                      const RuntimeShape& input1_shape,
+                                      const float* input1_data,
+                                      const RuntimeShape& input2_shape,
+                                      const float* input2_data,
+                                      const RuntimeShape& output_shape,
+                                      bool* output_data) {
+  BroadcastComparison4DSlowImpl<float, F>(op_params, input1_shape, input1_data,
+                                          input2_shape, input2_data,
+                                          output_shape, output_data);
+}
+
+template <typename T, ComparisonFn<int32_t> F>
+inline void BroadcastComparison4DSlowWithScaling(
+    const ComparisonParams& op_params,
+    const RuntimeShape& unextended_input1_shape, const T* input1_data,
+    const RuntimeShape& unextended_input2_shape, const T* input2_data,
+    const RuntimeShape& unextended_output_shape, bool* output_data) {
+  const BroadcastComparison4DSlowCommon dims =
+      BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
+                                          unextended_input2_shape,
+                                          unextended_output_shape);
+
+  int left_shift = op_params.left_shift;
+  int32_t input1_offset = op_params.input1_offset;
+  int32_t input1_multiplier = op_params.input1_multiplier;
+  int input1_shift = op_params.input1_shift;
+  int32_t input2_offset = op_params.input2_offset;
+  int32_t input2_multiplier = op_params.input2_multiplier;
+  int input2_shift = op_params.input2_shift;
+
+  for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
+    for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
+      for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
+        for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
+          const int32_t input1_val =
+              input1_offset +
+              input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)];
+          const int32_t input2_val =
+              input2_offset +
+              input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)];
+          const int32_t shifted_input1_val = input1_val * (1 << left_shift);
+          const int32_t shifted_input2_val = input2_val * (1 << left_shift);
+          const int32_t scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32_t scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, input2_multiplier, input2_shift);
+          output_data[Offset(dims.output_shape, b, y, x, c)] =
+              F(scaled_input1_val, scaled_input2_val);
+        }
+      }
+    }
+  }
+}
+
+#define TFLITE_COMPARISON_OP(name)                                             \
+  inline void name(const ComparisonParams& op_params,                          \
+                   const RuntimeShape& input1_shape, const float* input1_data, \
+                   const RuntimeShape& input2_shape, const float* input2_data, \
+                   const RuntimeShape& output_shape, bool* output_data) {      \
+    Comparison<name##Fn>(op_params, input1_shape, input1_data, input2_shape,   \
+                         input2_data, output_shape, output_data);              \
+  }                                                                            \
+  template <typename T>                                                        \
+  inline void name##NoScaling(                                                 \
+      const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
+      const T* input1_data, const RuntimeShape& input2_shape,                  \
+      const T* input2_data, const RuntimeShape& output_shape,                  \
+      bool* output_data) {                                                     \
+    ComparisonImpl<T, name##Fn>(op_params, input1_shape, input1_data,          \
+                                input2_shape, input2_data, output_shape,       \
+                                output_data);                                  \
+  }                                                                            \
+  template <typename T>                                                        \
+  inline void name##WithScaling(                                               \
+      const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
+      const T* input1_data, const RuntimeShape& input2_shape,                  \
+      const T* input2_data, const RuntimeShape& output_shape,                  \
+      bool* output_data) {                                                     \
+    ComparisonWithScaling<T, name##Fn>(op_params, input1_shape, input1_data,   \
+                                       input2_shape, input2_data,              \
+                                       output_shape, output_data);             \
+  }                                                                            \
+  template <typename T>                                                        \
+  inline void Broadcast4DSlow##name##NoScaling(                                \
+      const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
+      const T* input1_data, const RuntimeShape& input2_shape,                  \
+      const T* input2_data, const RuntimeShape& output_shape,                  \
+      bool* output_data) {                                                     \
+    BroadcastComparison4DSlowImpl<T, name##Fn>(                                \
+        op_params, input1_shape, input1_data, input2_shape, input2_data,       \
+        output_shape, output_data);                                            \
+  }                                                                            \
+  inline void Broadcast4DSlow##name(                                           \
+      const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
+      const float* input1_data, const RuntimeShape& input2_shape,              \
+      const float* input2_data, const RuntimeShape& output_shape,              \
+      bool* output_data) {                                                     \
+    BroadcastComparison4DSlow<name##Fn>(op_params, input1_shape, input1_data,  \
+                                        input2_shape, input2_data,             \
+                                        output_shape, output_data);            \
+  }                                                                            \
+  template <typename T>                                                        \
+  inline void Broadcast4DSlow##name##WithScaling(                              \
+      const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
+      const T* input1_data, const RuntimeShape& input2_shape,                  \
+      const T* input2_data, const RuntimeShape& output_shape,                  \
+      bool* output_data) {                                                     \
+    BroadcastComparison4DSlowWithScaling<T, name##Fn>(                         \
+        op_params, input1_shape, input1_data, input2_shape, input2_data,       \
+        output_shape, output_data);                                            \
+  }
+TFLITE_COMPARISON_OP(Equal);
+TFLITE_COMPARISON_OP(NotEqual);
+TFLITE_COMPARISON_OP(Greater);
+TFLITE_COMPARISON_OP(GreaterEqual);
+TFLITE_COMPARISON_OP(Less);
+TFLITE_COMPARISON_OP(LessEqual);
+#undef TFLITE_COMPARISON_OP
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_COMPARISONS_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/concatenation.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/concatenation.h
@@ -0,0 +1,140 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+template <typename Scalar>
+inline void Concatenation(const ConcatenationParams& params,
+                          const RuntimeShape* const* input_shapes,
+                          const Scalar* const* input_data,
+                          const RuntimeShape& output_shape,
+                          Scalar* output_data) {
+  int axis = params.axis;
+  int inputs_count = params.inputs_count;
+  const int concat_dimensions = output_shape.DimensionsCount();
+  TFLITE_DCHECK_LT(axis, concat_dimensions);
+
+  int64_t concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
+    for (int j = 0; j < concat_dimensions; j++) {
+      if (j != axis) {
+        MatchingDim(*input_shapes[i], j, output_shape, j);
+      }
+    }
+    concat_size += input_shapes[i]->Dims(axis);
+  }
+  TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
+  int64_t outer_size = 1;
+  for (int i = 0; i < axis; ++i) {
+    outer_size *= output_shape.Dims(i);
+  }
+  // For all input arrays,
+  // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+  int64_t base_inner_size = 1;
+  for (int i = axis + 1; i < concat_dimensions; ++i) {
+    base_inner_size *= output_shape.Dims(i);
+  }
+
+  Scalar* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
+      const Scalar* input_ptr = input_data[i] + k * copy_size;
+      memcpy(output_ptr, input_ptr, copy_size * sizeof(Scalar));
+      output_ptr += copy_size;
+    }
+  }
+}
+
+// TODO(prabhumk): This is the same as the optimized implementation.
+// TODO(prabhumk): The quantized implementation of concatentation isn't fully
+// quantized as it takes scale as a floating point value. This should be fixed
+// when optimizng this routine further.
+inline void ConcatenationWithScaling(const ConcatenationParams& params,
+                                     const RuntimeShape* const* input_shapes,
+                                     const uint8_t* const* input_data,
+                                     const RuntimeShape& output_shape,
+                                     uint8_t* output_data) {
+  int axis = params.axis;
+  const int32_t* input_zeropoint = params.input_zeropoint;
+  const float* input_scale = params.input_scale;
+  int inputs_count = params.inputs_count;
+  const int32_t output_zeropoint = params.output_zeropoint;
+  const float output_scale = params.output_scale;
+
+  const int concat_dimensions = output_shape.DimensionsCount();
+  TFLITE_DCHECK_LT(axis, concat_dimensions);
+
+  int64_t concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
+    for (int j = 0; j < concat_dimensions; j++) {
+      if (j != axis) {
+        MatchingDim(*input_shapes[i], j, output_shape, j);
+      }
+    }
+    concat_size += input_shapes[i]->Dims(axis);
+  }
+  TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
+  int64_t outer_size = 1;
+  for (int i = 0; i < axis; ++i) {
+    outer_size *= output_shape.Dims(i);
+  }
+  // For all input arrays,
+  // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+  int64_t base_inner_size = 1;
+  for (int i = axis + 1; i < concat_dimensions; ++i) {
+    base_inner_size *= output_shape.Dims(i);
+  }
+
+  const float inverse_output_scale = 1.f / output_scale;
+  uint8_t* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
+      const uint8_t* input_ptr = input_data[i] + k * copy_size;
+      if (input_zeropoint[i] == output_zeropoint &&
+          input_scale[i] == output_scale) {
+        memcpy(output_ptr, input_ptr, copy_size);
+      } else {
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        for (int j = 0; j < copy_size; ++j) {
+          const int32_t value = static_cast<int32_t>(tflite::TfLiteRound(
+                                    input_ptr[j] * scale + bias)) +
+                                output_zeropoint;
+          output_ptr[j] = static_cast<uint8_t>(
+              std::max<int32_t>(std::min<int32_t>(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONCATENATION_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/conv.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/conv.h
@@ -0,0 +1,262 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
+
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+
+
+namespace tflite {
+
+namespace reference_ops {
+
+
+inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
+                 const float* input_data, const RuntimeShape& filter_shape,
+                 const float* filter_data, const RuntimeShape& bias_shape,
+                 const float* bias_data, const RuntimeShape& output_shape,
+                 float* output_data, const RuntimeShape& im2col_shape,
+                 float* im2col_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  (void)im2col_data;   // only used in optimized code.
+  (void)im2col_shape;  // only used in optimized code.
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          float total = 0.f;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height)) {
+                  float input_value = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  float filter_value =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  total += (input_value * filter_value);
+                }
+              }
+            }
+          }
+          float bias_value = 0.0f;
+          if (bias_data) {
+            bias_value = bias_data[out_channel];
+          }
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              ActivationFunctionWithMinMax(total + bias_value,
+                                           output_activation_min,
+                                           output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
+                 const uint8_t* input_data, const RuntimeShape& filter_shape,
+                 const uint8_t* filter_data, const RuntimeShape& bias_shape,
+                 const int32_t* bias_data, const RuntimeShape& output_shape,
+                 uint8_t* output_data, const RuntimeShape& im2col_shape,
+                 uint8_t* im2col_data, void* cpu_backend_context) {
+  (void)cpu_backend_context;  // only used in optimized code.
+  (void)im2col_data;   // only used in optimized code.
+  (void)im2col_shape;  // only used in optimized code.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          int32_t acc = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height)) {
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  acc +=
+                      (filter_val + filter_offset) * (input_val + input_offset);
+                }
+              }
+            }
+          }
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                              output_shift);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<uint8_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+inline void HybridConvPerChannel(
+    const ConvParams& params, float* scaling_factors_ptr,
+    const RuntimeShape& input_shape, const int8_t* input_data,
+    const RuntimeShape& filter_shape, const int8_t* filter_data,
+    const RuntimeShape& bias_shape, const float* bias_data,
+    const RuntimeShape& output_shape, float* output_data,
+    const RuntimeShape& im2col_shape, int8_t* im2col_data,
+    const float* per_channel_scale, int32_t* input_offset) {
+  (void)im2col_data;   // only used in optimized code.
+  (void)im2col_shape;  // only used in optimized code.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          int32_t acc = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height)) {
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  acc += filter_val * (input_val - input_offset[batch]);
+                }
+              }
+            }
+          }
+          float acc_float =
+              acc * per_channel_scale[out_channel] * scaling_factors_ptr[batch];
+          if (bias_data) {
+            acc_float += bias_data[out_channel];
+          }
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              ActivationFunctionWithMinMax(acc_float, output_activation_min,
+                                           output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h
@@ -0,0 +1,100 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void DepthwiseConv(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const float* input_data, const RuntimeShape& filter_shape,
+    const float* filter_data, const RuntimeShape& bias_shape,
+    const float* bias_data, const RuntimeShape& output_shape,
+    float* output_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int ic = 0; ic < input_depth; ++ic) {
+          for (int m = 0; m < depth_multiplier; m++) {
+            const int oc = m + ic * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            float total = 0.f;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height)) {
+                  float input_value =
+                      input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                  float filter_value = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, oc)];
+                  total += (input_value * filter_value);
+                }
+              }
+            }
+            float bias_value = 0.0f;
+            if (bias_data) {
+              bias_value = bias_data[oc];
+            }
+            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+                ActivationFunctionWithMinMax(total + bias_value,
+                                             output_activation_min,
+                                             output_activation_max);
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // end namespace reference_ops
+}  // end namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -0,0 +1,297 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+
+#include <algorithm>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+// Used in tests and template parameters to control which version of depthwise
+// convolution is called. Primarily for reference code, and specializations
+// forced in tests.
+enum class DepthwiseConvImplementation {
+  // Run all tests against kUseStandardEntry even if also testing another
+  // kernel, since we need to be sure that the main DepthwiseConv() function in
+  // optimized_ops.h dispatches to a correctly-executing kernel.
+  kNone = 0,                 // The "default" option: use the normal
+                             // DepthwiseConv kernel (entry) function.
+  kUseGenericKernel,         // Forced use of generic kernel.
+  kUseNeon3x3,               // 3x3 kernel that uses NEON when available.
+  kUseNeon3x3DotProduct,     // 3x3 kernel that uses dot-product enabled NEON
+                             // when available.
+  kUseCModel3x3DotProduct,   // 3x3 kernel, reference C model that is intended
+                             // to match overall design NEON code.
+  kUseUnwound3x3DotProduct,  // 3x3 kernel, reference C model with unwound loops
+                             // and some arrays.
+  kUseIntrinsics3x3DotProduct,  // 3x3 kernel using NEON intrinsics.
+};
+
+// Category of depthwise convolution output rounding.
+enum class DepthwiseConvOutputRounding {
+  kNone = 0,      // Invalid: specific method must be specified.
+  kAwayFromZero,  // Original method: exact halves rounded away from zero.
+  kUpward,        // Halves towards +infinity: adds 0.5 before truncate.
+  // This is where a future kNearestEven would be placed.
+};
+
+// Category of depthwise convolution depth multiplication.
+enum class DepthwiseConvDepthMultiplication {
+  kNoMultiplication = 0,  // Depth multiplier = 1.
+  kUnitInputDepth,        // Input depth = 1, output depth = depth multiplier.
+};
+
+namespace reference_ops {
+namespace depthwise_conv {
+
+template <DepthwiseConvOutputRounding output_rounding>
+inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier,
+                                  int shift) {
+  TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
+    int32_t x, int32_t quantized_multiplier, int shift) {
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
+    int32_t x, int32_t quantized_multiplier, int shift) {
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  const int left_shift = shift > 0 ? shift : 0;
+  const int right_shift = shift > 0 ? 0 : -shift;
+  const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
+  return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+                                            quantized_multiplier) +
+          rounding_offset) >>
+         right_shift;
+}
+
+template <DepthwiseConvOutputRounding output_rounding>
+struct DepthwiseConvBasicKernel {
+  static inline void Run(
+      const DepthwiseParams& params, const RuntimeShape& input_shape,
+      const uint8_t* input_data, const RuntimeShape& filter_shape,
+      const uint8_t* filter_data, const RuntimeShape& bias_shape,
+      const int32_t* bias_data, const RuntimeShape& output_shape,
+      uint8_t* output_data) {
+    const int stride_width = params.stride_width;
+    const int stride_height = params.stride_height;
+    const int dilation_width_factor = params.dilation_width_factor;
+    const int dilation_height_factor = params.dilation_height_factor;
+    const int pad_width = params.padding_values.width;
+    const int pad_height = params.padding_values.height;
+    const int depth_multiplier = params.depth_multiplier;
+    const int32_t output_activation_min = params.quantized_activation_min;
+    const int32_t output_activation_max = params.quantized_activation_max;
+    const int32_t input_offset = params.input_offset;
+    const int32_t filter_offset = params.weights_offset;
+    const int32_t output_offset = params.output_offset;
+    const int32_t output_multiplier = params.output_multiplier;
+    const int output_shift = params.output_shift;
+    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+    const int input_height = input_shape.Dims(1);
+    const int input_width = input_shape.Dims(2);
+    const int input_depth = input_shape.Dims(3);
+    const int filter_height = filter_shape.Dims(1);
+    const int filter_width = filter_shape.Dims(2);
+    const int output_height = output_shape.Dims(1);
+    const int output_width = output_shape.Dims(2);
+    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+    for (int b = 0; b < batches; ++b) {
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          for (int ic = 0; ic < input_depth; ++ic) {
+            for (int m = 0; m < depth_multiplier; m++) {
+              const int oc = m + ic * depth_multiplier;
+              const int in_x_origin = (out_x * stride_width) - pad_width;
+              const int in_y_origin = (out_y * stride_height) - pad_height;
+              int32_t acc = 0;
+              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                  const int in_x =
+                      in_x_origin + dilation_width_factor * filter_x;
+                  const int in_y =
+                      in_y_origin + dilation_height_factor * filter_y;
+                  // If the location is outside the bounds of the input image,
+                  // use zero as a default value.
+                  if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                      (in_y < input_height)) {
+                    int32_t input_val =
+                        input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                    int32_t filter_val = filter_data[Offset(
+                        filter_shape, 0, filter_y, filter_x, oc)];
+                    acc += (filter_val + filter_offset) *
+                           (input_val + input_offset);
+                  }
+                }
+              }
+              if (bias_data) {
+                acc += bias_data[oc];
+              }
+              acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
+                                                        output_shift);
+              acc += output_offset;
+              acc = std::max(acc, output_activation_min);
+              acc = std::min(acc, output_activation_max);
+              output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+                  static_cast<uint8_t>(acc);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // TODO(b/148596273): Reconcile reference versions, perhaps with common
+  // MultiplyByQuantizedMultiplier or DepthwiseConvRound function.
+  static inline void RunPerChannel(
+      const DepthwiseParams& params, const RuntimeShape& input_shape,
+      const int8_t* input_data, const RuntimeShape& filter_shape,
+      const int8_t* filter_data, const RuntimeShape& bias_shape,
+      const int32_t* bias_data, const RuntimeShape& output_shape,
+      int8_t* output_data) {
+    // Get parameters.
+    // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
+    const int stride_width = params.stride_width;
+    const int stride_height = params.stride_height;
+    const int dilation_width_factor = params.dilation_width_factor;
+    const int dilation_height_factor = params.dilation_height_factor;
+    const int pad_width = params.padding_values.width;
+    const int pad_height = params.padding_values.height;
+    const int depth_multiplier = params.depth_multiplier;
+    const int32_t input_offset = params.input_offset;
+    const int32_t output_offset = params.output_offset;
+    const int32_t output_activation_min = params.quantized_activation_min;
+    const int32_t output_activation_max = params.quantized_activation_max;
+    const int32_t* output_multiplier = params.output_multiplier_per_channel;
+    const int32_t* output_shift = params.output_shift_per_channel;
+
+    // Check dimensions of the tensors.
+    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+    const int input_height = input_shape.Dims(1);
+    const int input_width = input_shape.Dims(2);
+    const int input_depth = input_shape.Dims(3);
+    const int filter_height = filter_shape.Dims(1);
+    const int filter_width = filter_shape.Dims(2);
+    const int output_height = output_shape.Dims(1);
+    const int output_width = output_shape.Dims(2);
+    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+    for (int batch = 0; batch < batches; ++batch) {
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+            for (int m = 0; m < depth_multiplier; ++m) {
+              const int output_channel = m + in_channel * depth_multiplier;
+              const int in_x_origin = (out_x * stride_width) - pad_width;
+              const int in_y_origin = (out_y * stride_height) - pad_height;
+              int32_t acc = 0;
+              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                  const int in_x =
+                      in_x_origin + dilation_width_factor * filter_x;
+                  const int in_y =
+                      in_y_origin + dilation_height_factor * filter_y;
+                  // Zero padding by omitting the areas outside the image.
+                  const bool is_point_inside_image =
+                      (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                      (in_y < input_height);
+                  if (is_point_inside_image) {
+                    int32_t input_val = input_data[Offset(
+                        input_shape, batch, in_y, in_x, in_channel)];
+                    int32_t filter_val = filter_data[Offset(
+                        filter_shape, 0, filter_y, filter_x, output_channel)];
+                    // Accumulate with 32 bits accumulator.
+                    // In the nudging process during model quantization, we
+                    // force real value of 0.0 be represented by a quantized
+                    // value. This guarantees that the input_offset is a int8_t,
+                    // even though it is represented using int32_t. int32_t +=
+                    // int8_t
+                    // * (int8_t - int8_t) so the highest value we can get from
+                    // each accumulation is [-127, 127] * ([-128, 127] -
+                    // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                    // = 14.98, which means we can accumulate at least 2^16
+                    // multiplications without overflow. The accumulator is
+                    // applied to a filter so the accumulation logic will hold
+                    // as long as the filter size (filter_y * filter_x *
+                    // in_channel) does not exceed 2^16, which is the case in
+                    // all the models we have seen so far.
+                    acc += filter_val * (input_val + input_offset);
+                  }
+                }
+              }
+              if (bias_data) {
+                acc += bias_data[output_channel];
+              }
+              acc = DepthwiseConvRound<output_rounding>(
+                  acc, output_multiplier[output_channel],
+                  output_shift[output_channel]);
+              acc += output_offset;
+              acc = std::max(acc, output_activation_min);
+              acc = std::min(acc, output_activation_max);
+              output_data[Offset(output_shape, batch, out_y, out_x,
+                                 output_channel)] = static_cast<int8_t>(acc);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace depthwise_conv
+
+inline void DepthwiseConv(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    uint8_t* output_data) {
+  return depthwise_conv::DepthwiseConvBasicKernel<
+      DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
+                                                       input_data, filter_shape,
+                                                       filter_data, bias_shape,
+                                                       bias_data, output_shape,
+                                                       output_data);
+}
+
+}  // namespace reference_ops
+}  // end namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/dequantize.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/dequantize.h
@@ -0,0 +1,78 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEQUANTIZE_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEQUANTIZE_H_
+
+#include <limits.h>
+
+#include <vector>
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+// Dequantizes into a float without rounding.
+template <typename InputT, typename OutputT>
+inline void Dequantize(const tflite::DequantizationParams& op_params,
+                       const RuntimeShape& input_shape,
+                       const InputT* input_data,
+                       const RuntimeShape& output_shape, OutputT* output_data) {
+  int32_t zero_point = op_params.zero_point;
+  const double scale = op_params.scale;
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    const int32_t val = input_data[i];
+    const OutputT result = static_cast<OutputT>(scale * (val - zero_point));
+    output_data[i] = result;
+  }
+}
+
+// Dequantizes per-channel quantized tensor to float.
+template <typename T>
+inline void PerChannelDequantize(
+    const tflite::PerChannelDequantizationParams& op_params,
+    const RuntimeShape& input_shape, const T* input_data,
+    const RuntimeShape& output_shape, float* output_data) {
+  // Ensure flat size is same.
+  MatchingFlatSize(input_shape, output_shape);
+
+  const int32_t* zero_point = op_params.zero_point;
+  const float* scale = op_params.scale;
+  const int32_t quantized_dimension = op_params.quantized_dimension;
+  const int32_t num_dims = input_shape.DimensionsCount();
+  const int32_t* dims_data = input_shape.DimsData();
+  std::vector<int> current_dim(num_dims, 0);
+
+  do {
+    size_t offset =
+        ReducedOutputOffset(num_dims, reinterpret_cast<const int*>(dims_data),
+                            current_dim.data(), 0, nullptr);
+    const int channel = current_dim[quantized_dimension];
+    const int32_t val = input_data[offset];
+    const float result =
+        static_cast<float>(scale[channel] * (val - zero_point[channel]));
+    output_data[offset] = result;
+  } while (NextIndex(num_dims, reinterpret_cast<const int*>(dims_data),
+                     current_dim.data()));
+}
+
+}  // namespace reference_ops
+
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEQUANTIZE_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/floor.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/floor.h
@@ -0,0 +1,39 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_H_
+
+#include <cmath>
+
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+inline void Floor(const RuntimeShape& input_shape, const float* input_data,
+                  const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = std::floor(input_data[offset]);
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FLOOR_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/fully_connected.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/fully_connected.h
@@ -0,0 +1,320 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const float* input_data, const RuntimeShape& weights_shape,
+    const float* weights_data, const RuntimeShape& bias_shape,
+    const float* bias_data, const RuntimeShape& output_shape,
+    float* output_data) {
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dims_count = output_shape.DimensionsCount();
+  const int weights_dims_count = weights_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+  const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2,
+                                       output_shape, output_dims_count - 1);
+  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      float total = 0.f;
+      for (int d = 0; d < accum_depth; ++d) {
+        total += input_data[b * accum_depth + d] *
+                 weights_data[out_c * accum_depth + d];
+      }
+      float bias_value = 0.0f;
+      if (bias_data) {
+        bias_value = bias_data[out_c];
+      }
+      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
+          total + bias_value, output_activation_min, output_activation_max);
+    }
+  }
+}
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    uint8_t* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32_t acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
+    }
+  }
+}
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(output_offset, 0);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32_t accum = bias_data[out_c];
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; ++d) {
+        int16_t input_val = input_data[b * accum_depth + d] + input_offset;
+        int16_t filter_val =
+            filter_data[out_c * accum_depth + d] + filter_offset;
+        accum += filter_val * input_val;
+      }
+      // Down-scale the final int32_t accumulator to the scale used by our
+      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      accum =
+          MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
+      // Saturate, cast to int16_t, and store to output array.
+      accum = std::max(accum, output_activation_min - output_offset);
+      accum = std::min(accum, output_activation_max - output_offset);
+      accum += output_offset;
+      output_data[out_c + output_depth * b] = accum;
+    }
+  }
+}
+
+inline void ShuffledFullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8_t* input_data, const RuntimeShape& weights_shape,
+    const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data, uint8_t* shuffled_input_workspace_data) {
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
+  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int weights_dim_count = weights_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
+  TFLITE_DCHECK((accum_depth % 16) == 0);
+  TFLITE_DCHECK((output_depth % 4) == 0);
+
+  // Shuffling and xoring of input activations into the workspace buffer
+  uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
+  if (batches == 1) {
+    for (int i = 0; i < accum_depth; i++) {
+      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
+    }
+  } else if (batches == 4) {
+    for (int c = 0; c < accum_depth; c += 16) {
+      for (int b = 0; b < 4; b++) {
+        const uint8_t* src_data_ptr = input_data + b * accum_depth + c;
+        for (int j = 0; j < 16; j++) {
+          uint8_t src_val = *src_data_ptr++;
+          // Flip the sign bit, so that the kernel will only need to
+          // reinterpret these uint8_t values as int8_t, getting for free the
+          // subtraction of the zero_point value 128.
+          uint8_t dst_val = src_val ^ 0x80;
+          *shuffled_input_workspace_ptr++ = dst_val;
+        }
+      }
+    }
+  } else {
+    TFLITE_DCHECK(false);
+    return;
+  }
+
+  // Actual computation
+  if (batches == 1) {
+    int16_t* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8_t values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8_t* shuffled_weights_ptr =
+        reinterpret_cast<const int8_t*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8_t* shuffled_input_data =
+        reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32_t accum[4] = {0};
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int j = 0; j < 16; j++) {
+            int8_t input_val = shuffled_input_data[d + j];
+            int8_t weights_val = *shuffled_weights_ptr++;
+            accum[i] += weights_val * input_val;
+          }
+        }
+      }
+      for (int i = 0; i < 4; i++) {
+        // Add bias value
+        int32_t acc = accum[i] + bias_data[c + i];
+        // Down-scale the final int32_t accumulator to the scale used by our
+        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+        // multiplier and shift here have been pre-computed offline
+        // (e.g. by toco).
+        acc =
+            MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+        // Saturate, cast to int16_t, and store to output array.
+        acc = std::max(acc, output_activation_min);
+        acc = std::min(acc, output_activation_max);
+        output_ptr[c + i] = acc;
+      }
+    }
+  } else if (batches == 4) {
+    int16_t* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8_t values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8_t* shuffled_weights_ptr =
+        reinterpret_cast<const int8_t*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8_t* shuffled_input_data =
+        reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      const int8_t* shuffled_input_ptr = shuffled_input_data;
+      // Accumulation loop.
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32_t accum[4][4];
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          accum[i][b] = 0;
+        }
+      }
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int b = 0; b < 4; b++) {
+            for (int j = 0; j < 16; j++) {
+              int8_t input_val = shuffled_input_ptr[16 * b + j];
+              int8_t weights_val = shuffled_weights_ptr[16 * i + j];
+              accum[i][b] += weights_val * input_val;
+            }
+          }
+        }
+        shuffled_input_ptr += 64;
+        shuffled_weights_ptr += 64;
+      }
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          // Add bias value
+          int32_t acc = accum[i][b] + bias_data[c + i];
+          // Down-scale the final int32_t accumulator to the scale used by our
+          // (16-bit, typically 3 integer bits) fixed-point format. The
+          // quantized multiplier and shift here have been pre-computed offline
+          // (e.g. by toco).
+          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                              output_shift);
+          // Saturate, cast to int16_t, and store to output array.
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_ptr[b * output_depth + c + i] = acc;
+        }
+      }
+    }
+  } else {
+    TFLITE_DCHECK(false);
+    return;
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/hard_swish.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/hard_swish.h
@@ -0,0 +1,166 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
+
+#include "ruy/profiler/instrumentation.h"  // from @ruy
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline int16_t SaturatingLeftShift(int16_t value, int amount) {
+  int32_t result = static_cast<int32_t>(value) * (1 << amount);
+  result = std::min<int32_t>(result, std::numeric_limits<int16_t>::max());
+  result = std::max<int32_t>(result, std::numeric_limits<int16_t>::min());
+  return result;
+}
+
+// Similar to ARM instruction SQDMULH.
+// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except
+// rounding to zero instead of to nearest (SQRDMULH).
+inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) {
+  bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
+  std::int32_t a_32(a);
+  std::int32_t b_32(b);
+  std::int32_t ab_32 = a_32 * b_32;
+  std::int16_t ab_x2_high16 = static_cast<std::int16_t>((ab_32) / (1 << 15));
+  return overflow ? std::numeric_limits<std::int16_t>::max() : ab_x2_high16;
+}
+
+template <typename T>
+inline void HardSwish(const RuntimeShape& input_shape, const T* input_data,
+                      const RuntimeShape& output_shape, T* output_data) {
+  ruy::profiler::ScopeLabel label("ReferenceHardSwish/Float");
+  auto matching_size = MatchingFlatSize(input_shape, output_shape);
+  const T* in_end = input_data + matching_size;
+  for (; input_data < in_end; input_data++, output_data++) {
+    const float in = *input_data;
+    *output_data =
+        in * std::min(static_cast<T>(6), std::max(static_cast<T>(0), in + 3)) /
+        6;
+  }
+}
+
+template <typename T>
+inline void HardSwish(const HardSwishParams& params,
+                      const RuntimeShape& input_shape, const T* input_data,
+                      const RuntimeShape& output_shape, T* output_data) {
+  ruy::profiler::ScopeLabel label("ReferenceHardSwish/Quantized");
+
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    const int16_t input_value = input_data[i] - params.input_zero_point;
+    // Left-shift as much as we can without overflow/saturation to put
+    // significant bits in the high bits of our 16-bit fixedpoint values, so
+    // that fixed-point approximate computations below are as accurate as
+    // possible.
+    const int16_t input_value_on_hires_input_scale = input_value * (1 << 7);
+    // Compute the input value on essentially the output scale, just not
+    // right-shifted yet. This is the value that we'll use in the (x >= +3)
+    // case, and that in the general case we'll multiply against the "relu-ish"
+    // fixed-point multiplier in [0, 1].
+    const int16_t input_value_on_preshift_output_scale =
+        gemmlowp::SaturatingRoundingDoublingHighMul(
+            input_value_on_hires_input_scale,
+            params.output_multiplier_fixedpoint_int16);
+    // Now compute the "relu-ish multiplier". In the (-3 <= x <= +3) case, that
+    // is just an affine rescaling of x from [-3, 3] to [0, 1]. In the general
+    // case, it is just that plus saturation at the boundaries of [-3, 3].
+    // First, we rescale from [-3, 3] to [-1, 1], saturating.
+    // That is done by rescaling the input value with a fixed-point multiplier
+    // (reluish_multiplier_fixedpoint) and bit-shift such that we represent
+    // that input value on the scale where the real value 3.0f is represented
+    // by the quantized value 32768.  (+32768 is actually not representable as
+    // int16_t, so this saturates at +32767, and that is seen empirically to be
+    // a negligible contribution to numerical error/bias).
+    //
+    // This code is careful to correctly implement any magnitude of multiplier,
+    // involving either a right shift or a left shift, with correct saturation
+    // behavior in the left-shift case. This forces this code to be more
+    // complicated, but is necessary for real applications: a partially
+    // trained quantized MobileNet v3-small model that motivated this code
+    // exhibits some large [min, max] range boundaries, of the order of
+    // magnitude of 10 or 100 depending on layers.
+    //
+    // The next few lines are basically just an ordinary
+    // MultiplyByQuantizedMultiplier, except that we are more careful here
+    // about the fine details of saturation when left-shifting, because here
+    // overflow in left-shift is a common case, not an anomaly as
+    // MultiplyByQuantizedMultiplier assumes.
+    int16_t reluish_value = input_value_on_hires_input_scale;
+    // Shift left, saturating, as much as we can while ensuring that this
+    // saturation will not contribute to the result. That is, left shift amount
+    // reduced by 1.
+    if (params.reluish_multiplier_exponent > 0) {
+      reluish_value = SaturatingLeftShift(
+          reluish_value, params.reluish_multiplier_exponent - 1);
+    }
+    // Apply the fixed-point multiplier, dividing the value by a divisor
+    // ranging in [1, 2].
+    reluish_value = gemmlowp::SaturatingRoundingDoublingHighMul(
+        reluish_value, params.reluish_multiplier_fixedpoint_int16);
+    // Apply the last bit of left-shift. Thus, in the left-shifting case, if
+    // any saturation affects the result, it is happening here --- any
+    // saturation having occurred above is overwritten here, not affecting the
+    // result.
+    if (params.reluish_multiplier_exponent > 0) {
+      reluish_value = SaturatingLeftShift(reluish_value, 1);
+    }
+    // Shift right, in the right-shifting case.
+    if (params.reluish_multiplier_exponent < 0) {
+      reluish_value = gemmlowp::RoundingDivideByPOT(
+          reluish_value, -params.reluish_multiplier_exponent);
+    }
+    // At this point we have rescaled the value into a 16bit fixedpoint
+    // reluish_value in [-1, 1].
+    // We now convert that to a 16bit fixedpoint value in [0, 1].
+    reluish_value = (reluish_value + (1 << 15)) >> 1;
+    // Use of SaturatingDoublingHighMul here is important to cancel the biases
+    // from the above SaturatingRoundingDoublingHighMul.
+    //
+    // On a partially trained MobileNet-v3-small,
+    //
+    //                                       | bias on    |  ImageNet
+    //                                       | quantized  |  Top-1
+    // Operation used here                   | values     |  accuracy (50k)
+    // --------------------------------------+------------+-----------
+    // SaturatingDoublingHighMul             | -0.0024    |  58.920
+    // SaturatingRoundingDoublingHighMul     | -0.0067    |  58.064
+    //
+    // In activations_test, this is covered by this testcase:
+    //     QuantizedActivationsOpTest.HardSwishBias
+    //
+    const int16_t preshift_output_value = SaturatingDoublingHighMul(
+        reluish_value, input_value_on_preshift_output_scale);
+    // We were so far operating on the pre-shift output scale. Now we finally
+    // apply that output shift, arriving at the final output scale.
+    int16_t output_value = gemmlowp::RoundingDivideByPOT(
+        preshift_output_value, -params.output_multiplier_exponent);
+    output_value += params.output_zero_point;
+    output_value =
+        std::min<int16_t>(output_value, std::numeric_limits<T>::max());
+    output_value =
+        std::max<int16_t>(output_value, std::numeric_limits<T>::min());
+    output_data[i] = output_value;
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_CONV_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
@@ -0,0 +1,145 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
+
+#include <limits>
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void CheckArithmeticParams(const ArithmeticParams& params) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  // Input offset is negative input zero point. Activation tensors are
+  // asymmetric quantized so they span the full int8 range.
+  TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
+  TFLITE_DCHECK_GE(-params.input2_offset, std::numeric_limits<int8_t>::min());
+  TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
+  TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
+}
+
+// Element-wise add that can often be used for inner loop of broadcast add as
+// well as the non-broadcast add.
+inline void AddElementwise(int size, const ArithmeticParams& params,
+                           const int8_t* input1_data, const int8_t* input2_data,
+                           int8_t* output_data) {
+  CheckArithmeticParams(params);
+
+  for (int i = 0; i < size; ++i) {
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+    const int32_t scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32_t raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sum, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[i] = static_cast<int8_t>(clamped_output);
+  }
+}
+
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  CheckArithmeticParams(params);
+
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int8_t* output_data) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32_t input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32_t input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32_t shifted_input1_val =
+              input1_val * (1 << params.left_shift);
+          const int32_t shifted_input2_val =
+              input2_val * (1 << params.left_shift);
+          const int32_t scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, params.input1_multiplier,
+                  params.input1_shift);
+          const int32_t scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, params.input2_multiplier,
+                  params.input2_shift);
+          const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+          const int32_t raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  raw_sum, params.output_multiplier, params.output_shift) +
+              params.output_offset;
+          const int32_t clamped_output =
+              std::min(params.quantized_activation_max,
+                       std::max(params.quantized_activation_min, raw_output));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<int8_t>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
@@ -0,0 +1,217 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+// Fixed-point per-channel-quantization convolution reference kernel.
+inline void ConvPerChannel(
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data) {
+  // Get parameters.
+  const int32_t input_offset = params.input_offset;  // r = s(q - Z)
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32_t output_offset = params.output_offset;
+
+  // Set min and max value of the output.
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          int32_t acc = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  // Accumulate with 32 bits accumulator.
+                  // In the nudging process during model quantization, we force
+                  // real value of 0.0 be represented by a quantized value. This
+                  // guarantees that the input_offset is a int8_t, even though
+                  // it is represented using int32_t. int32_t += int8_t *
+                  // (int8_t - int8_t) so the highest value we can get from each
+                  // accumulation is [-127, 127] * ([-128, 127] -
+                  // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                  // = 14.98, which means we can accumulate at least 2^16
+                  // multiplications without overflow. The accumulator is
+                  // applied to a filter so the accumulation logic will hold as
+                  // long as the filter size (filter_y * filter_x * in_channel)
+                  // does not exceed 2^16, which is the case in all the models
+                  // we have seen so far.
+                  // TODO(jianlijianli): Add a check to make sure the
+                  // accumulator depth is smaller than 2^16.
+                  acc += filter_val * (input_val + input_offset);
+                }
+              }
+            }
+          }
+
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[out_channel], output_shift[out_channel]);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<int8_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+// Fixed-point per-channel-quantization convolution reference kernel.
+// 16-bit data and 8-bit filter
+inline void ConvPerChannel(
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const std::int64_t* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data) {
+  // Get parameters.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+
+  // Set min and max value of the output.
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          std::int64_t acc = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  // Accumulate with 64 bits accumulator.
+                  // int64_t += int8_t * int16_t so the highest value we can
+                  // get from each accumulation is [-127, 127] * ([-32768,
+                  // 32767] -
+                  // [-32768, 32767]), which is [-8322945, 8322945].
+                  // log2(8322945) = 22.99.
+                  acc += filter_val * input_val;
+                }
+              }
+            }
+          }
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          int32_t scaled_acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[out_channel], output_shift[out_channel]);
+          scaled_acc = std::max(scaled_acc, output_activation_min);
+          scaled_acc = std::min(scaled_acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<int16_t>(scaled_acc);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
@@ -0,0 +1,289 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+inline void DepthwiseConvPerChannel(
+    const DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data) {
+  // Get parameters.
+  // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const int32_t input_offset = params.input_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  // Check dimensions of the tensors.
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            int32_t acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  // Accumulate with 32 bits accumulator.
+                  // In the nudging process during model quantization, we force
+                  // real value of 0.0 be represented by a quantized value. This
+                  // guarantees that the input_offset is a int8_t, even though
+                  // it is represented using int32_t. int32_t += int8_t *
+                  // (int8_t - int8_t) so the highest value we can get from each
+                  // accumulation is [-127, 127] * ([-128, 127] -
+                  // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                  // = 14.98, which means we can accumulate at least 2^16
+                  // multiplications without overflow. The accumulator is
+                  // applied to a filter so the accumulation logic will hold as
+                  // long as the filter size (filter_y * filter_x * in_channel)
+                  // does not exceed 2^16, which is the case in all the models
+                  // we have seen so far.
+                  // TODO(jianlijianli): Add a check to make sure the
+                  // accumulator depth is smaller than 2^16.
+                  acc += filter_val * (input_val + input_offset);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                output_shift[output_channel]);
+            acc += output_offset;
+            acc = std::max(acc, output_activation_min);
+            acc = std::min(acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] = static_cast<int8_t>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void DepthwiseConvPerChannel(
+    const DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const std::int64_t* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data) {
+  // Get parameters.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  // Check dimensions of the tensors.
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            std::int64_t acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  // Accumulate with 64 bits accumulator.
+                  // We assume maximum of 2^16 accumulations as with the 8-bit
+                  // case so actually the value in the accumulator should not
+                  // exceed 40 bits
+                  acc += static_cast<int64_t>(filter_val) *
+                         static_cast<int64_t>(input_val);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            int32_t scaled_acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                output_shift[output_channel]);
+            scaled_acc = std::max(scaled_acc, output_activation_min);
+            scaled_acc = std::min(scaled_acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] =
+                static_cast<int16_t>(scaled_acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void DepthwiseConvHybridPerChannel(
+    const DepthwiseParams& params, float* scaling_factors_ptr,
+    const RuntimeShape& input_shape, const int8_t* input_data,
+    const RuntimeShape& filter_shape, const int8_t* filter_data,
+    const RuntimeShape& bias_shape, const float* bias_data,
+    const RuntimeShape& output_shape, float* output_data,
+    const float* per_channel_scale, int32_t* input_offset) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  // Check dimensions of the tensors.
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int bias_depth = bias_shape.FlatSize();
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_depth, output_depth);
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            int32_t acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  acc += filter_val * (input_val - input_offset[batch]);
+                }
+              }
+            }
+            float acc_float = static_cast<float>(acc);
+            acc_float *=
+                per_channel_scale[output_channel] * scaling_factors_ptr[batch];
+            if (bias_data && output_channel < bias_depth) {
+              acc_float += bias_data[output_channel];
+            }
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] =
+                ActivationFunctionWithMinMax(acc_float, output_activation_min,
+                                             output_activation_max);
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
@@ -0,0 +1,108 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = output_shape.Dims(0);
+  const int output_depth = output_shape.Dims(1);
+  TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32_t acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<int8_t>(acc);
+    }
+  }
+}
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int64_t* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data) {
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = output_shape.Dims(0);
+  const int output_depth = output_shape.Dims(1);
+  TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int64_t acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * input_val;
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      int32_t acc_scaled =
+          MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+      acc_scaled = std::max(acc_scaled, output_activation_min);
+      acc_scaled = std::min(acc_scaled, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<int16_t>(acc_scaled);
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h
@@ -0,0 +1,65 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
+                            int32_t depth, const int8_t* input_data,
+                            int8_t* output_data) {
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  // The output scale must be in sync with Prepare().
+  // Output is in 1/128 scale so the actual output range is nudged from [-1, 1]
+  // to [-1, 127/128].
+  static constexpr int32_t kOutputScale = 7;
+  for (int outer_index = 0; outer_index < outer_size; ++outer_index) {
+    // int32_t = (int8_t - int8_t) ^ 2.
+    // ([-128, 127] - [-128, 127]) ^ 2 = [0, (2^8 - 1)^2] so the accumulator is
+    // safe from overflowing in at least 2^16 steps.
+    int32_t acc = 0;
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input =
+          input_data[depth * outer_index + inner_index] - input_zero_point;
+      acc += input * input;
+    }
+    int32_t inv_l2norm_multiplier;
+    int inv_l2norm_shift;
+    GetInvSqrtQuantizedMultiplierExp(acc, kReverseShift, &inv_l2norm_multiplier,
+                                     &inv_l2norm_shift);
+
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input =
+          input_data[depth * outer_index + inner_index] - input_zero_point;
+
+      // Rescale and downcast. Rescale is folded into the division.
+      int32_t output_in_q24 = MultiplyByQuantizedMultiplier(
+          input, inv_l2norm_multiplier, inv_l2norm_shift + kOutputScale);
+      output_in_q24 =
+          std::min(static_cast<int32_t>(kMaxInt8),
+                   std::max(static_cast<int32_t>(kMinInt8), output_in_q24));
+      output_data[depth * outer_index + inner_index] =
+          static_cast<int8_t>(output_in_q24);
+    }
+  }
+}
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
@@ -0,0 +1,99 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
+
+#include <limits>
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
+                     int32_t input_multiplier, int32_t input_left_shift,
+                     int32_t input_size, const int8_t* input_data,
+                     int8_t* output_data) {
+  // Integer bits must be in sync with Prepare() function.
+  static constexpr int32_t kInputIntegerBits = 4;
+  static constexpr int32_t kOutputIntegerBits = 8;
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  static constexpr int32_t kOutputZeroPoint = -128;
+
+  for (int i = 0; i < input_size; ++i) {
+    const int32_t input =
+        static_cast<int32_t>(input_data[i]) - input_zero_point;
+    if (input <= -input_range_radius) {
+      output_data[i] = kMinInt8;
+    } else if (input >= input_range_radius) {
+      output_data[i] = kMaxInt8;
+    } else {
+      const int32_t input_in_q4 = MultiplyByQuantizedMultiplier(
+          input, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
+      const int32_t output_in_q0 =
+          gemmlowp::logistic(FixedPoint4::FromRaw(input_in_q4)).raw();
+
+      // Rescale and downcast.
+      using gemmlowp::RoundingDivideByPOT;
+      int32_t output_in_q23 =
+          RoundingDivideByPOT(output_in_q0, 31 - kOutputIntegerBits);
+      output_in_q23 = std::min(std::max(output_in_q23 + kOutputZeroPoint,
+                                        static_cast<int32_t>(kMinInt8)),
+                               static_cast<int32_t>(kMaxInt8));
+      output_data[i] = static_cast<int8_t>(output_in_q23);
+    }
+  }
+}
+
+inline void Logistic(int32_t input_multiplier, int32_t input_size,
+                     const int16_t* ptr_input_data, int16_t* ptr_output_data) {
+  // We use the LUT for sigmoid and take into account, that
+  // tanh(x) = 2*sigmoid(2*x) - 1
+
+  int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
+
+  for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
+    int32_t input_data = (*ptr_input_data) * input_data_mul;
+
+    // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and
+    // we do interpolation on unsigned values.
+    uint32_t abs_input_data = 3 * abs(input_data);
+
+    // We divide by 2 power of 9, because
+    // we need to divide by 2 in power of 7 for
+    // the input conversion + 1/4 from the scale above.
+    uint8_t uh = abs_input_data >> 9;
+    uint32_t ua = sigmoid_table_uint16[uh];
+    uint32_t ub = sigmoid_table_uint16[uh + 1];
+    uint32_t ut = abs_input_data & 0x1ff;
+
+    // Interpolation is done using the fractional bit.
+    uint32_t result = (ua << 9) + ut * (ub - ua);
+
+    result = (input_data >= 0) ? (result + (1 << 9))
+                               : ((1 << (16 + 9)) - result + (1 << 9) - 1);
+
+    // Back to 16-bit.
+    result >>= 10;
+
+    *ptr_output_data = result;
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
@@ -0,0 +1,131 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
+
+#include "fixedpoint/fixedpoint.h"
+#include "ruy/profiler/instrumentation.h"  // from @ruy
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+template <typename T>
+inline void MulElementwise(int size, const ArithmeticParams& params,
+                           const T* input1_data, const T* input2_data,
+                           T* output_data) {
+  for (int i = 0; i < size; ++i) {
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplier(input1_val * input2_val,
+                                      params.output_multiplier,
+                                      params.output_shift);
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[i] = static_cast<T>(clamped_output);
+  }
+}
+
+template <typename T>
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const T* input1_data,
+                const RuntimeShape& input2_shape, const T* input2_data,
+                const RuntimeShape& output_shape, T* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  ruy::profiler::ScopeLabel label("Mul/8bit");
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+// Mul with 16 bit inputs and int8_t outputs.
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int16_t* input1_data,
+                const RuntimeShape& input2_shape, const int16_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  ruy::profiler::ScopeLabel label("Mul/Int16Int8");
+  int32_t output_offset = params.output_offset;
+  int32_t output_activation_min = params.quantized_activation_min;
+  int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 unclamped_result =
+        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+    int16_t rescaled_result =
+        gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
+    int16_t clamped_result = std::min<int16_t>(
+        output_activation_max - output_offset, rescaled_result);
+    clamped_result = std::max<int16_t>(output_activation_min - output_offset,
+                                       clamped_result);
+    output_data[i] = output_offset + clamped_result;
+  }
+}
+
+template <typename T>
+inline void BroadcastMul4DSlow(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastMul4DSlow");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  // The input shapes are extended as part of NdArrayDesc initialization.
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32_t input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32_t input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32_t unclamped_result =
+              params.output_offset +
+              MultiplyByQuantizedMultiplier(input1_val * input2_val,
+                                            params.output_multiplier,
+                                            params.output_shift);
+          const int32_t clamped_output = std::min(
+              params.quantized_activation_max,
+              std::max(params.quantized_activation_min, unclamped_result));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<T>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
@@ -0,0 +1,258 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
+
+#include <limits>
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void AveragePool(const PoolParams& params,
+                        const RuntimeShape& input_shape,
+                        const int8_t* input_data,
+                        const RuntimeShape& output_shape, int8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          int32_t acc = 0;
+          int filter_count = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              acc +=
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+              filter_count++;
+            }
+          }
+          // Round to the closest integer value.
+          acc = acc > 0 ? (acc + filter_count / 2) / filter_count
+                        : (acc - filter_count / 2) / filter_count;
+          acc = std::max(acc, params.quantized_activation_min);
+          acc = std::min(acc, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              static_cast<int8_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
+                    const int8_t* input_data, const RuntimeShape& output_shape,
+                    int8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_GE(params.quantized_activation_min,
+                   std::numeric_limits<int8_t>::min());
+  TFLITE_DCHECK_LE(params.quantized_activation_max,
+                   std::numeric_limits<int8_t>::max());
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          int8_t max = std::numeric_limits<int8_t>::lowest();
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(
+                  max,
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+            }
+          }
+          max = std::max<int8_t>(max, params.quantized_activation_min);
+          max = std::min<int8_t>(max, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              static_cast<int8_t>(max);
+        }
+      }
+    }
+  }
+}
+
+inline void AveragePool(const PoolParams& params,
+                        const RuntimeShape& input_shape,
+                        const int16_t* input_data,
+                        const RuntimeShape& output_shape,
+                        int16_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          int32_t acc = 0;
+          int filter_count = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              acc +=
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+              filter_count++;
+            }
+          }
+          // Round to the closest integer value.
+          acc = acc > 0 ? (acc + filter_count / 2) / filter_count
+                        : (acc - filter_count / 2) / filter_count;
+          acc = std::max(acc, params.quantized_activation_min);
+          acc = std::min(acc, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              static_cast<int16_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
+                    const int16_t* input_data, const RuntimeShape& output_shape,
+                    int16_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_GE(params.quantized_activation_min,
+                   std::numeric_limits<int16_t>::min());
+  TFLITE_DCHECK_LE(params.quantized_activation_max,
+                   std::numeric_limits<int16_t>::max());
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          int16_t max = std::numeric_limits<int16_t>::lowest();
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(
+                  max,
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+            }
+          }
+          max = std::max<int16_t>(max, params.quantized_activation_min);
+          max = std::min<int16_t>(max, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              static_cast<int16_t>(max);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
@@ -0,0 +1,106 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
+
+#include <limits>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Tanh(int32_t input_zero_point, int32_t input_range_radius,
+                 int32_t input_multiplier, int32_t input_shift,
+                 int32_t input_size, const int8_t* input_data,
+                 int8_t* output_data) {
+  // Integer bits must be in sync with Prepare() function.
+  static constexpr int32_t kInputIntegerBits = 4;
+  static constexpr int32_t kOutputScale = 7;
+  static constexpr int32_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int32_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  using F4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
+
+  for (int i = 0; i < input_size; ++i) {
+    const int32_t input =
+        static_cast<int32_t>(input_data[i]) - input_zero_point;
+    if (input <= -input_range_radius) {
+      output_data[i] = kMinInt8;
+    } else if (input >= input_range_radius) {
+      output_data[i] = kMaxInt8;
+    } else {
+      const int32_t input_in_q4 =
+          MultiplyByQuantizedMultiplier(input, input_multiplier, input_shift);
+      const int32_t output_in_q0 =
+          gemmlowp::tanh(F4::FromRaw(input_in_q4)).raw();
+
+      // Rescale and downcast.
+      using gemmlowp::RoundingDivideByPOT;
+      int32_t output_in_q24 =
+          RoundingDivideByPOT(output_in_q0, 31 - kOutputScale);
+      output_in_q24 = std::min(std::max(output_in_q24, kMinInt8), kMaxInt8);
+      output_data[i] = static_cast<int8_t>(output_in_q24);
+    }
+  }
+}
+
+inline void Tanh(int32_t input_multiplier, int32_t input_left_shift,
+                 int32_t input_size, const int16_t* ptr_input_data,
+                 int16_t* ptr_output_data) {
+  // We use the LUT for sigmoid and take into account, that
+  // tanh(x) = 2*sigmoid(2*x) - 1
+
+  int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
+
+  for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
+    int32_t input_data = (*ptr_input_data) * input_data_mul;
+
+    if (input_left_shift == 1) {
+      input_data <<= 1;
+    }
+
+    // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
+    uint32_t abs_input_data = 3 * abs(input_data);
+    uint32_t uh = abs_input_data >> 8;
+    int32_t result;
+
+    if (uh >= 255) {
+      // Saturate to maximum.
+      result = 0xFFFF << 8;
+    } else {
+      uint32_t ua = sigmoid_table_uint16[uh];
+      uint32_t ub = sigmoid_table_uint16[uh + 1];
+
+      uint8_t ut = abs_input_data & 0xFF;
+
+      result = (ua << 8) + ut * (ub - ua);
+    }
+
+    result = (input_data >= 0)
+                 ? (result - (1 << (14 + 9)) + (1 << (9 - 2)))
+                 : (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1);
+
+    // Convert back to 16-bit.
+    result >>= (9 - 1);
+
+    *ptr_output_data = result;
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/l2normalization.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/l2normalization.h
@@ -0,0 +1,90 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
+
+#include <algorithm>
+#include <cmath>
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
+                            const RuntimeShape& input_shape,
+                            const float* input_data,
+                            const RuntimeShape& output_shape,
+                            float* output_data, float epsilon = 1e-6) {
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+  for (int i = 0; i < outer_size; ++i) {
+    float squared_l2_norm = 0;
+    for (int c = 0; c < depth; ++c) {
+      const float val = input_data[depth * i + c];
+      squared_l2_norm += val * val;
+    }
+    float l2_norm = std::sqrt(squared_l2_norm);
+    l2_norm = std::max(l2_norm, epsilon);
+    for (int c = 0; c < depth; ++c) {
+      output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
+    }
+  }
+}
+
+inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
+                            const RuntimeShape& input_shape,
+                            const uint8_t* input_data,
+                            const RuntimeShape& output_shape,
+                            uint8_t* output_data) {
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int32_t input_zero_point = op_params.input_zero_point;
+
+  for (int i = 0; i < outer_size; ++i) {
+    int32_t square_l2_norm = 0;
+    for (int c = 0; c < depth; c++) {
+      int32_t diff = input_data[depth * i + c] - input_zero_point;
+      square_l2_norm += diff * diff;
+    }
+    int32_t inv_l2norm_multiplier;
+    int inv_l2norm_shift;
+    GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
+                                     &inv_l2norm_multiplier, &inv_l2norm_shift);
+    for (int c = 0; c < depth; c++) {
+      int32_t diff = input_data[depth * i + c] - input_zero_point;
+      int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+      int32_t unclamped_output_val = 128 + rescaled_diff;
+      int32_t output_val =
+          std::min(static_cast<int32_t>(255),
+                   std::max(static_cast<int32_t>(0), unclamped_output_val));
+      output_data[depth * i + c] = static_cast<uint8_t>(output_val);
+    }
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/logistic.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/logistic.h
@@ -0,0 +1,132 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOGISTIC_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOGISTIC_H_
+
+#include <cmath>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
+                     const RuntimeShape& output_shape, float* output_data) {
+  const float cutoff_upper = 16.619047164916992188f;
+  const float cutoff_lower = -9.f;
+
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  // Rational for using approximation in reference kernel.
+  // 0. This approximation gives enough precision for float.
+  // 1. This works around an issue on an embedded chipset where exp() does not
+  // return correctly as expected - exp(x) should return inf when overflown
+  // not 1.701417   IEEE 754 defines representation for inf.
+  // 2. This will speed up calculation and is matching the behavior in the
+  // optimized kernels. (check the definition of scalar_logistic_op<float>)
+
+  for (int i = 0; i < flat_size; i++) {
+    float val = input_data[i];
+    float result;
+    if (val > cutoff_upper) {
+      result = 1.0f;
+    } else if (val < cutoff_lower) {
+      result = std::exp(val);
+    } else {
+      result = 1.f / (1.f + std::exp(-val));
+    }
+    output_data[i] = result;
+  }
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// uniform between data types.
+inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
+                     const float* input_data, const RuntimeShape& output_shape,
+                     float* output_data) {
+  // Drop params: not needed.
+  Logistic(input_shape, input_data, output_shape, output_data);
+}
+
+inline void Logistic(const LogisticParams& params,
+                     const RuntimeShape& input_shape, const int16_t* input_data,
+                     const RuntimeShape& output_shape, int16_t* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+
+    const F3 input = F3::FromRaw(input_data[i]);
+    F0 output = gemmlowp::logistic(input);
+    output_data[i] = output.raw();
+  }
+}
+
+// Quantized int8_t logistic activation.  Cheats by dequantizing and
+// requantizing around the floating point logistic method.  This implementation
+// is slow on platforms without a floating point unit.
+
+// TODO(b/141211002): Delete this int8_t implementation once we can reuse the
+// approach used in TFLite for int8_t Logistic.
+inline void Logistic(const RuntimeShape& input_shape, const int8_t* input_data,
+                     float input_scale, int input_zero_point,
+                     const RuntimeShape& output_shape, int8_t* output_data,
+                     float output_scale, int output_zero_point) {
+  const float cutoff_upper = 16.619047164916992188f;
+  const float cutoff_lower = -9.f;
+
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  // Rational for using approximation in reference kernel.
+  // 0. This approximation gives enough precision for float.
+  // 1. This works around an issue on an embedded chipset where exp() does not
+  // return correctly as expected - exp(x) should return inf when overflown
+  // not 1.701417   IEEE 754 defines representation for inf.
+  // 2. This will speed up calculation and is matching the behavior in the
+  // optimized kernels. (check the definition of scalar_logistic_op<float>)
+
+  for (int i = 0; i < flat_size; i++) {
+    // Dequantize.
+    float val =
+        static_cast<float>((input_data[i] - input_zero_point) * input_scale);
+    float result;
+    if (val > cutoff_upper) {
+      result = 1.0f;
+    } else if (val < cutoff_lower) {
+      result = std::exp(val);
+    } else {
+      result = 1.f / (1.f + std::exp(-val));
+    }
+    // Requantize
+    int8_t output =
+        static_cast<int8_t>(result / output_scale + output_zero_point);
+    output_data[i] = output;
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LOGISTIC_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/maximum_minimum.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/maximum_minimum.h
@@ -0,0 +1,64 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+template <typename T, typename Op, int N = 5>
+void MaximumMinimumBroadcastSlow(const RuntimeShape& unextended_input1_shape,
+                                 const T* input1_data,
+                                 const RuntimeShape& unextended_input2_shape,
+                                 const T* input2_data,
+                                 const RuntimeShape& unextended_output_shape,
+                                 T* output_data, Op op) {
+  // Uses element-wise calculation if broadcast is not required.
+  if (unextended_input1_shape == unextended_input2_shape) {
+    const int flat_size =
+        MatchingElementsSize(unextended_input1_shape, unextended_input2_shape,
+                             unextended_output_shape);
+    for (int i = 0; i < flat_size; ++i) {
+      output_data[i] = op(input1_data[i], input2_data[i]);
+    }
+  } else {
+    TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
+    TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
+    TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
+
+    NdArrayDesc<N> desc1;
+    NdArrayDesc<N> desc2;
+    NdArrayDesc<N> output_desc;
+    NdArrayDescsForElementwiseBroadcast(
+        unextended_input1_shape, unextended_input2_shape, &desc1, &desc2);
+    CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
+                   &output_desc);
+
+    auto maxmin_func = [&](int indexes[N]) {
+      output_data[SubscriptToIndex(output_desc, indexes)] =
+          op(input1_data[SubscriptToIndex(desc1, indexes)],
+             input2_data[SubscriptToIndex(desc2, indexes)]);
+    };
+    NDOpsHelper<N>(output_desc, maxmin_func);
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MAXIMUM_MINIMUM_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/mul.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/mul.h
@@ -0,0 +1,166 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+// Element-wise mul that can often be used for inner loop of broadcast Mul as
+// well as the non-broadcast Mul.
+inline void MulElementwise(int size, const ArithmeticParams& params,
+                           const uint8_t* input1_data,
+                           const uint8_t* input2_data, uint8_t* output_data) {
+  for (int i = 0; i < size; ++i) {
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplier(input1_val * input2_val,
+                                      params.output_multiplier,
+                                      params.output_shift);
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[i] = static_cast<uint8_t>(clamped_output);
+  }
+}
+
+template <typename T>
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const T* input1_data,
+                const RuntimeShape& input2_shape, const T* input2_data,
+                const RuntimeShape& output_shape, T* output_data) {
+  T output_activation_min;
+  T output_activation_max;
+  GetActivationParams(params, &output_activation_min, &output_activation_max);
+
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] * input2_data[i], output_activation_min,
+        output_activation_max);
+  }
+}
+
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const uint8_t* input1_data,
+                const RuntimeShape& input2_shape, const uint8_t* input2_data,
+                const RuntimeShape& output_shape, uint8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastMul4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const uint8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const uint8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               uint8_t* output_data) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32_t input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32_t input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32_t unclamped_result =
+              params.output_offset +
+              MultiplyByQuantizedMultiplier(input1_val * input2_val,
+                                            params.output_multiplier,
+                                            params.output_shift);
+          const int32_t clamped_output = std::min(
+              params.quantized_activation_max,
+              std::max(params.quantized_activation_min, unclamped_result));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<uint8_t>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void BroadcastMul4DSlow(const ArithmeticParams& params,
+                        const RuntimeShape& unextended_input1_shape,
+                        const T* input1_data,
+                        const RuntimeShape& unextended_input2_shape,
+                        const T* input2_data,
+                        const RuntimeShape& unextended_output_shape,
+                        T* output_data) {
+  T output_activation_min;
+  T output_activation_max;
+  GetActivationParams(params, &output_activation_min, &output_activation_max);
+
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
+                                      unextended_input2_shape, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < output_shape.Dims(0); ++b) {
+    for (int y = 0; y < output_shape.Dims(1); ++y) {
+      for (int x = 0; x < output_shape.Dims(2); ++x) {
+        for (int c = 0; c < output_shape.Dims(3); ++c) {
+          output_data[Offset(output_shape, b, y, x, c)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
+                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/neg.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/neg.h
@@ -0,0 +1,37 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_NEG_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_NEG_H_
+
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+template <typename T>
+inline void Negate(const RuntimeShape& input_shape, const T* input_data,
+                   const RuntimeShape& output_shape, T* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = -input_data[i];
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_NEG_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/pad.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/pad.h
@@ -0,0 +1,162 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PAD_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PAD_H_
+
+#include <vector>
+
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+// TFLite Pad supports activation tensors with up to 4 dimensions.
+constexpr int PadKernelMaxDimensionCount() { return 4; }
+
+// There are two versions of pad: Pad and PadV2.  In PadV2 there is a second
+// scalar input that provides the padding value.  Therefore pad_value_ptr can be
+// equivalent to a simple input1_data.  For Pad, it should point to a zero
+// value.
+//
+// Note that two typenames are required, so that T=P=int32_t is considered a
+// specialization distinct from P=int32_t.
+template <typename T, typename P>
+inline void PadImpl(const tflite::PadParams& op_params,
+                    const RuntimeShape& input_shape, const T* input_data,
+                    const P* pad_value_ptr, const RuntimeShape& output_shape,
+                    T* output_data) {
+  const RuntimeShape ext_input_shape =
+      RuntimeShape::ExtendedShape(PadKernelMaxDimensionCount(), input_shape);
+  const RuntimeShape ext_output_shape =
+      RuntimeShape::ExtendedShape(PadKernelMaxDimensionCount(), output_shape);
+  TFLITE_DCHECK_LE(op_params.left_padding_count, PadKernelMaxDimensionCount());
+  TFLITE_DCHECK_LE(op_params.right_padding_count, PadKernelMaxDimensionCount());
+
+  // Runtime calls are currently fixed at 4 dimensions. Copy inputs so we can
+  // pad them to 4 dims (yes, we are "padding the padding").
+  int left_padding_copy[PadKernelMaxDimensionCount()];
+  for (int i = 0; i < PadKernelMaxDimensionCount(); i++) {
+    left_padding_copy[i] = 0;
+  }
+  for (int i = 0; i < op_params.left_padding_count; ++i) {
+    left_padding_copy[i + PadKernelMaxDimensionCount() -
+                      op_params.left_padding_count] = op_params.left_padding[i];
+  }
+  int right_padding_copy[PadKernelMaxDimensionCount()];
+  for (int i = 0; i < PadKernelMaxDimensionCount(); i++) {
+    right_padding_copy[i] = 0;
+  }
+  for (int i = 0; i < op_params.right_padding_count; ++i) {
+    right_padding_copy[i + PadKernelMaxDimensionCount() -
+                       op_params.right_padding_count] =
+        op_params.right_padding[i];
+  }
+
+  const int output_batch = ext_output_shape.Dims(0);
+  const int output_height = ext_output_shape.Dims(1);
+  const int output_width = ext_output_shape.Dims(2);
+  const int output_depth = ext_output_shape.Dims(3);
+
+  const int left_b_padding = left_padding_copy[0];
+  const int left_h_padding = left_padding_copy[1];
+  const int left_w_padding = left_padding_copy[2];
+  const int left_d_padding = left_padding_copy[3];
+
+  const int right_b_padding = right_padding_copy[0];
+  const int right_h_padding = right_padding_copy[1];
+  const int right_w_padding = right_padding_copy[2];
+  const int right_d_padding = right_padding_copy[3];
+
+  const T pad_value = *pad_value_ptr;
+
+  const T* in_ptr = input_data;
+  T* out_ptr = output_data;
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    for (int out_h = 0; out_h < output_height; ++out_h) {
+      for (int out_w = 0; out_w < output_width; ++out_w) {
+        for (int out_d = 0; out_d < output_depth; ++out_d) {
+          if (out_b < left_b_padding ||
+              out_b >= output_batch - right_b_padding ||
+              out_h < left_h_padding ||
+              out_h >= output_height - right_h_padding ||
+              out_w < left_w_padding ||
+              out_w >= output_width - right_w_padding ||
+              out_d < left_d_padding ||
+              out_d >= output_depth - right_d_padding) {
+            *out_ptr++ = pad_value;
+          } else {
+            *out_ptr++ = *in_ptr++;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename P>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const T* input_data,
+                const P* pad_value_ptr, const RuntimeShape& output_shape,
+                T* output_data) {
+  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+          output_data);
+}
+
+// The second (pad-value) input can be int32_t when, say, the first is uint8_t.
+template <typename T>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const T* input_data,
+                const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
+                T* output_data) {
+  const T converted_pad_value = static_cast<T>(*pad_value_ptr);
+  PadImpl(op_params, input_shape, input_data, &converted_pad_value,
+          output_shape, output_data);
+}
+
+// This version avoids conflicting template matching.
+template <>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const int32_t* input_data,
+                const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
+                int32_t* output_data) {
+  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+          output_data);
+}
+
+template <typename T, typename P>
+inline void PadImageStyle(const tflite::PadParams& op_params,
+                          const RuntimeShape& input_shape, const T* input_data,
+                          const P* pad_value_ptr,
+                          const RuntimeShape& output_shape, T* output_data) {
+  Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+      output_data);
+}
+
+template <typename P>
+inline void PadImageStyle(const tflite::PadParams& op_params,
+                          const RuntimeShape& input_shape,
+                          const float* input_data, const P* pad_value_ptr,
+                          const RuntimeShape& output_shape,
+                          float* output_data) {
+  Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+      output_data);
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PAD_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/pooling.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/pooling.h
@@ -0,0 +1,297 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void AveragePool(const PoolParams& params,
+                        const RuntimeShape& input_shape,
+                        const float* input_data,
+                        const RuntimeShape& output_shape, float* output_data) {
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          float total = 0.f;
+          float filter_count = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              total +=
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+              filter_count++;
+            }
+          }
+          const float average = total / filter_count;
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              ActivationFunctionWithMinMax(average, params.float_activation_min,
+                                           params.float_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void AveragePool(const PoolParams& params,
+                        const RuntimeShape& input_shape,
+                        const uint8_t* input_data,
+                        const RuntimeShape& output_shape,
+                        uint8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          int32_t acc = 0;
+          int filter_count = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              acc +=
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+              filter_count++;
+            }
+          }
+          acc = (acc + filter_count / 2) / filter_count;
+          acc = std::max(acc, params.quantized_activation_min);
+          acc = std::min(acc, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              static_cast<uint8_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
+                   const float* input_data, const RuntimeShape& output_shape,
+                   float* output_data) {
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          float sum_squares = 0.f;
+          int filter_count = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              const float val =
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+              sum_squares += val * val;
+              filter_count++;
+            }
+          }
+          const float l2pool_result = std::sqrt(sum_squares / filter_count);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              ActivationFunctionWithMinMax(l2pool_result,
+                                           params.float_activation_min,
+                                           params.float_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
+                    const float* input_data, const RuntimeShape& output_shape,
+                    float* output_data) {
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          float max = std::numeric_limits<float>::lowest();
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(
+                  max,
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+            }
+          }
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              ActivationFunctionWithMinMax(max, params.float_activation_min,
+                                           params.float_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
+                    const uint8_t* input_data, const RuntimeShape& output_shape,
+                    uint8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_GE(params.quantized_activation_min, 0);
+  TFLITE_DCHECK_LE(params.quantized_activation_max, 255);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          uint8_t max = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(
+                  max,
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+            }
+          }
+          max = std::max<uint8_t>(max, params.quantized_activation_min);
+          max = std::min<uint8_t>(max, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              static_cast<uint8_t>(max);
+        }
+      }
+    }
+  }
+}
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/prelu.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/prelu.h
@@ -0,0 +1,109 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+// Broadcast prelu to output_shape for quantized uint8_t/int8_t data.
+template <typename T>
+inline void BroadcastPrelu4DSlow(
+    const PreluParams& params, const RuntimeShape& input_shape,
+    const T* input_data, const RuntimeShape& alpha_shape, const T* alpha_data,
+    const RuntimeShape& output_shape, T* output_data) {
+  TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          int output_index = Offset(extended_output_shape, b, y, x, c);
+          int input_index = SubscriptToIndex(desc1, b, y, x, c);
+          const int32_t input_value =
+              params.input_offset + input_data[input_index];
+          int32_t output_value;
+          if (input_value >= 0) {
+            output_value = MultiplyByQuantizedMultiplier(
+                input_value, params.output_multiplier_1, params.output_shift_1);
+          } else {
+            auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
+            const int32_t alpha_value =
+                params.alpha_offset + alpha_data[alpha_index];
+
+            output_value = MultiplyByQuantizedMultiplier(
+                input_value * alpha_value, params.output_multiplier_2,
+                params.output_shift_2);
+          }
+          output_value += params.output_offset;
+
+          const int32_t quantized_min = std::numeric_limits<T>::min();
+          const int32_t quantized_max = std::numeric_limits<T>::max();
+          const int32_t clamped_output =
+              std::min(quantized_max, std::max(quantized_min, output_value));
+          output_data[output_index] = static_cast<T>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void Prelu(const PreluParams& params, const RuntimeShape& input_shape,
+                  const T* input_data, const RuntimeShape& alpha_shape,
+                  const T* alpha_data, const RuntimeShape& output_shape,
+                  T* output_data) {
+  const int32_t quantized_min = std::numeric_limits<T>::min();
+  const int32_t quantized_max = std::numeric_limits<T>::max();
+
+  const int flat_size =
+      MatchingElementsSize(input_shape, alpha_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    const int32_t input_value = params.input_offset + input_data[i];
+    int32_t output_value;
+    if (input_value >= 0) {
+      output_value = MultiplyByQuantizedMultiplier(
+          input_value, params.output_multiplier_1, params.output_shift_1);
+    } else {
+      const int32_t alpha_value = params.alpha_offset + alpha_data[i];
+
+      output_value = MultiplyByQuantizedMultiplier(input_value * alpha_value,
+                                                   params.output_multiplier_2,
+                                                   params.output_shift_2);
+    }
+    output_value += params.output_offset;
+
+    const int32_t clamped_output =
+        std::min(quantized_max, std::max(quantized_min, output_value));
+    output_data[i] = static_cast<T>(clamped_output);
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h
@@ -0,0 +1,138 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PROCESS_BROADCAST_SHAPES_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PROCESS_BROADCAST_SHAPES_H_
+
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+// Consolidates dimensions in broadcast inputs, checks for five-fold pattern.
+//
+// For example, if sequence of dimensions of one input is
+// ..., 1, 3, 1, 7, 9, 5,... and the other is ..., 2, 3, 1, 7, 1, 1, ...
+// we can consolidate these as
+// ..., 1, 3*7, 9*5, ... and 2, 3*7, 1.
+//
+// The category is updated in the less-frequent case of shapes that are
+// not suited to a fivefold-loop broadcast.
+//
+// Falls back to generic pattern when it does not know how to process properly.
+//
+// Returns true iff there is some sort of broadcast, which includes five-fold
+// patterns and falling back to generic broadcast.
+inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
+                                   const RuntimeShape& shape1,
+                                   tflite::ArithmeticParams* params) {
+  const int dims_count =
+      std::max(shape0.DimensionsCount(), shape1.DimensionsCount());
+
+  params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+  RuntimeShape scalar_shape(dims_count, 1);
+
+  auto extended_shape0 = RuntimeShape::ExtendedShape(dims_count, shape0);
+  auto extended_shape1 = RuntimeShape::ExtendedShape(dims_count, shape1);
+
+  // Check for "exact" match, implicitly accepting any scalar shapes.
+  if (extended_shape0 == extended_shape1) {
+    params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
+    return false;
+  }
+
+  for (int i = dims_count - 1; i >= 0; --i) {
+    if (extended_shape0.Dims(i) == extended_shape1.Dims(i)) {
+      continue;
+    } else if (extended_shape0.Dims(i) == 1) {
+      params->broadcast_category =
+          BroadcastableOpCategory::kFirstInputBroadcastsFast;
+      break;
+    } else if (extended_shape1.Dims(i) == 1) {
+      params->broadcast_category =
+          BroadcastableOpCategory::kSecondInputBroadcastsFast;
+      break;
+    } else {
+      // This case is erroneous: there is a dimension that does not match and
+      // is not a broadcast from one shape to the other.
+      params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+      return true;
+    }
+  }
+
+  if (params->broadcast_category !=
+          BroadcastableOpCategory::kFirstInputBroadcastsFast &&
+      params->broadcast_category !=
+          BroadcastableOpCategory::kSecondInputBroadcastsFast) {
+    // This is unreachable because at least one else clause in the above loop
+    // must be reached.
+    TFLITE_DCHECK(false);
+    params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
+    return false;
+  }
+
+  // From this point it is assumed contractually that corresponding dimensions
+  // in shape0 and shape1 are either (a) equal or (b) one or other equals 1.
+  const bool swap_inputs = params->broadcast_category ==
+                           BroadcastableOpCategory::kSecondInputBroadcastsFast;
+  const RuntimeShape* shape_a =
+      swap_inputs ? &extended_shape1 : &extended_shape0;
+  const RuntimeShape* shape_b =
+      swap_inputs ? &extended_shape0 : &extended_shape1;
+
+  int i = dims_count - 1;
+  params->broadcast_shape[0] = 1;
+  params->broadcast_shape[1] = 1;
+  params->broadcast_shape[2] = 1;
+  params->broadcast_shape[3] = 1;
+  params->broadcast_shape[4] = 1;
+  // y_0 is greedy: include dims if both or neither equal 1: in other words,
+  // test for equality rather than (shape_a->Dims(i) != 1).
+  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
+    params->broadcast_shape[4] *= shape_b->Dims(i);
+    --i;
+  }
+  // Here either input_a or input_b has dim of 1 (if i >= 0).  If it is input_b
+  // that has the unit dimension, the next two loops are not entered.
+  while (i >= 0 && shape_a->Dims(i) == 1) {
+    params->broadcast_shape[3] *= shape_b->Dims(i);
+    --i;
+  }
+  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
+    params->broadcast_shape[2] *= shape_a->Dims(i);
+    --i;
+  }
+  // Here either input_a or input_b has dim of 1 (if i >= 0).
+  while (i >= 0 && shape_b->Dims(i) == 1) {
+    params->broadcast_shape[1] *= shape_a->Dims(i);
+    --i;
+  }
+  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
+    params->broadcast_shape[0] *= shape_b->Dims(i);
+    --i;
+  }
+
+  // Rarer case is when the broadcast dimensions cannot be handled by a fivefold
+  // loop.
+  if (i >= 0) {
+    params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+  }
+  return true;
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PROCESS_BROADCAST_SHAPES_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/quantize.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/quantize.h
@@ -0,0 +1,55 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
+
+#include <algorithm>
+#include <limits>
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+template <typename InputT, typename OutputT>
+inline void AffineQuantize(const tflite::QuantizationParams& op_params,
+                           const RuntimeShape& input_shape,
+                           const InputT* input_data,
+                           const RuntimeShape& output_shape,
+                           OutputT* output_data) {
+  const int32_t zero_point = op_params.zero_point;
+  const double scale = op_params.scale;
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  static constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
+  static constexpr int32_t max_val = std::numeric_limits<OutputT>::max();
+
+  for (int i = 0; i < flat_size; i++) {
+    const InputT val = input_data[i];
+    int32_t unclamped =
+        static_cast<int32_t>(TfLiteRound(val / static_cast<float>(scale))) +
+        zero_point;
+    int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
+    output_data[i] = clamped;
+  }
+}
+
+}  // namespace reference_ops
+
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/reduce.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/reduce.h
@@ -0,0 +1,405 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
+
+#include "ruy/profiler/instrumentation.h"  // from @ruy
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/max.h"
+#include "tensorflow/lite/kernels/internal/min.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
+// This method iterates through input data and reduce elements along the
+// dimensions given in axis.
+template <typename In, typename Out>
+inline bool Reduce(const In* input_data, const int* input_dims,
+                   const int* output_dims, const int input_num_dims,
+                   const int output_num_dims, const int* axis,
+                   const int num_axis, int* input_iter,
+                   Out reducer(const Out current, const In in),
+                   Out* output_data) {
+  // Reset input iterator.
+  for (int idx = 0; idx < input_num_dims; ++idx) {
+    input_iter[idx] = 0;
+  }
+  // Iterate through input_data.
+  do {
+    size_t input_offset =
+        ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
+    size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
+                                               input_iter, num_axis, axis);
+    output_data[output_offset] =
+        reducer(output_data[output_offset], input_data[input_offset]);
+  } while (NextIndex(input_num_dims, input_dims, input_iter));
+  return true;
+}
+
+// This method parses the input 'axis' to remove duplicates and handle negative
+// values, and returns a valid 'out_axis'
+inline bool ResolveAxis(const int num_dims, const int* axis,
+                        const int64_t num_axis, int* out_axis,
+                        int* out_num_axis) {
+  *out_num_axis = 0;  // Just in case.
+  // Short-circuit axis resolution for scalars; the axis will go unused.
+  if (num_dims == 0) {
+    return true;
+  }
+  // o(n^2) is fine since out_num_axis should be really small, mostly <= 4
+  for (int64_t idx = 0; idx < num_axis; ++idx) {
+    // Handle negative index. A positive index 'p_idx' can be represented as a
+    // negative index 'n_idx' as: n_idx = p_idx-num_dims
+    // eg: For num_dims=3, [0, 1, 2] is the same as [-3, -2, -1]  */
+    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
+    TFLITE_DCHECK(current >= 0 && current < num_dims);
+    bool is_dup = false;
+    for (int j = 0; j < *out_num_axis; ++j) {
+      if (out_axis[j] == current) {
+        is_dup = true;
+        break;
+      }
+    }
+    if (!is_dup) {
+      out_axis[*out_num_axis] = current;
+      *out_num_axis += 1;
+    }
+  }
+  return true;
+}
+
+// This method expects that output_data has been initialized.
+template <typename In, typename Out>
+inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
+                          const int* output_dims, const int input_num_dims,
+                          const int output_num_dims, const int* axis,
+                          const int num_axis, int* input_iter,
+                          Out* output_data) {
+  auto reducer = [](const Out current, const In in) -> Out {
+    const Out actual_in = static_cast<Out>(in);
+    return current + actual_in;
+  };
+  return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims,
+                         output_num_dims, axis, num_axis, input_iter, reducer,
+                         output_data);
+}
+
+template <typename T>
+inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
+                                    const T init_value, T* data) {
+  size_t num_elements = 1;
+  for (int idx = 0; idx < num_dims; ++idx) {
+    size_t current = static_cast<size_t>(dims[idx]);
+    // Overflow prevention.
+    if (num_elements > std::numeric_limits<size_t>::max() / current) {
+      return false;
+    }
+    num_elements *= current;
+  }
+  for (size_t idx = 0; idx < num_elements; ++idx) {
+    data[idx] = init_value;
+  }
+  return true;
+}
+
+// Computes the generic value (i.e., sum/max/min/prod) of elements across
+// dimensions given in axis. It needs to pass in init_value and reducer.
+template <typename T>
+inline bool ReduceGeneric(const T* input_data, const int* input_dims,
+                          const int input_num_dims, T* output_data,
+                          const int* output_dims, const int output_num_dims,
+                          const int* axis, const int64_t num_axis_dimensions,
+                          bool keep_dims, int* temp_index, int* resolved_axis,
+                          T init_value,
+                          T reducer(const T current, const T in)) {
+  // Reset output data.
+  if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
+                               output_data)) {
+    return false;
+  }
+
+  // Resolve axis.
+  int num_resolved_axis = 0;
+  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
+                   &num_resolved_axis)) {
+    return false;
+  }
+
+  return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
+                      output_num_dims, resolved_axis, num_resolved_axis,
+                      temp_index, reducer, output_data);
+}
+
+// Computes the mean of elements across dimensions given in axis.
+// It does so in two stages, first calculates the sum of elements along the axis
+// then divides it by the number of element in axis.
+template <typename T, typename U>
+inline bool Mean(const T* input_data, const int* input_dims,
+                 const int input_num_dims, T* output_data,
+                 const int* output_dims, const int output_num_dims,
+                 const int* axis, const int num_axis_dimensions, bool keep_dims,
+                 int* temp_index, int* resolved_axis, U* temp_sum) {
+  ruy::profiler::ScopeLabel label("Mean");
+  // Reset output data.
+  size_t num_outputs = 1;
+  for (int idx = 0; idx < output_num_dims; ++idx) {
+    size_t current = static_cast<size_t>(output_dims[idx]);
+    // Overflow prevention.
+    if (num_outputs > std::numeric_limits<size_t>::max() / current) {
+      return false;
+    }
+    num_outputs *= current;
+  }
+  for (size_t idx = 0; idx < num_outputs; ++idx) {
+    output_data[idx] = T();
+    temp_sum[idx] = U();
+  }
+
+  // Resolve axis.
+  int num_resolved_axis = 0;
+  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
+                   &num_resolved_axis)) {
+    return false;
+  }
+
+  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
+                           output_num_dims, resolved_axis, num_resolved_axis,
+                           temp_index, temp_sum)) {
+    return false;
+  }
+
+  // Calculate mean by dividing output_data by num of aggregated element.
+  U num_elements_in_axis = 1;
+  for (int idx = 0; idx < num_resolved_axis; ++idx) {
+    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
+    // Overflow prevention.
+    if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
+      return false;
+    }
+    num_elements_in_axis *= current;
+  }
+
+  if (num_elements_in_axis > 0) {
+    for (size_t idx = 0; idx < num_outputs; ++idx) {
+      output_data[idx] =
+          static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
+    }
+  }
+  return true;
+}
+
+template <typename T>
+inline void Mean(const tflite::MeanParams& op_params,
+                 const RuntimeShape& unextended_input_shape,
+                 const T* input_data,
+                 const RuntimeShape& unextended_output_shape, T* output_data) {
+  ruy::profiler::ScopeLabel label("Mean4D");
+
+  // Current implementation only supports dimension equals 4 and simultaneous
+  // reduction over width and height.
+  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  const int output_batch = output_shape.Dims(0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int output_depth = output_shape.Dims(3);
+
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+
+  TFLITE_CHECK_EQ(op_params.axis_count, 2);
+  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
+  TFLITE_CHECK_EQ(output_height, 1);
+  TFLITE_CHECK_EQ(output_width, 1);
+
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    for (int out_d = 0; out_d < output_depth; ++out_d) {
+      float value = 0;
+      for (int in_h = 0; in_h < input_height; ++in_h) {
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
+        }
+      }
+      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+          value / (input_width * input_height);
+    }
+  }
+}
+
+inline void Mean(const tflite::MeanParams& op_params,
+                 const RuntimeShape& unextended_input_shape,
+                 const uint8_t* input_data, int32_t input_zero_point,
+                 float input_scale, const RuntimeShape& unextended_output_shape,
+                 uint8_t* output_data, int32_t output_zero_point,
+                 float output_scale) {
+  ruy::profiler::ScopeLabel label("Mean4D/Uint8");
+
+  // Current implementation only supports dimension equals 4 and simultaneous
+  // reduction over width and height.
+  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+  const int output_batch = output_shape.Dims(0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int output_depth = output_shape.Dims(3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const float num_elements_in_axis = input_width * input_height;
+
+  TFLITE_CHECK_EQ(op_params.axis_count, 2);
+  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
+  TFLITE_CHECK_EQ(output_height, 1);
+  TFLITE_CHECK_EQ(output_width, 1);
+
+  constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
+  constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();
+
+  int32_t bias =
+      output_zero_point -
+      static_cast<int32_t>(input_zero_point * input_scale / output_scale);
+  double real_scale =
+      static_cast<double>(input_scale / (num_elements_in_axis * output_scale));
+
+  int32_t multiplier;
+  int shift;
+  QuantizeMultiplier(real_scale, &multiplier, &shift);
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    for (int out_d = 0; out_d < output_depth; ++out_d) {
+      int32_t acc = 0;
+      for (int in_h = 0; in_h < input_height; ++in_h) {
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
+        }
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
+      acc += bias;
+      acc = std::min(std::max(acc, kMinValue), kMaxValue);
+      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+          static_cast<uint8_t>(acc);
+    }
+  }
+}
+
+// Computes the mean of elements across dimensions given in axis.
+// It does so in two stages, first calculates the sum of elements along the axis
+// then divides it by the number of element in axis for quantized values.
+template <typename T, typename U>
+inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
+                               float input_scale, const int* input_dims,
+                               const int input_num_dims, T* output_data,
+                               int32_t output_zero_point, float output_scale,
+                               const int* output_dims,
+                               const int output_num_dims, const int* axis,
+                               const int num_axis_dimensions, bool keep_dims,
+                               int* temp_index, int* resolved_axis, U* temp_sum,
+                               bool compute_sum) {
+  const bool uint8_case = std::is_same<T, uint8_t>::value;
+  const bool int16_case = std::is_same<T, int16_t>::value;
+  if (uint8_case) {
+    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Uint8" : "Mean/Uint8");
+  } else if (int16_case) {
+    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int16" : "Mean/Int16");
+  } else {
+    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int8" : "Mean/Int8");
+  }
+  // Reset output data.
+  size_t num_outputs = 1;
+  for (int idx = 0; idx < output_num_dims; ++idx) {
+    size_t current = static_cast<size_t>(output_dims[idx]);
+    // Overflow prevention.
+    if (num_outputs > std::numeric_limits<size_t>::max() / current) {
+      return false;
+    }
+    num_outputs *= current;
+  }
+  for (size_t idx = 0; idx < num_outputs; ++idx) {
+    output_data[idx] = T();
+    temp_sum[idx] = U();
+  }
+
+  // Resolve axis.
+  int num_resolved_axis = 0;
+  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
+                   &num_resolved_axis)) {
+    return false;
+  }
+
+  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
+                           output_num_dims, resolved_axis, num_resolved_axis,
+                           temp_index, temp_sum)) {
+    return false;
+  }
+
+  // Calculate mean by dividing output_data by num of aggregated element.
+  U num_elements_in_axis = 1;
+  for (int idx = 0; idx < num_resolved_axis; ++idx) {
+    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
+    // Overflow prevention.
+    if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
+      return false;
+    }
+    num_elements_in_axis *= current;
+  }
+
+  if (num_elements_in_axis > 0) {
+    const float scale = input_scale / output_scale;
+    if (compute_sum) {
+      // TODO(b/116341117): Eliminate float and do this completely in 8bit.
+      const float bias =
+          -input_zero_point * scale * num_elements_in_axis + 0.5f;
+      for (size_t idx = 0; idx < num_outputs; ++idx) {
+        const U value =
+            static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
+            output_zero_point;
+        output_data[idx] = static_cast<T>(value);
+      }
+    } else {
+      const float bias = -input_zero_point * scale + 0.5f;
+      for (size_t idx = 0; idx < num_outputs; ++idx) {
+        float float_mean = static_cast<float>(temp_sum[idx]) /
+                           static_cast<float>(num_elements_in_axis);
+        float result = TfLiteMin(
+            TfLiteRound(float_mean * scale + bias) + output_zero_point,
+            static_cast<float>(std::numeric_limits<T>::max()));
+        result = TfLiteMax(result,
+                           static_cast<float>(std::numeric_limits<T>::min()));
+        output_data[idx] = static_cast<T>(result);
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace reference_ops
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/requantize.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/requantize.h
@@ -0,0 +1,67 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REQUANTIZE_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REQUANTIZE_H_
+
+#include "ruy/profiler/instrumentation.h"  // from @ruy
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+template <typename input_type, typename output_type>
+inline void Requantize(const input_type* input_data, int32_t size,
+                       int32_t effective_scale_multiplier,
+                       int32_t effective_scale_shift, int32_t input_zeropoint,
+                       int32_t output_zeropoint, output_type* output_data) {
+  ruy::profiler::ScopeLabel label("Requantize");
+  const bool same_scale =
+      (effective_scale_multiplier == 1 << 30 && effective_scale_shift == 1);
+  if (same_scale) {
+    const bool mixed_type_int8_uint8 =
+        std::is_same<input_type, int8_t>::value &&
+        std::is_same<output_type, uint8_t>::value;
+    const bool mixed_type_uint8_int8 =
+        std::is_same<input_type, uint8_t>::value &&
+        std::is_same<output_type, int8_t>::value;
+    const int32_t zero_point_diff = input_zeropoint - output_zeropoint;
+    // Fast path to do requantization for the case when just a shift of 128 is
+    // needed.
+    if ((mixed_type_int8_uint8 && zero_point_diff == -128) ||
+        (mixed_type_uint8_int8 && zero_point_diff == 128)) {
+      for (int i = 0; i < size; ++i) {
+        output_data[i] = input_data[i] ^ 0x80;
+      }
+    }
+  }
+  static constexpr int32_t kMinOutput = std::numeric_limits<output_type>::min();
+  static constexpr int32_t kMaxOutput = std::numeric_limits<output_type>::max();
+  for (int i = 0; i < size; ++i) {
+    const int32_t input = input_data[i] - input_zeropoint;
+    const int32_t output =
+        MultiplyByQuantizedMultiplier(input, effective_scale_multiplier,
+                                      effective_scale_shift) +
+        output_zeropoint;
+    const int32_t clamped_output =
+        std::max(std::min(output, kMaxOutput), kMinOutput);
+    output_data[i] = static_cast<output_type>(clamped_output);
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REQUANTIZE_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h
@@ -0,0 +1,101 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_NEAREST_NEIGHBOR_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_NEAREST_NEIGHBOR_H_
+
+#include <cmath>
+
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+inline int32_t GetNearestNeighbor(const int input_value,
+                                  const int32_t input_size,
+                                  const int32_t output_size,
+                                  const bool align_corners,
+                                  const bool half_pixel_centers) {
+  const float scale =
+      (align_corners && output_size > 1)
+          ? (input_size - 1) / static_cast<float>(output_size - 1)
+          : input_size / static_cast<float>(output_size);
+  const float offset = half_pixel_centers ? 0.5f : 0.0f;
+  int32_t output_value = std::min(
+      align_corners
+          ? static_cast<int32_t>(TfLiteRound((input_value + offset) * scale))
+          : static_cast<int32_t>(std::floor((input_value + offset) * scale)),
+      input_size - 1);
+  if (half_pixel_centers) {
+    output_value = std::max(static_cast<int32_t>(0), output_value);
+  }
+  return output_value;
+}
+
+template <typename T>
+inline void ResizeNearestNeighbor(
+    const tflite::ResizeNearestNeighborParams& op_params,
+    const RuntimeShape& unextended_input_shape, const T* input_data,
+    const RuntimeShape& output_size_shape, const int32_t* output_size_data,
+    const RuntimeShape& unextended_output_shape, T* output_data) {
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
+  int32_t input_height = input_shape.Dims(1);
+  int32_t input_width = input_shape.Dims(2);
+  int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
+
+  // The Tensorflow version of this op allows resize on the width and height
+  // axis only.
+  TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
+  int32_t output_height = output_size_data[0];
+  int32_t output_width = output_size_data[1];
+
+  const int col_offset = input_shape.Dims(3);
+  const int row_offset = input_shape.Dims(2) * col_offset;
+  const int batch_offset = input_shape.Dims(1) * row_offset;
+
+  const T* input_ptr = input_data;
+  T* output_ptr = output_data;
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < output_height; ++y) {
+      int32_t in_y = GetNearestNeighbor(y, input_height, output_height,
+                                        op_params.align_corners,
+                                        op_params.half_pixel_centers);
+      const T* y_input_ptr = input_ptr + in_y * row_offset;
+      for (int x = 0; x < output_width; ++x) {
+        int32_t in_x = GetNearestNeighbor(x, input_width, output_width,
+                                          op_params.align_corners,
+                                          op_params.half_pixel_centers);
+        const T* x_input_ptr = y_input_ptr + in_x * col_offset;
+        memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
+        output_ptr += depth;
+      }
+    }
+    input_ptr += batch_offset;
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_RESIZE_NEAREST_NEIGHBOR_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/round.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/round.h
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ROUND_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ROUND_H_
+
+#include <cmath>
+
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+inline float RoundToNearest(float value) {
+  auto floor_val = std::floor(value);
+  auto diff = value - floor_val;
+  if ((diff < 0.5f) ||
+      ((diff == 0.5f) && (static_cast<int>(floor_val) % 2 == 0))) {
+    return floor_val;
+  } else {
+    return floor_val = floor_val + 1.0f;
+  }
+}
+
+inline void Round(const RuntimeShape& input_shape, const float* input_data,
+                  const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    // Note that this implementation matches that of tensorFlow tf.round
+    // and corresponds to the bankers rounding method.
+    // cfenv (for fesetround) is not yet supported universally on Android, so
+    // using a work around.
+    output_data[i] = RoundToNearest(input_data[i]);
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ROUND_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/softmax.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/softmax.h
@@ -0,0 +1,228 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
+
+#include <limits>
+#include <vector>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void Softmax(const SoftmaxParams& params,
+                    const RuntimeShape& input_shape, const float* input_data,
+                    const RuntimeShape& output_shape, float* output_data) {
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+  for (int i = 0; i < outer_size; ++i) {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, input_data[i * depth + c]);
+    }
+
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp((input_data[i * depth + c] - max) *
+                      static_cast<float>(params.beta));
+    }
+
+    // Compute result.
+    for (int c = 0; c < depth; ++c) {
+      output_data[i * depth + c] = std::exp((input_data[i * depth + c] - max) *
+                                            static_cast<float>(params.beta)) /
+                                   sum;
+    }
+  }
+}
+
+// Quantized softmax with int8_t/uint8_t input and int8_t/uint8_t/int16_t
+// output.
+template <typename InputT, typename OutputT>
+inline void Softmax(const SoftmaxParams& params,
+                    const RuntimeShape& input_shape, const InputT* input_data,
+                    const RuntimeShape& output_shape, OutputT* output_data) {
+  const int32_t input_beta_multiplier = params.input_multiplier;
+  const int32_t input_beta_left_shift = params.input_left_shift;
+  const int diff_min = params.diff_min;
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static const int kScaledDiffIntegerBits = 5;
+  static const int kAccumulationIntegerBits = 12;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
+  using FixedPointAccum =
+      gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
+
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+  for (int i = 0; i < outer_size; ++i) {
+    InputT max_in_row = std::numeric_limits<InputT>::min();
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32_t input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    int num_bits_over_unit;
+    FixedPoint0 shifted_scale = FixedPoint0::FromRaw(GetReciprocal(
+        sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));
+
+    for (int c = 0; c < depth; ++c) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32_t input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+        int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
+            (shifted_scale * exp_in_0).raw(),
+            num_bits_over_unit + 31 - (sizeof(OutputT) * 8));
+
+        const int32_t shifted_output =
+            unsat_output +
+            static_cast<int32_t>(std::numeric_limits<OutputT>::min());
+
+        output_data[i * depth + c] = static_cast<OutputT>(std::max(
+            std::min(shifted_output,
+                     static_cast<int32_t>(std::numeric_limits<OutputT>::max())),
+            static_cast<int32_t>(std::numeric_limits<OutputT>::min())));
+      } else {
+        output_data[i * depth + c] = std::numeric_limits<OutputT>::min();
+      }
+    }
+  }
+}
+
+// Quantized softmax with int16_t input and int16_t output.
+inline void SoftmaxInt16(const SoftmaxParams& params,
+                         const RuntimeShape& input_shape,
+                         const int16_t* input_data,
+                         const RuntimeShape& output_shape,
+                         int16_t* output_data) {
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+  for (int i = 0; i < outer_size; ++i) {
+    // Find the largest element
+    int16_t max_in_row = std::numeric_limits<int16_t>::min();
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    // Compute exp(input - max_input)
+    std::vector<int16_t> exp_result_Q015(depth);
+    for (int c = 0; c < depth; ++c) {
+      int32_t input_diff = input_data[i * depth + c] - max_in_row;
+      // scale the input_diff such that [-65535, 0] correspond to [-10.0, 0.0]
+      int32_t scaled_diff = MultiplyByQuantizedMultiplier(
+          input_diff, params.input_multiplier, params.input_left_shift);
+      // recenter to [-32768, 32767]
+      int32_t sym_scaled_diff = scaled_diff + 32767;
+      int16_t sat_sym_scaled_diff =
+          std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
+                   static_cast<int32_t>(32767));
+      // apply the exp() LUT activation function
+      exp_result_Q015[c] =
+          generic_int16_table_lookup(sat_sym_scaled_diff, params.exp_lut);
+    }
+
+    // sum_of_exps is a Q16.15 fixed point format.
+    int32_t sum_of_exps = 0;
+    for (int c = 0; c < depth; ++c) {
+      // Q16.15 + Q0.15
+      sum_of_exps += exp_result_Q015[c];
+    }
+
+    // Compute the reciprocal 1/sum_of_exps
+    uint8_t headroom_plus_one =
+        CountLeadingZeros(static_cast<uint32_t>(sum_of_exps));
+    int32_t shifted_sum =
+        ((static_cast<int64_t>(sum_of_exps) << (headroom_plus_one - 1)) +
+         (1 << 13)) >>
+        14;
+    // since the LUT computes 1/(1 + x) we need to first compute x = (sum - 1).
+    // also, the LUT expects a symmetrical input, so we must also recenter x
+    // from [0, 65535] to [-32768, 32767].
+    int32_t sym_shifted_sum = shifted_sum + (-((1 << 15) + (1 << 16)));
+    int16_t sat_sym_shifted_sum = static_cast<int16_t>(
+        std::min(std::max(sym_shifted_sum, static_cast<int32_t>(-32768)),
+                 static_cast<int32_t>(32767)));
+    // apply 1/(1 + x) LUT activation function
+    int16_t reciprocal_scale_Q015 = generic_int16_table_lookup(
+        sat_sym_shifted_sum, params.one_over_one_plus_x_lut);
+
+    // Rescale the exp_result with reciprocal
+    // range of output is [0, 32767] correspond to [0.0, 1.0]
+    for (int c = 0; c < depth; ++c) {
+      uint8_t right_shift = 31 - headroom_plus_one;
+      int64_t round = 1 << (right_shift - 1);
+      int32_t result = (static_cast<int64_t>(exp_result_Q015[c]) *
+                            static_cast<int64_t>(reciprocal_scale_Q015) +
+                        round) >>
+                       right_shift;
+      output_data[i * depth + c] = static_cast<int16_t>(
+          std::min(std::max(result, static_cast<int32_t>(0)),
+                   static_cast<int32_t>(32767)));
+    }
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/strided_slice.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/strided_slice.h
@@ -0,0 +1,94 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+template <typename T>
+inline void StridedSlice(const tflite::StridedSliceParams& op_params,
+                         const RuntimeShape& unextended_input_shape,
+                         const T* input_data,
+                         const RuntimeShape& unextended_output_shape,
+                         T* output_data) {
+  using strided_slice::LoopCondition;
+  using strided_slice::StartForAxis;
+  using strided_slice::StopForAxis;
+  // Note that the output_shape is not used herein.
+  tflite::StridedSliceParams params_copy = op_params;
+
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 5);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 5);
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(5, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(5, unextended_output_shape);
+
+  // Reverse and pad to 5 dimensions because that is what the runtime code
+  // requires (ie. all shapes must be 5D and are given backwards).
+  strided_slice::StridedSlicePadIndices(&params_copy, 5);
+
+  const int start_0 = StartForAxis(params_copy, input_shape, 0);
+  const int stop_0 = StopForAxis(params_copy, input_shape, 0, start_0);
+  const int start_1 = StartForAxis(params_copy, input_shape, 1);
+  const int stop_1 = StopForAxis(params_copy, input_shape, 1, start_1);
+  const int start_2 = StartForAxis(params_copy, input_shape, 2);
+  const int stop_2 = StopForAxis(params_copy, input_shape, 2, start_2);
+  const int start_3 = StartForAxis(params_copy, input_shape, 3);
+  const int stop_3 = StopForAxis(params_copy, input_shape, 3, start_3);
+  const int start_4 = StartForAxis(params_copy, input_shape, 4);
+  const int stop_4 = StopForAxis(params_copy, input_shape, 4, start_4);
+
+  T* out_ptr = output_data;
+  for (int offset_0 = start_0 * input_shape.Dims(1),
+           end_0 = stop_0 * input_shape.Dims(1),
+           step_0 = params_copy.strides[0] * input_shape.Dims(1);
+       !LoopCondition(offset_0, end_0, params_copy.strides[0]);
+       offset_0 += step_0) {
+    for (int offset_1 = (offset_0 + start_1) * input_shape.Dims(2),
+             end_1 = (offset_0 + stop_1) * input_shape.Dims(2),
+             step_1 = params_copy.strides[1] * input_shape.Dims(2);
+         !LoopCondition(offset_1, end_1, params_copy.strides[1]);
+         offset_1 += step_1) {
+      for (int offset_2 = (offset_1 + start_2) * input_shape.Dims(3),
+               end_2 = (offset_1 + stop_2) * input_shape.Dims(3),
+               step_2 = params_copy.strides[2] * input_shape.Dims(3);
+           !LoopCondition(offset_2, end_2, params_copy.strides[2]);
+           offset_2 += step_2) {
+        for (int offset_3 = (offset_2 + start_3) * input_shape.Dims(4),
+                 end_3 = (offset_2 + stop_3) * input_shape.Dims(4),
+                 step_3 = params_copy.strides[3] * input_shape.Dims(4);
+             !LoopCondition(offset_3, end_3, params_copy.strides[3]);
+             offset_3 += step_3) {
+          for (int offset_4 = offset_3 + start_4, end_4 = offset_3 + stop_4;
+               !LoopCondition(offset_4, end_4, params_copy.strides[4]);
+               offset_4 += params_copy.strides[4]) {
+            *out_ptr++ = input_data[offset_4];
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/sub.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/sub.h
@@ -0,0 +1,516 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <limits>
+
+#include "ruy/profiler/instrumentation.h"  // from @ruy
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+namespace reference_ops {
+
+inline void SubNonBroadcast(const ArithmeticParams& params,
+                            const RuntimeShape& input1_shape,
+                            const float* input1_data,
+                            const RuntimeShape& input2_shape,
+                            const float* input2_data,
+                            const RuntimeShape& output_shape,
+                            float* output_data) {
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], params.float_activation_min,
+        params.float_activation_max);
+  }
+}
+
+inline void SubNonBroadcast(const ArithmeticParams& params,
+                            const RuntimeShape& input1_shape,
+                            const int32_t* input1_data,
+                            const RuntimeShape& input2_shape,
+                            const int32_t* input2_data,
+                            const RuntimeShape& output_shape,
+                            int32_t* output_data) {
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], params.quantized_activation_min,
+        params.quantized_activation_max);
+  }
+}
+
+// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(b/151345101): BroadcastSub is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <int N = 5>
+inline void BroadcastSubSlow(const ArithmeticParams& params,
+                             const RuntimeShape& input1_shape,
+                             const float* input1_data,
+                             const RuntimeShape& input2_shape,
+                             const float* input2_data,
+                             const RuntimeShape& output_shape,
+                             float* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/float");
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
+  NdArrayDesc<N> desc1;
+  NdArrayDesc<N> desc2;
+  NdArrayDesc<N> output_desc;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  auto sub_func = [&](int indexes[N]) {
+    output_data[SubscriptToIndex(output_desc, indexes)] =
+        ActivationFunctionWithMinMax(
+            input1_data[SubscriptToIndex(desc1, indexes)] -
+                input2_data[SubscriptToIndex(desc2, indexes)],
+            params.float_activation_min, params.float_activation_max);
+  };
+  NDOpsHelper<N>(output_desc, sub_func);
+}
+
+template <int N = 5>
+inline void BroadcastSubSlow(const ArithmeticParams& params,
+                             const RuntimeShape& input1_shape,
+                             const uint8_t* input1_data,
+                             const RuntimeShape& input2_shape,
+                             const uint8_t* input2_data,
+                             const RuntimeShape& output_shape,
+                             uint8_t* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8_t");
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
+  NdArrayDesc<N> desc1;
+  NdArrayDesc<N> desc2;
+  NdArrayDesc<N> output_desc;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  auto sub_func = [&](int indexes[N]) {
+    const int32_t input1_val =
+        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
+    const int32_t input2_val =
+        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+    const int32_t scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+    const int32_t raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sub, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[SubscriptToIndex(output_desc, indexes)] =
+        static_cast<uint8_t>(clamped_output);
+  };
+  NDOpsHelper<N>(output_desc, sub_func);
+}
+
+template <int N = 5>
+inline void BroadcastSubSlow(const ArithmeticParams& params,
+                             const RuntimeShape& input1_shape,
+                             const int32_t* input1_data,
+                             const RuntimeShape& input2_shape,
+                             const int32_t* input2_data,
+                             const RuntimeShape& output_shape,
+                             int32_t* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
+  NdArrayDesc<N> desc1;
+  NdArrayDesc<N> desc2;
+  NdArrayDesc<N> output_desc;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  auto sub_func = [&](int indexes[N]) {
+    output_data[SubscriptToIndex(output_desc, indexes)] =
+        ActivationFunctionWithMinMax(
+            input1_data[SubscriptToIndex(desc1, indexes)] -
+                input2_data[SubscriptToIndex(desc2, indexes)],
+            params.quantized_activation_min, params.quantized_activation_max);
+  };
+  NDOpsHelper<N>(output_desc, sub_func);
+}
+
+template <int N = 5>
+inline void BroadcastSubSlow(const ArithmeticParams& params,
+                             const RuntimeShape& input1_shape,
+                             const int8_t* input1_data,
+                             const RuntimeShape& input2_shape,
+                             const int8_t* input2_data,
+                             const RuntimeShape& output_shape,
+                             int8_t* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8_t");
+  NdArrayDesc<N> desc1;
+  NdArrayDesc<N> desc2;
+  NdArrayDesc<N> output_desc;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  auto sub_func = [&](int indexes[N]) {
+    const int32_t input1_val =
+        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
+    const int32_t input2_val =
+        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+    const int32_t scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+    const int32_t raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sub, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[SubscriptToIndex(output_desc, indexes)] =
+        static_cast<int8_t>(clamped_output);
+  };
+  NDOpsHelper<N>(output_desc, sub_func);
+}
+
+template <int N = 5>
+void BroadcastSubSlow(const ArithmeticParams& params,
+                      const RuntimeShape& input1_shape,
+                      const int64_t* input1_data,
+                      const RuntimeShape& input2_shape,
+                      const int64_t* input2_data,
+                      const RuntimeShape& output_shape, int64_t* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
+  NdArrayDesc<N> desc1;
+  NdArrayDesc<N> desc2;
+  NdArrayDesc<N> output_desc;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  auto sub_func = [&](int indexes[N]) {
+    output_data[SubscriptToIndex(output_desc, indexes)] =
+        ActivationFunctionWithMinMax(
+            input1_data[SubscriptToIndex(desc1, indexes)] -
+                input2_data[SubscriptToIndex(desc2, indexes)],
+            params.int64_activation_min, params.int64_activation_max);
+  };
+  NDOpsHelper<N>(output_desc, sub_func);
+}
+
+template <typename T, int N = 5>
+void BroadcastSubSlow(const ArithmeticParams& params,
+                      const RuntimeShape& input1_shape, const T* input1_data,
+                      const RuntimeShape& input2_shape, const T* input2_data,
+                      const RuntimeShape& output_shape, T* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/templated");
+  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
+  NdArrayDesc<N> desc1;
+  NdArrayDesc<N> desc2;
+  NdArrayDesc<N> output_desc;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  auto sub_func = [&](int indexes[N]) {
+    output_data[SubscriptToIndex(output_desc, indexes)] =
+        ActivationFunctionWithMinMax(
+            input1_data[SubscriptToIndex(desc1, indexes)] -
+                input2_data[SubscriptToIndex(desc2, indexes)],
+            params.quantized_activation_min, params.quantized_activation_max);
+  };
+  NDOpsHelper<N>(output_desc, sub_func);
+}
+
+// Element-wise Sub that can often be used for inner loop of broadcast sub as
+// well as the non-broadcast sub.
+inline void SubElementwise(int size, const ArithmeticParams& params,
+                           const uint8_t* input1_data,
+                           const uint8_t* input2_data, uint8_t* output_data) {
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+
+  for (int i = 0; i < size; ++i) {
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+    const int32_t scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+    const int32_t raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sub, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[i] = static_cast<uint8_t>(clamped_output);
+  }
+}
+
+// Element-wise add that can often be used for inner loop of broadcast add as
+// well as the non-broadcast add.
+inline void SubElementwise(int size, const ArithmeticParams& params,
+                           const int8_t* input1_data, const int8_t* input2_data,
+                           int8_t* output_data) {
+  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
+  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
+  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
+
+  for (int i = 0; i < size; ++i) {
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+    const int32_t scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+    const int32_t raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sub, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32_t clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[i] = static_cast<int8_t>(clamped_output);
+  }
+}
+
+inline void Sub(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const uint8_t* input1_data,
+                const RuntimeShape& input2_shape, const uint8_t* input2_data,
+                const RuntimeShape& output_shape, uint8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void Sub(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
+  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
+  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
+  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+template <typename T>
+void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
+         const T* input1_data, const RuntimeShape& input2_shape,
+         const T* input2_data, const RuntimeShape& output_shape,
+         T* output_data) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+        }
+      }
+    }
+  }
+}
+
+inline void SetActivationMinMax(const ArithmeticParams& params,
+                                int32_t* activation_min,
+                                int32_t* activation_max) {
+  *activation_min = params.quantized_activation_min;
+  *activation_max = params.quantized_activation_max;
+}
+
+inline void SetActivationMinMax(const ArithmeticParams& params,
+                                float* activation_min, float* activation_max) {
+  *activation_min = params.float_activation_min;
+  *activation_max = params.float_activation_max;
+}
+
+inline void SetActivationMinMax(const ArithmeticParams& params,
+                                int64_t* activation_min,
+                                int64_t* activation_max) {
+  *activation_min = params.int64_activation_min;
+  *activation_max = params.int64_activation_max;
+}
+
+template <typename T>
+inline void SubWithActivation(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
+  ruy::profiler::ScopeLabel label("SubWithActivation");
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  T activation_min, activation_max;
+  SetActivationMinMax(params, &activation_min, &activation_max);
+
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], activation_min, activation_max);
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/tanh.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/reference/tanh.h
@@ -0,0 +1,129 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
+
+#include <cmath>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    float val = input_data[i];
+    float result = std::tanh(val);
+    output_data[i] = result;
+  }
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// uniform between data types.
+inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
+                 const float* input_data, const RuntimeShape& output_shape,
+                 float* output_data) {
+  // Drop params: not needed.
+  Tanh(input_shape, input_data, output_shape, output_data);
+}
+
+inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
+                 const int16_t* input_data, const RuntimeShape& output_shape,
+                 int16_t* output_data) {
+  const int input_left_shift = params.input_left_shift;
+  // Support for shifts is limited until we have a parameterized version of
+  // SaturatingRoundingMultiplyByPOT().
+  TFLITE_DCHECK_GE(input_left_shift, 0);
+  TFLITE_DCHECK_LE(input_left_shift, 1);
+
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  // F0 uses 0 integer bits, range [-1, 1].
+  // This is the return type of math functions such as tanh, logistic,
+  // whose range is in [-1, 1].
+  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+  // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+  using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+
+  if (input_left_shift == 0) {
+    for (int i = 0; i < flat_size; i++) {
+      F3 input = F3::FromRaw(input_data[i]);
+      F0 output = gemmlowp::tanh(input);
+      output_data[i] = output.raw();
+    }
+  } else {
+    for (int i = 0; i < flat_size; i++) {
+      F3 input = F3::FromRaw(
+          gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i]));
+      F0 output = gemmlowp::tanh(input);
+      output_data[i] = output.raw();
+    }
+  }
+}
+
+inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
+                 const uint8_t* input_data, const RuntimeShape& output_shape,
+                 uint8_t* output_data) {
+  const int32_t input_zero_point = params.input_zero_point;
+  const int32_t input_range_radius = params.input_range_radius;
+  const int32_t input_multiplier = params.input_multiplier;
+  const int input_left_shift = params.input_left_shift;
+  const int32_t output_zero_point = 128;
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    const uint8_t input_val_u8 = input_data[i];
+    const int32_t input_val_centered =
+        static_cast<int32_t>(input_val_u8) - input_zero_point;
+    uint8_t output_val;
+    if (input_val_centered <= -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered >= input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32_t input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32_t, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
+      // Convert from Q0.31 to Q24.7.
+      using gemmlowp::RoundingDivideByPOT;
+      int32_t output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+      output_val_s32 += output_zero_point;
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
+      }
+      // Reinterpret as Q0.7, encoded in uint8_t.
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8_t>(output_val_s32);
+    }
+    output_data[i] = output_val;
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/strided_slice_logic.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/strided_slice_logic.h
@@ -0,0 +1,204 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
+
+#include <limits>
+#include <vector>
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace strided_slice {
+
+// Use until std::clamp() is available from C++17.
+inline int Clamp(const int v, const int lo, const int hi) {
+  TFLITE_DCHECK(!(hi < lo));
+  if (hi < v) return hi;
+  if (v < lo) return lo;
+  return v;
+}
+
+inline void StridedSlicePadIndices(tflite::StridedSliceParams* p,
+                                   int dim_count) {
+  // Add indices and mask bits to fully include extra dimensions
+  TFLITE_CHECK_LE(dim_count, 5);
+  TFLITE_CHECK_GE(dim_count, p->start_indices_count);
+  TFLITE_CHECK_EQ(p->start_indices_count, p->stop_indices_count);
+  TFLITE_CHECK_EQ(p->stop_indices_count, p->strides_count);
+
+  const int pad_count = dim_count - p->start_indices_count;
+
+  // Pad indices at start, so move arrays by pad_count.
+  for (int i = p->start_indices_count - 1; i >= 0; --i) {
+    p->strides[i + pad_count] = p->strides[i];
+    p->start_indices[i + pad_count] = p->start_indices[i];
+    p->stop_indices[i + pad_count] = p->stop_indices[i];
+  }
+  for (int i = 0; i < pad_count; ++i) {
+    p->start_indices[i] = 0;
+    p->stop_indices[i] = 1;
+    p->strides[i] = 1;
+  }
+
+  // Pad masks with 0s or 1s as required.
+  p->shrink_axis_mask <<= pad_count;
+  p->ellipsis_mask <<= pad_count;
+  p->new_axis_mask <<= pad_count;
+  p->begin_mask <<= pad_count;
+  p->end_mask <<= pad_count;
+  p->begin_mask |= (1 << pad_count) - 1;
+  p->end_mask |= (1 << pad_count) - 1;
+
+  p->start_indices_count = dim_count;
+  p->stop_indices_count = dim_count;
+  p->strides_count = dim_count;
+}
+
+// Return the index for the first element along that axis. This index will be a
+// positive integer between [0, axis_size - 1] that can be used to index
+// directly into the data.
+inline int StartForAxis(const tflite::StridedSliceParams& params,
+                        const RuntimeShape& input_shape, int axis) {
+  const auto begin_mask = params.begin_mask;
+  const auto* start_indices = params.start_indices;
+  const auto* strides = params.strides;
+  const int axis_size = input_shape.Dims(axis);
+  if (axis_size == 0) {
+    return 0;
+  }
+  // Begin with the specified index.
+  int start = start_indices[axis];
+
+  // begin_mask override
+  if (begin_mask & 1 << axis) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the first element. These values will get
+      // clamped below (Note: We could have set them to 0 and axis_size-1, but
+      // use lowest() and max() to maintain symmetry with StopForAxis())
+      start = std::numeric_limits<int>::lowest();
+    } else {
+      // Backward iteration - use the last element.
+      start = std::numeric_limits<int>::max();
+    }
+  }
+
+  // Handle negative indices
+  if (start < 0) {
+    start += axis_size;
+  }
+
+  // Clamping
+  start = Clamp(start, 0, axis_size - 1);
+
+  return start;
+}
+
+// Return the "real" index for the end of iteration along that axis. This is an
+// "end" in the traditional C sense, in that it points to one past the last
+// element. ie. So if you were iterating through all elements of a 1D array of
+// size 4, this function would return 4 as the stop, because it is one past the
+// "real" indices of 0, 1, 2 & 3.
+inline int StopForAxis(const tflite::StridedSliceParams& params,
+                       const RuntimeShape& input_shape, int axis,
+                       int start_for_axis) {
+  const auto end_mask = params.end_mask;
+  const auto shrink_axis_mask = params.shrink_axis_mask;
+  const auto* stop_indices = params.stop_indices;
+  const auto* strides = params.strides;
+  const int axis_size = input_shape.Dims(axis);
+  if (axis_size == 0) {
+    return 0;
+  }
+
+  // Begin with the specified index
+  const bool shrink_axis = shrink_axis_mask & (1 << axis);
+  int stop = stop_indices[axis];
+
+  // When shrinking an axis, the end position does not matter (and can be
+  // incorrect when negative indexing is used, see Issue #19260). Always use
+  // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has
+  // already been adjusted for negative indices.
+  if (shrink_axis) {
+    stop = start_for_axis + 1;
+  }
+
+  // end_mask override
+  if (end_mask & (1 << axis)) {
+    if (strides[axis] > 0) {
+      // Forward iteration - use the last element. These values will get
+      // clamped below
+      stop = std::numeric_limits<int>::max();
+    } else {
+      // Backward iteration - use the first element.
+      stop = std::numeric_limits<int>::lowest();
+    }
+  }
+
+  // Handle negative indices
+  if (stop < 0) {
+    stop += axis_size;
+  }
+
+  // Clamping
+  // Because the end index points one past the last element, we need slightly
+  // different clamping ranges depending on the direction.
+  if (strides[axis] > 0) {
+    // Forward iteration
+    stop = Clamp(stop, 0, axis_size);
+  } else {
+    // Backward iteration
+    stop = Clamp(stop, -1, axis_size - 1);
+  }
+
+  return stop;
+}
+
+inline bool LoopCondition(int index, int stop, int stride) {
+  // True when we have reached the end of an axis and should loop.
+  return stride > 0 ? index >= stop : index <= stop;
+}
+
+inline tflite::StridedSliceParams BuildStridedSliceParams(
+    int begin_mask, int end_mask, int shrink_axis_mask,
+    const std::vector<int>& start_indices, const std::vector<int>& stop_indices,
+    const std::vector<int>& strides) {
+  tflite::StridedSliceParams op_params;
+  const int dims_count = start_indices.size();
+
+  op_params.start_indices_count = dims_count;
+  op_params.stop_indices_count = dims_count;
+  op_params.strides_count = dims_count;
+  for (int i = 0; i < dims_count; ++i) {
+    op_params.start_indices[i] = start_indices[i];
+    op_params.stop_indices[i] = stop_indices[i];
+    op_params.strides[i] = strides[i];
+  }
+
+  op_params.begin_mask = begin_mask;
+  op_params.ellipsis_mask = 0;
+  op_params.end_mask = end_mask;
+  op_params.new_axis_mask = 0;
+  op_params.shrink_axis_mask = shrink_axis_mask;
+
+  return op_params;
+}
+
+}  // namespace strided_slice
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/tensor.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/tensor.h
@@ -0,0 +1,147 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
+
+#include <complex>
+#include <vector>
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/string_util.h"
+
+namespace tflite {
+
+inline RuntimeShape GetTensorShape(std::vector<int32_t> data) {
+  return RuntimeShape(data.size(), data.data());
+}
+
+// A list of tensors in a format that can be used by kernels like split and
+// concatenation.
+template <typename T>
+class VectorOfTensors {
+ public:
+  // Build with the tensors in 'tensor_list'.
+  VectorOfTensors(const TfLiteContext& context,
+                  const TfLiteIntArray& tensor_list) {
+    int num_tensors = tensor_list.size;
+
+    all_data_.reserve(num_tensors);
+    all_shape_.reserve(num_tensors);
+    all_shape_ptr_.reserve(num_tensors);
+
+    for (int i = 0; i < num_tensors; ++i) {
+      TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
+      all_data_.push_back(GetTensorData<T>(t));
+      all_shape_.push_back(GetTensorShape(t));
+    }
+
+    // Taking the pointer from inside a std::vector is only OK if the vector is
+    // never modified, so we populate all_shape in the previous loop and then we
+    // are free to grab iterators here.
+    for (int i = 0; i < num_tensors; ++i) {
+      all_shape_ptr_.push_back(&all_shape_[i]);
+    }
+  }
+  // Return a pointer to the data pointers of all tensors in the list. For
+  // example:
+  //   float* const* f = v.data();
+  //   f[0][1] is the second element of the first tensor.
+  T* const* data() const { return all_data_.data(); }
+
+  // Return a pointer the shape pointers of all tensors in the list. For
+  // example:
+  //   const RuntimeShape* const* d = v.dims();
+  //   dims[1] are the dimensions of the second tensor in the list.
+  const RuntimeShape* const* shapes() const { return all_shape_ptr_.data(); }
+
+ private:
+  std::vector<T*> all_data_;
+  std::vector<RuntimeShape> all_shape_;
+  std::vector<RuntimeShape*> all_shape_ptr_;
+};
+
+// A list of quantized tensors in a format that can be used by kernels like
+// split and concatenation.
+class VectorOfQuantizedTensors : public VectorOfTensors<uint8_t> {
+ public:
+  // Build with the tensors in 'tensor_list'.
+  VectorOfQuantizedTensors(const TfLiteContext& context,
+                           const TfLiteIntArray& tensor_list)
+      : VectorOfTensors<uint8_t>(context, tensor_list) {
+    for (int i = 0; i < tensor_list.size; ++i) {
+      TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
+      zero_point_.push_back(t->params.zero_point);
+      scale_.push_back(t->params.scale);
+    }
+  }
+
+  const float* scale() const { return scale_.data(); }
+  const int32_t* zero_point() const { return zero_point_.data(); }
+
+ private:
+  std::vector<int32_t> zero_point_;
+  std::vector<float> scale_;
+};
+
+// Writes randomly accessed values from `input` sequentially into `output`.
+template <typename T>
+class SequentialTensorWriter {
+ public:
+  SequentialTensorWriter(const TfLiteTensor* input, TfLiteTensor* output) {
+    input_data_ = GetTensorData<T>(input);
+    output_ptr_ = GetTensorData<T>(output);
+  }
+  SequentialTensorWriter(const T* input_data, T* output_data)
+      : input_data_(input_data), output_ptr_(output_data) {}
+
+  void Write(int position) { *output_ptr_++ = input_data_[position]; }
+  void WriteN(int position, int len) {
+    memcpy(output_ptr_, &input_data_[position], sizeof(T) * len);
+    output_ptr_ += len;
+  }
+
+ private:
+  const T* input_data_;
+  T* output_ptr_;
+};
+
+// String ops are not yet supported on platforms w/ static memory.
+#ifndef TF_LITE_STATIC_MEMORY
+template <>
+class SequentialTensorWriter<string> {
+ public:
+  SequentialTensorWriter(const TfLiteTensor* input, TfLiteTensor* output)
+      : input_(input), output_(output) {}
+  ~SequentialTensorWriter() { buffer_.WriteToTensor(output_, nullptr); }
+
+  void Write(int position) { this->WriteN(position, 1); }
+  void WriteN(int position, int len) {
+    for (int i = 0; i < len; i++) {
+      buffer_.AddString(GetString(input_, position + i));
+    }
+  }
+
+ private:
+  const TfLiteTensor* input_;
+  TfLiteTensor* output_;
+  DynamicBuffer buffer_;
+};
+#endif  // TF_LITE_STATIC_MEMORY
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/tensor_ctypes.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/tensor_ctypes.h
@@ -0,0 +1,47 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+template <typename T>
+inline T* GetTensorData(TfLiteTensor* tensor) {
+  return tensor != nullptr ? reinterpret_cast<T*>(tensor->data.raw) : nullptr;
+}
+
+template <typename T>
+inline const T* GetTensorData(const TfLiteTensor* tensor) {
+  return tensor != nullptr ? reinterpret_cast<const T*>(tensor->data.raw)
+                           : nullptr;
+}
+
+inline RuntimeShape GetTensorShape(const TfLiteTensor* tensor) {
+  if (tensor == nullptr) {
+    return RuntimeShape();
+  }
+
+  TfLiteIntArray* dims = tensor->dims;
+  const int dims_size = dims->size;
+  const int32_t* dims_data = reinterpret_cast<const int32_t*>(dims->data);
+  return RuntimeShape(dims_size, dims_data);
+}
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
--- a/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/types.h
+++ b/components/tflite_micro/STM32L496-lib/tensorflow/lite/kernels/internal/types.h