add nnom pack and example
This commit is contained in:
203
components/ai/nnom/LICENSE
Normal file
203
components/ai/nnom/LICENSE
Normal file
@@ -0,0 +1,203 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
39
components/ai/nnom/README.md
Normal file
39
components/ai/nnom/README.md
Normal file
@@ -0,0 +1,39 @@
|
||||
|
||||
# Neural Network on Microcontroller (NNoM)
|
||||
[](https://travis-ci.com/majianjia/nnom)
|
||||
[](https://opensource.org/licenses/Apache-2.0)
|
||||
[](https://zenodo.org/badge/latestdoi/166869630)
|
||||
|
||||
NNoM is a high-level inference Neural Network library specifically for microcontrollers.
|
||||
|
||||
介绍详情可以参考:https://github.com/majianjia/nnom
|
||||
|
||||
原作者提供了基于keras的训练模型方法,以及如何配置NNoM的详细文档介绍
|
||||
|
||||
|
||||
|
||||
本项目提供一个基于NNoM的软件包,方便在tos上的快捷移植(测试通过平台为stm32l496ZG)
|
||||
|
||||
mnist示例可以参考board/NUCLEO_STM32L496ZG/KEIL/nnom_mnist
|
||||
|
||||
## 在TencentOS-tiny上的使用说明
|
||||
|
||||
1. 在keil工程里添加components / ai / nnom中的src文件夹下的backends、core、layers三个文件夹中的全部.c文件
|
||||
2. 在keil工程中包含inc和port文件夹中的全部头文件
|
||||
3. 在nnom_port.h指定内存使用方法(测试示例中开启了 NNOM_USING_STATIC_MEMORY宏 ),若使用非静态内存方法需要将nnom_malloc(n)和nnom_free(n)定义为os本身的内存api,对tos是tos_mmheap_alloc(n)和tos_mmheap_free(n)
|
||||
4. 若使用静态内存,则需要定义static_buf[size]并使用nnom_set_static_buf(static_buf, sizeof(static_buf))函数去指定静态内存地址与大小,并根据模型需要调整静态内存大小。
|
||||
5. 编写示例函数,参考:example/nnom_mnsit中的nnom_mnist_example写法,按照需要实现系统api,比如使用tos_systick_get()去获取系统tick,从而计算推理时间。
|
||||
|
||||
|
||||
|
||||
## 注意事项
|
||||
|
||||
在keil下确认printf已经成功实现(检查microlib选项),并注意选择ARM Compiler为Use default compiler version 5
|
||||
|
||||
|
||||
|
||||
## Licenses
|
||||
|
||||
NNoM is released under Apache License 2.0 since nnom-V0.2.0.
|
||||
License and copyright information can be found within the code.
|
||||
|
96
components/ai/nnom/inc/layers/nnom_activation.h
Normal file
96
components/ai/nnom/inc/layers/nnom_activation.h
Normal file
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_ACTIVATION_H__
|
||||
#define __NNOM_ACTIVATION_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
|
||||
// activation layer
|
||||
typedef struct _nnom_activation_layer_t
|
||||
{
|
||||
nnom_layer_t super;
|
||||
nnom_activation_t *act;
|
||||
} nnom_activation_layer_t;
|
||||
|
||||
|
||||
// activation with fixed q format (tanh and sigmoid)
|
||||
typedef struct _nnom_activation_fixed_q_t
|
||||
{
|
||||
nnom_activation_t super;
|
||||
uint8_t dec_bit;
|
||||
} nnom_activation_fixed_q_t;
|
||||
|
||||
// leaky relu
|
||||
typedef struct _nnom_activation_leaky_relu_t
|
||||
{
|
||||
nnom_activation_t super;
|
||||
q7_t alpha; // alpha is present by q0.7 format. (-128 = -1)
|
||||
} nnom_activation_leaky_relu_t;
|
||||
|
||||
// advance relu (full ReLU)
|
||||
typedef struct _nnom_activation_adv_relu_t
|
||||
{
|
||||
nnom_activation_t super;
|
||||
q7_t negative_slope; // negative_slope is present by q0.7 format. (-128 = -1)
|
||||
float max; // cap of the max value
|
||||
float threshold; // threshold
|
||||
} nnom_activation_adv_relu_t;
|
||||
|
||||
// method
|
||||
nnom_status_t activation_run(nnom_layer_t* layer);
|
||||
nnom_status_t activation_free(nnom_layer_t *layer);
|
||||
|
||||
// activation delete
|
||||
void act_delete(nnom_activation_t* act);
|
||||
|
||||
// a direct api on tensor
|
||||
nnom_status_t act_tensor_run(nnom_activation_t* act, nnom_tensor_t* tensor);
|
||||
|
||||
|
||||
// Layer API
|
||||
nnom_layer_t *Activation(nnom_activation_t *act);
|
||||
nnom_layer_t *ReLU(void);
|
||||
nnom_layer_t *LeakyReLU(float alpha);
|
||||
nnom_layer_t *AdvReLU(float alpha, float max, float threshold);
|
||||
nnom_layer_t *Sigmoid(int32_t dec_bit);
|
||||
nnom_layer_t *TanH(int32_t dec_bit);
|
||||
|
||||
// Activation API.
|
||||
nnom_activation_t* act_relu(void);
|
||||
nnom_activation_t* act_leaky_relu(float alpha);
|
||||
nnom_activation_t* act_adv_relu(float negative_slope, float max, float threshold);
|
||||
nnom_activation_t* act_tanh(int32_t dec_bit);
|
||||
nnom_activation_t* act_sigmoid(int32_t dec_bit);
|
||||
nnom_activation_t* act_hard_tanh(int32_t dec_bit);
|
||||
nnom_activation_t* act_hard_sigmoid(int32_t dec_bit);
|
||||
|
||||
// utils
|
||||
int32_t act_get_dec_bit(nnom_activation_type_t type, int32_t dec_bit);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_ACTIVATION_H__ */
|
47
components/ai/nnom/inc/layers/nnom_avgpool.h
Normal file
47
components/ai/nnom/inc/layers/nnom_avgpool.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_AVGPOOL_H__
|
||||
#define __NNOM_AVGPOOL_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
#include "layers/nnom_maxpool.h"
|
||||
|
||||
// Avg Pooling
|
||||
typedef nnom_maxpool_layer_t nnom_avgpool_layer_t;
|
||||
|
||||
// method
|
||||
nnom_status_t avgpooling_build(nnom_layer_t *layer);
|
||||
nnom_status_t avgpool_run(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t *avgpool_s(const nnom_pool_config_t * config);
|
||||
nnom_layer_t *AvgPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_AVGPOOL_H__ */
|
43
components/ai/nnom/inc/layers/nnom_baselayer.h
Normal file
43
components/ai/nnom/inc/layers/nnom_baselayer.h
Normal file
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_BASELAYER_H__
|
||||
#define __NNOM_BASELAYER_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
#include "layers/nnom_input.h"
|
||||
|
||||
// method
|
||||
nnom_status_t default_build(nnom_layer_t *layer);
|
||||
nnom_status_t default_run(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t *baselayer_s(const nnom_layer_config_t * config);
|
||||
nnom_layer_t *BaseLayer(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_BASELAYER_H__ */
|
55
components/ai/nnom/inc/layers/nnom_concat.h
Normal file
55
components/ai/nnom/inc/layers/nnom_concat.h
Normal file
@@ -0,0 +1,55 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_CONCAT_H__
|
||||
#define __NNOM_CONCAT_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
// concatenate layer
|
||||
typedef struct _nnom_concat_layer
|
||||
{
|
||||
nnom_layer_t super;
|
||||
int8_t axis;
|
||||
} nnom_concat_layer_t;
|
||||
|
||||
typedef struct _nnom_concat_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
int8_t axis;
|
||||
} nnom_concat_config_t;
|
||||
|
||||
// method
|
||||
nnom_status_t concat_build(nnom_layer_t *layer);
|
||||
nnom_status_t concat_run(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t *concat_s(const nnom_concat_config_t *config);
|
||||
nnom_layer_t *Concat(int8_t axis);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_CONCAT_H__ */
|
83
components/ai/nnom/inc/layers/nnom_conv2d.h
Normal file
83
components/ai/nnom/inc/layers/nnom_conv2d.h
Normal file
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_CONV2D_H__
|
||||
#define __NNOM_CONV2D_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
// child layers parameters
|
||||
typedef struct _nnom_conv2d_layer_t
|
||||
{
|
||||
nnom_layer_t super;
|
||||
nnom_3d_shape_t kernel;
|
||||
nnom_3d_shape_t stride;
|
||||
nnom_3d_shape_t pad;
|
||||
nnom_3d_shape_t dilation;
|
||||
nnom_padding_t padding_type;
|
||||
uint32_t filter_mult; // filter size (for conv) or multilplier (for depthwise)
|
||||
|
||||
nnom_tensor_t *weight;
|
||||
nnom_tensor_t *bias;
|
||||
|
||||
// test
|
||||
nnom_qformat_param_t * output_rshift;
|
||||
nnom_qformat_param_t * bias_lshift;
|
||||
} nnom_conv2d_layer_t;
|
||||
|
||||
// a machine interface for configuration
|
||||
typedef struct _nnom_conv2d_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
nnom_qtype_t qtype; //quantisation type(per channel or per layer)
|
||||
nnom_tensor_t *weight;
|
||||
nnom_tensor_t *bias;
|
||||
nnom_qformat_param_t *output_shift;
|
||||
nnom_qformat_param_t *bias_shift;
|
||||
uint32_t filter_size;
|
||||
int8_t kernel_size[2];
|
||||
int8_t stride_size[2];
|
||||
int8_t padding_size[2];
|
||||
int8_t dilation_size[2];
|
||||
nnom_padding_t padding_type;
|
||||
} nnom_conv2d_config_t;
|
||||
|
||||
// method
|
||||
nnom_status_t conv2d_run(nnom_layer_t *layer);
|
||||
nnom_status_t conv2d_build(nnom_layer_t *layer);
|
||||
nnom_status_t conv2d_free(nnom_layer_t *layer);
|
||||
|
||||
// utils
|
||||
uint32_t conv_output_length(uint32_t input_length, uint32_t filter_size, nnom_padding_t padding, uint32_t stride, uint32_t dilation);
|
||||
|
||||
// API
|
||||
nnom_layer_t *conv2d_s(const nnom_conv2d_config_t *config);
|
||||
nnom_layer_t *Conv2D(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type,
|
||||
const nnom_weight_t *w, const nnom_bias_t *b);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_CONV2D_H__ */
|
52
components/ai/nnom/inc/layers/nnom_conv2d_trans.h
Normal file
52
components/ai/nnom/inc/layers/nnom_conv2d_trans.h
Normal file
@@ -0,0 +1,52 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-30 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_DECONV2D_H__
|
||||
#define __NNOM_DECONV2D_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
#include "layers/nnom_conv2d.h"
|
||||
|
||||
// child layers parameters
|
||||
typedef nnom_conv2d_layer_t nnom_conv2d_trans_layer_t;
|
||||
|
||||
typedef nnom_conv2d_config_t nnom_conv2d_trans_config_t;
|
||||
|
||||
// method
|
||||
nnom_status_t conv2d_trans_run(nnom_layer_t *layer);
|
||||
nnom_status_t conv2d_trans_build(nnom_layer_t *layer);
|
||||
|
||||
// utils
|
||||
uint32_t conv_trans_output_length(uint32_t input_length, uint32_t filter_size, nnom_padding_t padding, uint32_t stride, uint32_t dilation);
|
||||
|
||||
// API
|
||||
nnom_layer_t *conv2d_trans_s(const nnom_conv2d_config_t *config);
|
||||
nnom_layer_t *Conv2DTrans(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type,
|
||||
const nnom_weight_t *w, const nnom_bias_t *b);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_DECONV2D_H__ */
|
48
components/ai/nnom/inc/layers/nnom_cropping.h
Normal file
48
components/ai/nnom/inc/layers/nnom_cropping.h
Normal file
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_CROPPING_H__
|
||||
#define __NNOM_CROPPING_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
#include "layers/nnom_zero_padding.h"
|
||||
|
||||
// Cropping, same as zeropadding
|
||||
typedef nnom_zero_padding_layer_t nnom_cropping_layer_t;
|
||||
|
||||
typedef nnom_zero_padding_config_t nnom_cropping_config_t;
|
||||
|
||||
// method
|
||||
nnom_status_t cropping_build(nnom_layer_t *layer);
|
||||
nnom_status_t cropping_run(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t * cropping_s(const nnom_cropping_config_t *config);
|
||||
nnom_layer_t *Cropping(nnom_border_t pad);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_CROPPING_H__ */
|
63
components/ai/nnom/inc/layers/nnom_dense.h
Normal file
63
components/ai/nnom/inc/layers/nnom_dense.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_DENSE_H__
|
||||
#define __NNOM_DENSE_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
typedef struct _nnom_dense_layer_t
|
||||
{
|
||||
nnom_layer_t super;
|
||||
size_t output_unit;
|
||||
nnom_tensor_t *weight;
|
||||
nnom_tensor_t *bias;
|
||||
nnom_qformat_param_t *output_rshift;
|
||||
nnom_qformat_param_t *bias_lshift;
|
||||
} nnom_dense_layer_t;
|
||||
|
||||
// a machine interface for configuration
|
||||
typedef struct _nnom_dense_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
nnom_qtype_t qtype; //quantisation type(per channel or per layer)
|
||||
nnom_tensor_t *weight;
|
||||
nnom_tensor_t *bias;
|
||||
nnom_qformat_param_t *output_shift;
|
||||
nnom_qformat_param_t *bias_shift;
|
||||
} nnom_dense_config_t;
|
||||
|
||||
// method
|
||||
nnom_status_t dense_free(nnom_layer_t *layer);
|
||||
nnom_status_t dense_build(nnom_layer_t *layer);
|
||||
nnom_status_t dense_run(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t *dense_s(const nnom_dense_config_t *config);
|
||||
nnom_layer_t *Dense(size_t output_unit, const nnom_weight_t *w, const nnom_bias_t *b);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_DENSE_H__ */
|
44
components/ai/nnom/inc/layers/nnom_dw_conv2d.h
Normal file
44
components/ai/nnom/inc/layers/nnom_dw_conv2d.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_DW_CONV2D_H__
|
||||
#define __NNOM_DW_CONV2D_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
#include "layers/nnom_conv2d.h"
|
||||
|
||||
// method
|
||||
nnom_status_t dw_conv2d_build(nnom_layer_t *layer);
|
||||
nnom_status_t dw_conv2d_run(nnom_layer_t *layer);
|
||||
|
||||
//API
|
||||
nnom_layer_t *dw_conv2d_s(const nnom_conv2d_config_t *config);
|
||||
nnom_layer_t *DW_Conv2D(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type,
|
||||
const nnom_weight_t *w, const nnom_bias_t *b);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_DW_CONV2D_H__ */
|
46
components/ai/nnom/inc/layers/nnom_flatten.h
Normal file
46
components/ai/nnom/inc/layers/nnom_flatten.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_FLATTEN_H__
|
||||
#define __NNOM_FLATTEN_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
// no special parameters but we need it.
|
||||
typedef struct _nnom_flatten_config_t{
|
||||
nnom_layer_config_t super;
|
||||
} nnom_flatten_config_t;
|
||||
|
||||
// method
|
||||
nnom_status_t flatten_build(nnom_layer_t *layer);
|
||||
nnom_status_t flatten_run(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t *flatten_s(const nnom_flatten_config_t *config);
|
||||
nnom_layer_t *Flatten(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_FLATTEN_H__ */
|
54
components/ai/nnom/inc/layers/nnom_global_pool.h
Normal file
54
components/ai/nnom/inc/layers/nnom_global_pool.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_GLOBAL_POOL_H__
|
||||
#define __NNOM_GLOBAL_POOL_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
#include "layers/nnom_maxpool.h"
|
||||
|
||||
typedef struct _nnom_global_pool_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
int16_t output_shift;
|
||||
}nnom_global_pool_config_t;
|
||||
|
||||
// method
|
||||
nnom_status_t global_pool_build(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t * global_maxpool_s(const nnom_global_pool_config_t *config);
|
||||
nnom_layer_t * global_avgpool_s(const nnom_global_pool_config_t *config);
|
||||
nnom_layer_t * global_sumpool_s(const nnom_global_pool_config_t *config);
|
||||
|
||||
nnom_layer_t *GlobalMaxPool(void);
|
||||
nnom_layer_t *GlobalAvgPool(void);
|
||||
nnom_layer_t *GlobalSumPool(void);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_GLOBAL_POOL_H__ */
|
60
components/ai/nnom/inc/layers/nnom_gru_cell.h
Normal file
60
components/ai/nnom/inc/layers/nnom_gru_cell.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-08-27 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_GRU_CELL_H__
|
||||
#define __NNOM_GRU_CELL_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "nnom_rnn.h"
|
||||
#include "nnom_activation.h"
|
||||
|
||||
typedef struct _nnom_gru_cell_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
nnom_tensor_t *weights;
|
||||
nnom_tensor_t* recurrent_weights;
|
||||
nnom_tensor_t *bias;
|
||||
nnom_qformat_param_t q_dec_z, q_dec_h; // z, r, h
|
||||
uint16_t units;
|
||||
} nnom_gru_cell_config_t;
|
||||
|
||||
|
||||
typedef struct _nnom_gru_cell_t
|
||||
{
|
||||
nnom_rnn_cell_t super;
|
||||
|
||||
nnom_tensor_t* weights;
|
||||
nnom_tensor_t* recurrent_weights;
|
||||
nnom_tensor_t* bias;
|
||||
|
||||
// decide later.
|
||||
// z, r, h
|
||||
nnom_qformat_param_t q_dec_z, q_dec_h;
|
||||
nnom_qformat_param_t oshift_iw, oshift_hw, bias_shift;
|
||||
|
||||
} nnom_gru_cell_t;
|
||||
|
||||
// gru
|
||||
nnom_rnn_cell_t *gru_cell_s(const nnom_gru_cell_config_t* config);
|
||||
|
||||
nnom_status_t gru_cell_free(nnom_rnn_cell_t* cell);
|
||||
nnom_status_t gru_cell_build(nnom_rnn_cell_t* cell);
|
||||
nnom_status_t gru_cell_run(nnom_rnn_cell_t* cell);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_GRU_CELL_H__ */
|
57
components/ai/nnom/inc/layers/nnom_input.h
Normal file
57
components/ai/nnom/inc/layers/nnom_input.h
Normal file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_INPUT_H__
|
||||
#define __NNOM_INPUT_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
// IO layer
|
||||
typedef struct _nnom_io_layer
|
||||
{
|
||||
nnom_layer_t super;
|
||||
nnom_3d_shape_t shape;
|
||||
nnom_qformat_param_t dec_bit;
|
||||
void *buf; //input or output
|
||||
} nnom_io_layer_t;
|
||||
|
||||
typedef struct _nnom_io_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
nnom_tensor_t *tensor;
|
||||
}nnom_io_config_t;
|
||||
|
||||
// method
|
||||
nnom_status_t input_build(nnom_layer_t *layer);
|
||||
nnom_status_t input_run(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t *input_s(const nnom_io_config_t* config);
|
||||
nnom_layer_t *Input(nnom_3d_shape_t input_shape, void *p_buf);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_INPUT_H__ */
|
54
components/ai/nnom/inc/layers/nnom_lambda.h
Normal file
54
components/ai/nnom/inc/layers/nnom_lambda.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_LAMBDA_H__
|
||||
#define __NNOM_LAMBDA_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
#include "layers/nnom_input.h"
|
||||
|
||||
// lambda layer
|
||||
typedef struct _nnom_lambda_layer_t
|
||||
{
|
||||
nnom_layer_t super;
|
||||
void *parameters; // parameters for lambda
|
||||
} nnom_lambda_layer_t;
|
||||
|
||||
// lambda layer
|
||||
typedef struct _nnom_lambda_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
nnom_status_t (*run_func_name)(nnom_layer_t *layer); // run method. required
|
||||
nnom_status_t (*build_func_name)(nnom_layer_t *layer);// compute output buffer shape. can be left null, will call default_build()
|
||||
nnom_status_t (*free_func_name)(nnom_layer_t *layer); // a callback to free private resources (comp buf not included) can be left null
|
||||
void *parameters; // parameters for lambda
|
||||
} nnom_lambda_config_t;
|
||||
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_LAMBDA_H__ */
|
64
components/ai/nnom/inc/layers/nnom_lstm_cell.h
Normal file
64
components/ai/nnom/inc/layers/nnom_lstm_cell.h
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-08-24 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_LSTM_CELL_H__
|
||||
#define __NNOM_LSTM_CELL_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "nnom_rnn.h"
|
||||
#include "nnom_activation.h"
|
||||
|
||||
// a machine interface for configuration
|
||||
typedef struct _nnom_lstm_cell_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
nnom_tensor_t *weights;
|
||||
nnom_tensor_t* recurrent_weights;
|
||||
nnom_tensor_t *bias;
|
||||
nnom_qformat_param_t q_dec_z, q_dec_h, q_dec_c; // z = iw + hw, c = cell state; h=output and memory
|
||||
uint16_t units;
|
||||
} nnom_lstm_cell_config_t;
|
||||
|
||||
|
||||
typedef struct _nnom_lstm_cell_t
|
||||
{
|
||||
nnom_rnn_cell_t super;
|
||||
|
||||
nnom_tensor_t* weights;
|
||||
nnom_tensor_t* recurrent_weights;
|
||||
nnom_tensor_t* bias;
|
||||
|
||||
// experimental,
|
||||
// iw: input x weight
|
||||
// hw: hidden state x recurrent weight
|
||||
// h: hidden state (memor)
|
||||
// c: cell state
|
||||
nnom_qformat_param_t q_dec_z, q_dec_h, q_dec_c;
|
||||
nnom_qformat_param_t oshift_iw, oshift_hw, oshift_zc, bias_shift;
|
||||
|
||||
} nnom_lstm_cell_t;
|
||||
|
||||
// LSTM
|
||||
nnom_rnn_cell_t *lstm_cell_s(const nnom_lstm_cell_config_t* config);
|
||||
|
||||
nnom_status_t lstm_cell_free(nnom_rnn_cell_t* cell);
|
||||
nnom_status_t lstm_cell_q7_q15_build(nnom_rnn_cell_t* cell);
|
||||
nnom_status_t lstm_cell_q7_q15_run(nnom_rnn_cell_t* cell);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_LSTM_CELL_H__ */
|
63
components/ai/nnom/inc/layers/nnom_matrix.h
Normal file
63
components/ai/nnom/inc/layers/nnom_matrix.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_MATRIX_H__
|
||||
#define __NNOM_MATRIX_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
// the maximum input layer hooked to this layer
|
||||
#define MAX_INPUT_LAYER 8
|
||||
|
||||
// matrix layer
|
||||
typedef struct _nnom_matrix_layer_t
|
||||
{
|
||||
nnom_layer_t super;
|
||||
int16_t oshift; // output right shift
|
||||
} nnom_matrix_layer_t;
|
||||
|
||||
typedef struct _nnom_matrix_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
int16_t output_shift; // output right shift
|
||||
} nnom_matrix_config_t;
|
||||
|
||||
// methods
|
||||
nnom_layer_t* _same_shape_matrix_layer(void);
|
||||
nnom_status_t add_run(nnom_layer_t *layer);
|
||||
nnom_status_t sub_run(nnom_layer_t *layer);
|
||||
nnom_status_t mult_run(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t *add_s(const nnom_matrix_config_t * config);
|
||||
nnom_layer_t *sub_s(const nnom_matrix_config_t * config);
|
||||
nnom_layer_t *mult_s(const nnom_matrix_config_t * config);
|
||||
nnom_layer_t *Add(int16_t oshift);
|
||||
nnom_layer_t *Sub(int16_t oshift);
|
||||
nnom_layer_t *Mult(int16_t oshift);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_MATRIX_H__ */
|
63
components/ai/nnom/inc/layers/nnom_maxpool.h
Normal file
63
components/ai/nnom/inc/layers/nnom_maxpool.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_MAXPOOL_H__
|
||||
#define __NNOM_MAXPOOL_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
// Max Pooling
|
||||
typedef struct _nnom_maxpool_layer_t
|
||||
{
|
||||
nnom_layer_t super;
|
||||
nnom_3d_shape_t kernel;
|
||||
nnom_3d_shape_t stride;
|
||||
nnom_3d_shape_t pad;
|
||||
nnom_padding_t padding_type;
|
||||
int16_t output_shift; // reserve
|
||||
} nnom_maxpool_layer_t;
|
||||
|
||||
// a machine interface for configuration
|
||||
typedef struct _nnom_pool_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
nnom_padding_t padding_type;
|
||||
int16_t output_shift;
|
||||
int8_t kernel_size[2];
|
||||
int8_t stride_size[2];
|
||||
int8_t num_dim;
|
||||
} nnom_pool_config_t;
|
||||
|
||||
// method
|
||||
nnom_status_t maxpool_build(nnom_layer_t *layer);
|
||||
nnom_status_t maxpool_run(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t *maxpool_s(const nnom_pool_config_t * config);
|
||||
nnom_layer_t *MaxPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_MATRIX_H__ */
|
43
components/ai/nnom/inc/layers/nnom_output.h
Normal file
43
components/ai/nnom/inc/layers/nnom_output.h
Normal file
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_OUTPUT_H__
|
||||
#define __NNOM_OUTPUT_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
#include "layers/nnom_input.h"
|
||||
|
||||
// method
|
||||
nnom_status_t output_build(nnom_layer_t *layer);
|
||||
nnom_status_t output_run(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t *output_s(const nnom_io_config_t* config);
|
||||
nnom_layer_t *Output(nnom_3d_shape_t output_shape, void *p_buf);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_OUTPUT_H__ */
|
85
components/ai/nnom/inc/layers/nnom_rnn.h
Normal file
85
components/ai/nnom/inc/layers/nnom_rnn.h
Normal file
@@ -0,0 +1,85 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_RNN_H__
|
||||
#define __NNOM_RNN_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
// a machine interface for configuration
|
||||
typedef struct _nnom_rnn_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
bool return_sequence;
|
||||
bool stateful;
|
||||
bool go_backwards;
|
||||
} nnom_rnn_config_t;
|
||||
|
||||
// RNN cell base type
|
||||
typedef struct _nnom_rnn_cell_t
|
||||
{
|
||||
nnom_status_t (*run)(struct _nnom_rnn_cell_t* cell); // cell runner
|
||||
nnom_status_t (*build)(struct _nnom_rnn_cell_t* cell); // cell builder, calculate buffer size, output data size
|
||||
nnom_status_t (*free)(struct _nnom_rnn_cell_t* cell); //
|
||||
nnom_layer_t *layer; // pointer to its layer holder
|
||||
nnom_layer_config_t *config; // config for the cell event it is a layer type
|
||||
nnom_rnn_cell_type_t type;
|
||||
|
||||
void *in_data; // input data
|
||||
void *out_data; // output data
|
||||
void *in_state; // input state data (or hidden state)
|
||||
void *out_state; // output state data
|
||||
|
||||
size_t comp_buf_size; // the size of temporary buffer.
|
||||
size_t state_size; // the size of hidden state
|
||||
uint16_t units; // the output units
|
||||
uint16_t feature_size; // the input feature size (vector size)
|
||||
|
||||
size_t macc; // stat of MAC count.
|
||||
} nnom_rnn_cell_t;
|
||||
|
||||
typedef struct _nnom_rnn_layer_t
|
||||
{
|
||||
nnom_layer_t super;
|
||||
nnom_rnn_cell_t *cell;
|
||||
void *state_buf; // memory allocated to store state, size = 2 x size of state required by cell.
|
||||
|
||||
uint16_t timestamp_size;// size of timestamp
|
||||
bool return_sequence; // whether to return the output for each unit (sequence)
|
||||
bool stateful; // whether the states are kept after one inteference
|
||||
bool go_backwards; // whether go backwards timestamping
|
||||
} nnom_rnn_layer_t;
|
||||
|
||||
|
||||
// rnn layer
|
||||
nnom_layer_t *rnn_s(nnom_rnn_cell_t *cell, const nnom_rnn_config_t* config);
|
||||
|
||||
nnom_status_t rnn_run(nnom_layer_t* layer);
|
||||
nnom_status_t rnn_build(nnom_layer_t* layer);
|
||||
nnom_status_t rnn_free(nnom_layer_t* layer);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_RNN_H__ */
|
86
components/ai/nnom/inc/layers/nnom_simple_cell.h
Normal file
86
components/ai/nnom/inc/layers/nnom_simple_cell.h
Normal file
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-08-20 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_SIMPLE_CELL_H__
|
||||
#define __NNOM_SIMPLE_CELL_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "nnom_rnn.h"
|
||||
#include "nnom_activation.h"
|
||||
|
||||
|
||||
// This Simple Cell replicate the Keras's SimpleCell as blow
|
||||
/*
|
||||
def call(self, inputs, states, training=None):
|
||||
prev_output = states[0] if nest.is_sequence(states) else states
|
||||
|
||||
h = K.dot(inputs, self.kernel)
|
||||
h = K.bias_add(h, self.bias)
|
||||
|
||||
output = h + K.dot(prev_output, self.recurrent_kernel)
|
||||
output = self.activation(output)
|
||||
|
||||
new_state = [output] if nest.is_sequence(states) else output
|
||||
return output, new_state
|
||||
*/
|
||||
|
||||
// a machine interface for configuration
|
||||
typedef struct _nnom_simple_cell_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
nnom_tensor_t *weights;
|
||||
nnom_tensor_t* recurrent_weights;
|
||||
nnom_tensor_t *bias;
|
||||
nnom_qformat_param_t q_dec_iw, q_dec_hw, q_dec_h;
|
||||
nnom_activation_type_t act_type; // type of the activation
|
||||
uint16_t units;
|
||||
} nnom_simple_cell_config_t;
|
||||
|
||||
|
||||
typedef struct _nnom_simple_cell_t
|
||||
{
|
||||
nnom_rnn_cell_t super;
|
||||
nnom_activation_type_t act_type;
|
||||
|
||||
nnom_tensor_t* weights;
|
||||
nnom_tensor_t* recurrent_weights;
|
||||
nnom_tensor_t* bias;
|
||||
|
||||
// experimental,
|
||||
// iw: input x weight
|
||||
// hw: hidden state x recurrent weight
|
||||
// h: hidden state
|
||||
nnom_qformat_param_t q_dec_iw, q_dec_hw, q_dec_h;
|
||||
nnom_qformat_param_t oshift_iw, oshift_hw, bias_shift;
|
||||
|
||||
} nnom_simple_cell_t;
|
||||
|
||||
|
||||
// RNN cells
|
||||
// The shape for RNN input is (batch, timestamp, feature), where batch is always 1.
|
||||
//
|
||||
// SimpleCell
|
||||
nnom_rnn_cell_t *simple_cell_s(const nnom_simple_cell_config_t* config);
|
||||
|
||||
nnom_status_t simple_cell_free(nnom_rnn_cell_t* cell);
|
||||
nnom_status_t simple_cell_build(nnom_rnn_cell_t* cell);
|
||||
nnom_status_t simple_cell_run(nnom_rnn_cell_t* cell);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_SIMPLE_CELL_H__ */
|
47
components/ai/nnom/inc/layers/nnom_softmax.h
Normal file
47
components/ai/nnom/inc/layers/nnom_softmax.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_SOFTMAX_H__
|
||||
#define __NNOM_SOFTMAX_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
typedef struct _nnom_softmax_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
} nnom_softmax_config_t;
|
||||
|
||||
|
||||
// method
|
||||
nnom_status_t softmax_run(nnom_layer_t *layer);
|
||||
nnom_status_t softmax_build(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t *softmax_s(const nnom_softmax_config_t * config);
|
||||
nnom_layer_t *Softmax(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_SOFTMAX_H__ */
|
46
components/ai/nnom/inc/layers/nnom_sumpool.h
Normal file
46
components/ai/nnom/inc/layers/nnom_sumpool.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_SUMPOOL_H__
|
||||
#define __NNOM_SUMPOOL_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
#include "layers/nnom_maxpool.h"
|
||||
|
||||
// Sum Pooling
|
||||
typedef nnom_maxpool_layer_t nnom_sumpool_layer_t;
|
||||
|
||||
// method
|
||||
nnom_status_t sumpool_build(nnom_layer_t *layer);
|
||||
nnom_status_t sumpool_run(nnom_layer_t *layer);
|
||||
|
||||
// API
|
||||
nnom_layer_t *sumpool_s(const nnom_pool_config_t * config);
|
||||
nnom_layer_t *SumPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_SUMPOOL_H__ */
|
54
components/ai/nnom/inc/layers/nnom_upsample.h
Normal file
54
components/ai/nnom/inc/layers/nnom_upsample.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_UPSAMPLE_H__
|
||||
#define __NNOM_UPSAMPLE_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
// Up Sampling layer (UnPooling)
|
||||
typedef struct _nnom_upsample_layer_t
|
||||
{
|
||||
nnom_layer_t super;
|
||||
nnom_3d_shape_t kernel;
|
||||
} nnom_upsample_layer_t;
|
||||
|
||||
typedef struct _nnom_upsample_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
nnom_shape_data_t kernel[2];
|
||||
} nnom_upsample_config_t;
|
||||
|
||||
// API
|
||||
nnom_layer_t *upsample_s(const nnom_upsample_config_t *config);
|
||||
nnom_layer_t *UpSample(nnom_3d_shape_t kernel);
|
||||
|
||||
// Methods
|
||||
nnom_status_t upsample_build(nnom_layer_t *layer);
|
||||
nnom_status_t upsample_run(nnom_layer_t *layer);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_UPSAMPLE_H__ */
|
54
components/ai/nnom/inc/layers/nnom_zero_padding.h
Normal file
54
components/ai/nnom/inc/layers/nnom_zero_padding.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-03 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_ZERO_PADDING_H__
|
||||
#define __NNOM_ZERO_PADDING_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
typedef struct _nnom_zero_padding_config_t
|
||||
{
|
||||
nnom_layer_config_t super;
|
||||
nnom_border_t pad;
|
||||
} nnom_zero_padding_config_t;
|
||||
|
||||
// zero padding
|
||||
typedef struct _nnom_zero_padding_layer_t
|
||||
{
|
||||
nnom_layer_t super;
|
||||
nnom_border_t pad;
|
||||
} nnom_zero_padding_layer_t;
|
||||
|
||||
// API
|
||||
nnom_layer_t *zeropadding_s(const nnom_zero_padding_config_t* config);
|
||||
nnom_layer_t *ZeroPadding(nnom_border_t pad);
|
||||
|
||||
// method
|
||||
nnom_status_t zero_padding_build(nnom_layer_t *layer);
|
||||
nnom_status_t zero_padding_run(nnom_layer_t *layer);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_ZERO_PADDING_H__ */
|
413
components/ai/nnom/inc/nnom.h
Normal file
413
components/ai/nnom/inc/nnom.h
Normal file
@@ -0,0 +1,413 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-02-05 Jianjia Ma The first version
|
||||
* 2019-02-10 Jianjia Ma Compiler supports dense net connection
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_H__
|
||||
#define __NNOM_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdarg.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "nnom_port.h"
|
||||
|
||||
#define NNOM_ALIGN (sizeof(char*)) // alignment when doing memory ops. Equal to size of pointer in byte.
|
||||
#define q7_t int8_t
|
||||
#define q15_t int16_t
|
||||
#define q31_t int32_t
|
||||
#define q63_t int64_t
|
||||
|
||||
/* version */
|
||||
#define NNOM_MAJORVERSION 0 /**< major version number */
|
||||
#define NNOM_SUBVERSION 4 /**< minor version number */
|
||||
#define NNOM_REVISION 3 /**< revise version number */
|
||||
#define NNOM_VERSION ((NNOM_MAJORVERSION * 10000) + (NNOM_SUBVERSION * 100) + NNOM_REVISION)
|
||||
|
||||
#ifdef ARM_NN_TRUNCATE
|
||||
#define NNOM_TRUNCATE
|
||||
#endif
|
||||
|
||||
#ifndef NNOM_TRUNCATE
|
||||
#define NNOM_ROUND(out_shift) ((0x1 << out_shift) >> 1 )
|
||||
#else
|
||||
#define NNOM_ROUND(out_shift) 0
|
||||
#endif
|
||||
|
||||
typedef enum
|
||||
{
|
||||
NN_SUCCESS = 0, /**< No error */
|
||||
NN_ARGUMENT_ERROR = -1, /**< One or more arguments are incorrect */
|
||||
NN_LENGTH_ERROR = -2, /**< Length of data buffer is incorrect */
|
||||
NN_SIZE_MISMATCH = -3, /**< Size of matrices is not compatible with the operation. */
|
||||
NN_NANINF = -4, /**< Not-a-number (NaN) or infinity is generated */
|
||||
NN_SINGULAR = -5, /**< Generated by matrix inversion if the input matrix is singular and cannot be inverted. */
|
||||
NN_TEST_FAILURE = -6, /**< Test Failed */
|
||||
NN_NO_MEMORY = -7,
|
||||
NN_MORE_TODO = -8
|
||||
} nnom_status_t;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
NNOM_INVALID = 0,
|
||||
NNOM_BASE,
|
||||
NNOM_INPUT,
|
||||
NNOM_OUTPUT,
|
||||
NNOM_CONV_2D,
|
||||
NNOM_DW_CONV_2D,
|
||||
NNOM_CONV2D_TRANS,
|
||||
NNOM_BATCHNORM,
|
||||
NNOM_DENSE,
|
||||
NNOM_ZERO_PADDING,
|
||||
NNOM_CROPPING,
|
||||
NNOM_RNN,
|
||||
NNOM_ACTIVATION,
|
||||
NNOM_RELU,
|
||||
NNOM_LEAKY_RELU,
|
||||
NNOM_ADV_RELU,
|
||||
NNOM_SIGMOID,
|
||||
NNOM_TANH,
|
||||
NNOM_SOFTMAX,
|
||||
NNOM_MAXPOOL,
|
||||
NNOM_GLOBAL_MAXPOOL,
|
||||
NNOM_AVGPOOL,
|
||||
NNOM_GLOBAL_AVGPOOL,
|
||||
NNOM_SUMPOOL,
|
||||
NNOM_GLOBAL_SUMPOOL,
|
||||
NNOM_UPSAMPLE,
|
||||
NNOM_FLATTEN,
|
||||
NNOM_LAMBDA,
|
||||
NNOM_CONCAT,
|
||||
NNOM_ADD,
|
||||
NNOM_SUB,
|
||||
NNOM_MULT,
|
||||
NNOM_TYPE_MAX
|
||||
|
||||
} nnom_layer_type_t;
|
||||
|
||||
#define DEFUALT_LAYER_NAMES \
|
||||
{ \
|
||||
"Unknown", \
|
||||
"Base", \
|
||||
"Input", \
|
||||
"Output", \
|
||||
"Conv2D", \
|
||||
"DW_Conv2D", \
|
||||
"Conv2DTrsp", \
|
||||
"BatchNorm", \
|
||||
"Dense", \
|
||||
"ZeroPad", \
|
||||
"Cropping", \
|
||||
"RNN", \
|
||||
"Activation", \
|
||||
"ReLU", \
|
||||
"Leaky_ReLU", \
|
||||
"Adv_ReLU", \
|
||||
"Sigmoid", \
|
||||
"Tanh", \
|
||||
"Softmax", \
|
||||
"MaxPool", \
|
||||
"GL_MaxPool", \
|
||||
"AvgPool", \
|
||||
"GL_AvgPool", \
|
||||
"SumPool", \
|
||||
"GL_SumPool", \
|
||||
"UpSample", \
|
||||
"Flatten", \
|
||||
"Lambda", \
|
||||
"Concat", \
|
||||
"Add", \
|
||||
"Sub", \
|
||||
"Mult", \
|
||||
}
|
||||
extern const char default_layer_names[][12];
|
||||
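The layer-type enum and the name macro above are kept in the same order, so an enum value can index the table directly (the array definition itself normally lives in the library's .c files). A minimal, illustrative sketch of how such a table is typically consumed:

#include <stdio.h>

/* illustrative: print a layer's type name by indexing the table with its enum value */
static void print_layer_type(const nnom_layer_t *layer)
{
    if (layer->type < NNOM_TYPE_MAX)
        printf("layer type: %s\n", default_layer_names[layer->type]);
}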
|
||||
// We don't count softmax as an activation here; softmax is instantiated as a layer
|
||||
typedef enum
|
||||
{
|
||||
ACT_UNKNOWN = 0,
|
||||
ACT_RELU,
|
||||
ACT_LEAKY_RELU,
|
||||
ACT_ADV_RELU,
|
||||
ACT_TANH,
|
||||
ACT_SIGMOID,
|
||||
ACT_HARD_TANH,
|
||||
ACT_HARD_SIGMOID
|
||||
} nnom_activation_type_t;
|
||||
|
||||
#define ACTIVATION_NAMES \
|
||||
{ \
|
||||
"Unknown", \
|
||||
"ReLU", \
|
||||
"LkyReLU", \
|
||||
"AdvReLU", \
|
||||
"TanH", \
|
||||
"Sigmoid", \
|
||||
"HrdTanH", \
|
||||
"HrdSigd", \
|
||||
}
|
||||
extern const char default_activation_names[][8];
|
||||
|
||||
// RNN cell type
|
||||
typedef enum
|
||||
{
|
||||
NNOM_UNKOWN_CELL = 0,
|
||||
NNOM_SIMPLE_CELL,
|
||||
NNOM_GRU_CELL,
|
||||
NNOM_LSTM_CELL,
|
||||
NNOM_CELL_TYPE_MAX
|
||||
} nnom_rnn_cell_type_t;
|
||||
|
||||
#define DEFUALT_CELL_NAMES \
|
||||
{ \
|
||||
"Unknown", \
|
||||
"Simple", \
|
||||
"GRU", \
|
||||
"LSTM", \
|
||||
}
|
||||
extern const char default_cell_names[][8];
|
||||
|
||||
|
||||
// parameters
|
||||
typedef enum
|
||||
{
|
||||
PADDING_VALID = 0,
|
||||
PADDING_SAME
|
||||
} nnom_padding_t;
|
||||
|
||||
#define NNOM_TENSOR_BUF_NULL (0) // This buffer is not in use
|
||||
#define NNOM_TENSOR_BUF_TEMP (1) // The memory in this IO is temporarily occupied and can be reused by other layers once the computation is done.
|
||||
#define NNOM_TENSOR_BUF_RESERVED (2) // the memory is reserved for this layer only (not to be reused by other layers).
|
||||
|
||||
// currently used in compiling.
|
||||
#define NNOM_BUF_EMPTY (0)
|
||||
#define NNOM_BUF_FILLED (1)
|
||||
|
||||
// basic types
|
||||
#define nnom_qformat_param_t int32_t // this should match the backend, need a better way to do it.
|
||||
#define nnom_shape_data_t uint16_t
|
||||
|
||||
typedef struct _nnom_3d_shape_t
|
||||
{
|
||||
nnom_shape_data_t h, w, c;
|
||||
} nnom_3d_shape_t;
|
||||
|
||||
typedef struct _nnom_border_t
|
||||
{
|
||||
nnom_shape_data_t top, bottom, left, right;
|
||||
} nnom_border_t;
|
||||
|
||||
// the nnom_3d_shape_axis_t type provides axis[] style access to nnom_3d_shape_t
|
||||
typedef union {
|
||||
nnom_3d_shape_t s;
|
||||
nnom_shape_data_t axis[sizeof(nnom_3d_shape_t) / sizeof(nnom_shape_data_t)];
|
||||
} nnom_3d_shape_axis_t;
|
||||
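A small sketch of what the union above allows: the same shape can be read by field name or by axis index. shape() is declared in nnom_layers.h; the values here are purely illustrative.

/* illustrative: h/w/c and axis[0..2] alias the same storage */
static nnom_shape_data_t height_via_axis(void)
{
    nnom_3d_shape_axis_t a;
    a.s = shape(28, 28, 3);
    return a.axis[0];   /* same value as a.s.h, i.e. 28 */
}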
|
||||
// tensor quantisation types
|
||||
typedef enum
|
||||
{
|
||||
NNOM_QTYPE_PER_TENSOR = 0,
|
||||
NNOM_QTYPE_PER_AXIS = 1
|
||||
} nnom_qtype_t;
|
||||
|
||||
typedef struct _nnom_weights
|
||||
{
|
||||
const void *p_value;
|
||||
nnom_qformat_param_t shift;
|
||||
} nnom_weight_t;
|
||||
|
||||
typedef struct _nnom_bias
|
||||
{
|
||||
const void *p_value;
|
||||
nnom_qformat_param_t shift;
|
||||
} nnom_bias_t;
|
||||
|
||||
// experimental
|
||||
typedef struct _nnom_tensor_t
|
||||
{
|
||||
void* p_data; // value
|
||||
nnom_shape_data_t *dim; // dimension of this tensor
|
||||
nnom_qformat_param_t *q_dec; // number of decimal bit for Q format (scale)
|
||||
nnom_qformat_param_t *q_offset; // offset for each channel
|
||||
nnom_qtype_t qtype; // the quantisation type
|
||||
uint8_t num_dim; // the number of dimension
|
||||
uint8_t bitwidth; // the data bit width, only support 8bit now
|
||||
} nnom_tensor_t;
|
||||
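A hedged sketch of how the dim/num_dim fields are typically combined, e.g. to get the number of elements in a tensor. This is not a library function (the library's own tensor helpers live in nnom_tensor.h); the name is illustrative.

/* illustrative helper: product of all dimensions */
static size_t tensor_num_elements(const nnom_tensor_t *t)
{
    size_t n = 1;
    for (uint8_t i = 0; i < t->num_dim; i++)
        n *= t->dim[i];
    return n;
}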
|
||||
// nn wrappers
|
||||
typedef struct _nnom_layer_t nnom_layer_t;
|
||||
typedef struct _nnom_layer_io_t nnom_layer_io_t;
|
||||
typedef struct _nnom_layer_hook_t nnom_layer_hook_t;
|
||||
typedef struct _nnom_mem_block_t nnom_mem_block_t;
|
||||
|
||||
// activation wrapper
|
||||
typedef struct _nnom_activation_t nnom_activation_t;
|
||||
|
||||
typedef struct _nnom_buf
|
||||
{
|
||||
nnom_mem_block_t *mem;
|
||||
size_t size;
|
||||
uint8_t type;
|
||||
} nnom_buf_t;
|
||||
|
||||
// a memory block to store pre-assigned memory during compiling, then assigned to each tensor afterwards.
|
||||
struct _nnom_mem_block_t
|
||||
{
|
||||
void *blk; // data block location
|
||||
size_t size; // the maximum size for this block
|
||||
uint8_t owners; // how many layers own this block
|
||||
uint8_t state; // empty? filled? for static nn, currently only used in compiling
|
||||
};
|
||||
|
||||
typedef struct _nnom_stat_t
|
||||
{
|
||||
size_t macc; //num. of mac operation
|
||||
uint32_t time;
|
||||
} nnom_layer_stat_t;
|
||||
|
||||
struct _nnom_layer_hook_t
|
||||
{
|
||||
nnom_layer_io_t *io; // hooked io
|
||||
nnom_layer_hook_t *next; // next hook include secondary hooked layer
|
||||
};
|
||||
|
||||
struct _nnom_layer_io_t
|
||||
{
|
||||
nnom_layer_hook_t hook; // for example: (layer->out)--hook--(layer->in)
|
||||
nnom_layer_io_t *aux; // points to auxiliary I/O (for multiple-I/O layers)
|
||||
nnom_tensor_t *tensor; // experimental
|
||||
nnom_mem_block_t *mem; // memory block handle for compiling only. The memory is now passed via the tensor; trying to remove it.
|
||||
nnom_layer_t *owner; // which layer owns this io.
|
||||
uint8_t type;
|
||||
};
|
||||
|
||||
// structured configuration base type
|
||||
typedef struct _nnom_layer_config_t
|
||||
{
|
||||
char* name; // the layer's name in the pre-quantised model (the model trained by the user before being converted to nnom)
|
||||
} nnom_layer_config_t;
|
||||
|
||||
// layers base
|
||||
struct _nnom_layer_t
|
||||
{
|
||||
nnom_layer_t *shortcut; // shortcut points to the next layer, applied on compiling
|
||||
|
||||
nnom_status_t (*run)(nnom_layer_t *layer); // run method. required
|
||||
nnom_status_t (*build)(nnom_layer_t *layer); // compute output buffer shape. can be left null, will call default_build()
|
||||
nnom_status_t (*free)(nnom_layer_t *layer); // a callback to free private resources (comp buf not included) can be left null
|
||||
nnom_buf_t *comp; // computational buf
|
||||
nnom_activation_t *actail; // I have an activation, I have a tail, wooo haaaa, act-tail!!!
|
||||
|
||||
nnom_layer_config_t *config; // point to the configuration of the layers. for machine api only.
|
||||
nnom_layer_type_t type; // layer types
|
||||
nnom_layer_io_t *in; // IO buff, last*layer, states
|
||||
nnom_layer_io_t *out; // IO buff, next*layer, states
|
||||
nnom_layer_stat_t stat; // stats, timing, ops
|
||||
};
|
||||
|
||||
// activation base
|
||||
struct _nnom_activation_t
|
||||
{
|
||||
nnom_status_t (*run)(struct _nnom_activation_t *act);
|
||||
nnom_tensor_t *tensor;
|
||||
nnom_activation_type_t type;
|
||||
};
|
||||
|
||||
// local static functions when libc is not available
|
||||
#ifdef NNOM_USING_STATIC_MEMORY
|
||||
void nnom_set_static_buf(void* buf, size_t size);
|
||||
void *nnom_malloc(size_t size);
|
||||
void nnom_free(void* p);
|
||||
#endif //NNOM_USING_STATIC_MEMORY
|
||||
|
||||
typedef struct _nnom_model nnom_model_t;
|
||||
|
||||
#include "nnom_tensor.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "nnom_utils.h"
|
||||
|
||||
// models; I don't want to make the model class a child of the layer class yet
|
||||
struct _nnom_model
|
||||
{
|
||||
nnom_layer_t *head;
|
||||
nnom_layer_t *tail;
|
||||
|
||||
// model constructor
|
||||
nnom_status_t (*add)(struct _nnom_model *m, nnom_layer_t *layer); // has to pass a raw value
|
||||
nnom_layer_t *(*hook)(nnom_layer_t *curr, nnom_layer_t *last); // create hook between 2 layers' primary IO.
|
||||
nnom_layer_t *(*merge)(nnom_layer_t *method, nnom_layer_t *in1, nnom_layer_t *in2); // an older interface for merging 2 inputs.
|
||||
nnom_layer_t *(*mergex)(nnom_layer_t *method, int num, ...); // merge several layers using a multiple-input method (concat, add, ...)
|
||||
nnom_layer_t *(*active)(nnom_activation_t *act, nnom_layer_t *target_layer); // add the activation to the existing layer's tail
|
||||
|
||||
// callback
|
||||
nnom_status_t (*layer_callback)(nnom_model_t *m, nnom_layer_t *layer); // layer callback will be called after each layer (after actail).
|
||||
|
||||
// block memory for layers
|
||||
nnom_mem_block_t blocks[NNOM_BLOCK_NUM];
|
||||
|
||||
size_t total_ops;
|
||||
|
||||
bool is_inited; // is this structure initialized
|
||||
bool is_allocated; // is this structure allocated by nnom (not by user)
|
||||
};
|
||||
|
||||
#define NNOM_NULL_CHECK(p) \
|
||||
if ((p) == NULL) \
|
||||
{ \
|
||||
NNOM_LOG("Error: NULL object.\n"); \
|
||||
return NN_ARGUMENT_ERROR; \
|
||||
}
|
||||
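The macro above returns NN_ARGUMENT_ERROR from the calling function when it sees a NULL argument; a short sketch of the intended usage pattern (the function name is made up for illustration):

/* illustrative: guard the arguments of a layer method */
static nnom_status_t example_layer_op(nnom_layer_t *layer)
{
    NNOM_NULL_CHECK(layer)      /* returns NN_ARGUMENT_ERROR if layer is NULL */
    NNOM_NULL_CHECK(layer->in)
    return NN_SUCCESS;
}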
|
||||
|
||||
// utils
|
||||
size_t nnom_alignto(size_t value, uint32_t alignment);
|
||||
size_t nnom_io_length(nnom_layer_io_t *io);
|
||||
size_t nnom_hook_length(nnom_layer_hook_t *hook);
|
||||
|
||||
// memory (malloc + memset 0)
|
||||
void *nnom_mem(size_t size);
|
||||
|
||||
// get how much memory has been taken
|
||||
size_t nnom_mem_stat(void);
|
||||
|
||||
// Model APIs
|
||||
// create or init a model
|
||||
nnom_model_t *new_model(nnom_model_t *m);
|
||||
// compile as a sequential model
|
||||
nnom_status_t sequencial_compile(nnom_model_t *m);
|
||||
// compile as functional model
|
||||
nnom_status_t model_compile(nnom_model_t *m, nnom_layer_t *input, nnom_layer_t *output);
|
||||
// run a prediction
|
||||
nnom_status_t model_run(nnom_model_t *m);
|
||||
// delete model.
|
||||
void model_delete(nnom_model_t *m);
|
||||
// check version
|
||||
nnom_status_t check_model_version(unsigned long model_version);
|
||||
|
||||
// callback, called after each layer has finished the calculation.
|
||||
// this callback must return NN_SUCCESS for the model to continue running; otherwise, the model run returns with the error code.
|
||||
// this function returns NN_LENGTH_ERROR if a different callback has already been set.
|
||||
nnom_status_t model_set_callback(nnom_model_t *m, nnom_status_t (*layer_callback)(nnom_model_t *m, nnom_layer_t *layer));
|
||||
// delete callback.
|
||||
void model_delete_callback(nnom_model_t *m);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_H__ */
|
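To tie the model APIs above together, a rough usage sketch of the sequential path. Weights and real layers are omitted (in practice the code generated by the NNoM scripts builds them), and the buffers, shapes and function name are purely illustrative of the call order, assuming new_model(NULL) asks the library to allocate and initialise the model and wire up its constructor methods.

#include "nnom.h"

static int8_t input_data[28 * 28];   /* illustrative input buffer  */
static int8_t output_data[10];       /* illustrative output buffer */

static void run_once_example(void)
{
    nnom_model_t *m = new_model(NULL);                 /* create + init */
    m->add(m, Input(shape(28, 28, 1), input_data));
    /* ... add Conv2D/Dense/... layers here, normally from generated code ... */
    m->add(m, Output(shape(1, 1, 10), output_data));
    sequencial_compile(m);                             /* plan memory blocks */
    model_run(m);                                      /* one inference */
    model_delete(m);
}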
191
components/ai/nnom/inc/nnom_layers.h
Normal file
@@ -0,0 +1,191 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-02-05 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_LAYERS_H__
|
||||
#define __NNOM_LAYERS_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
|
||||
// properties
|
||||
nnom_3d_shape_t shape(size_t h, size_t w, size_t c);
|
||||
nnom_3d_shape_t kernel(size_t h, size_t w);
|
||||
nnom_3d_shape_t stride(size_t h, size_t w);
|
||||
nnom_3d_shape_t dilation(size_t h, size_t w);
|
||||
nnom_border_t border(size_t top, size_t bottom, size_t left, size_t right);
|
||||
//nnom_qformat_t qformat(int8_t m, int8_t n);
|
||||
size_t shape_size(nnom_3d_shape_t* s);
|
||||
|
||||
// this function adds a new IO to the currently initialised IO
|
||||
// input: the targeted IO that the new IO will be added to
|
||||
// output: the new IO
|
||||
nnom_layer_io_t* io_add_aux(nnom_layer_io_t* targeted_io);
|
||||
nnom_layer_io_t *io_init(void *owner_layer, nnom_layer_io_t *io);
|
||||
|
||||
#define NN_CEILIF(x,y) ((x+y-1)/y)
|
||||
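NN_CEILIF is integer ceiling division; for example NN_CEILIF(7, 2) expands to (7 + 2 - 1) / 2 = 4, which is the usual way 'same'-padding output sizes are rounded up from input size and stride.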
|
||||
#include "layers/nnom_activation.h"
|
||||
#include "layers/nnom_concat.h"
|
||||
#include "layers/nnom_conv2d.h"
|
||||
#include "layers/nnom_cropping.h"
|
||||
#include "layers/nnom_conv2d_trans.h"
|
||||
#include "layers/nnom_dense.h"
|
||||
#include "layers/nnom_dw_conv2d.h"
|
||||
#include "layers/nnom_flatten.h"
|
||||
#include "layers/nnom_global_pool.h"
|
||||
#include "layers/nnom_input.h"
|
||||
#include "layers/nnom_lambda.h"
|
||||
#include "layers/nnom_matrix.h"
|
||||
#include "layers/nnom_maxpool.h"
|
||||
#include "layers/nnom_avgpool.h"
|
||||
#include "layers/nnom_output.h"
|
||||
#include "layers/nnom_rnn.h"
|
||||
#include "layers/nnom_softmax.h"
|
||||
#include "layers/nnom_sumpool.h"
|
||||
#include "layers/nnom_upsample.h"
|
||||
#include "layers/nnom_zero_padding.h"
|
||||
#include "layers/nnom_rnn.h"
|
||||
#include "layers/nnom_simple_cell.h"
|
||||
#include "layers/nnom_lstm_cell.h"
|
||||
#include "layers/nnom_gru_cell.h"
|
||||
|
||||
// Layer APIs ******
|
||||
// (a summary for each individual layer's files)
|
||||
|
||||
// input/output
|
||||
nnom_layer_t *Input(nnom_3d_shape_t input_shape, void *p_buf);
|
||||
nnom_layer_t *Output(nnom_3d_shape_t output_shape, void *p_buf);
|
||||
|
||||
// Pooling
|
||||
nnom_layer_t *MaxPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad);
|
||||
nnom_layer_t *AvgPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad);
|
||||
nnom_layer_t *SumPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad);
|
||||
nnom_layer_t *GlobalMaxPool(void);
|
||||
nnom_layer_t *GlobalAvgPool(void);
|
||||
nnom_layer_t *GlobalSumPool(void);
|
||||
|
||||
// padding, cropping, upsample
|
||||
nnom_layer_t *UpSample(nnom_3d_shape_t kernel);
|
||||
nnom_layer_t *ZeroPadding(nnom_border_t pad);
|
||||
nnom_layer_t *Cropping(nnom_border_t pad);
|
||||
|
||||
// Activation
|
||||
nnom_layer_t *Activation(nnom_activation_t *act);
|
||||
nnom_layer_t *ReLU(void);
|
||||
nnom_layer_t *LeakyReLU(float alpha);
|
||||
nnom_layer_t *Softmax(void);
|
||||
nnom_layer_t *Sigmoid(int32_t dec_bit); // input dec bit
|
||||
nnom_layer_t *TanH(int32_t dec_bit); // input dec bit
|
||||
|
||||
// Matrix
|
||||
nnom_layer_t *Add(int16_t oshift); // output shift
|
||||
nnom_layer_t *Sub(int16_t oshift); // output shift
|
||||
nnom_layer_t *Mult(int16_t oshift); // output shift
|
||||
|
||||
nnom_layer_t *Flatten(void);
|
||||
nnom_layer_t *Concat(int8_t axis);
|
||||
// -- NN Constructors --
|
||||
// conv2d
|
||||
nnom_layer_t *Conv2D(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad,
|
||||
const nnom_weight_t *w, const nnom_bias_t *b);
|
||||
|
||||
// deconv2d
|
||||
nnom_layer_t *Conv2DTrans(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad,
|
||||
const nnom_weight_t *w, const nnom_bias_t *b);
|
||||
|
||||
// depthwise_convolution
|
||||
nnom_layer_t *DW_Conv2D(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad,
|
||||
const nnom_weight_t *w, const nnom_bias_t *b);
|
||||
|
||||
// fully connected, dense
|
||||
nnom_layer_t *Dense(size_t output_unit, const nnom_weight_t *w, const nnom_bias_t *b);
|
||||
|
||||
|
||||
// Lambda Layers
|
||||
nnom_layer_t *Lambda(nnom_status_t (*run)(nnom_layer_t *), // run method, required
|
||||
nnom_status_t (*build)(nnom_layer_t *), // optional, call default_build() if left null
|
||||
nnom_status_t (*free)(nnom_layer_t *), // not required if no resources need to be deleted; can be left null.
|
||||
void *parameters); // user private parameters for run method, left null if not needed.
|
||||
|
||||
// building methods
|
||||
nnom_status_t default_build(nnom_layer_t* layer);
|
||||
nnom_status_t input_build(nnom_layer_t* layer);
|
||||
|
||||
nnom_status_t conv2d_build(nnom_layer_t* layer);
|
||||
nnom_status_t dw_conv2d_build(nnom_layer_t* layer);
|
||||
nnom_status_t conv2d_trans_build(nnom_layer_t* layer);
|
||||
nnom_status_t dense_build(nnom_layer_t* layer);
|
||||
nnom_status_t rnn_build(nnom_layer_t* layer);
|
||||
|
||||
nnom_status_t upsample_build(nnom_layer_t* layer);
|
||||
nnom_status_t zero_padding_build(nnom_layer_t* layer);
|
||||
nnom_status_t cropping_build(nnom_layer_t* layer);
|
||||
|
||||
nnom_status_t maxpool_build(nnom_layer_t* layer);
|
||||
nnom_status_t avgpool_build(nnom_layer_t* layer);
|
||||
nnom_status_t sumpool_build(nnom_layer_t* layer);
|
||||
nnom_status_t global_pool_build(nnom_layer_t* layer);
|
||||
|
||||
nnom_status_t flatten_build(nnom_layer_t* layer);
|
||||
nnom_status_t concat_build(nnom_layer_t* layer);
|
||||
|
||||
// run
|
||||
nnom_status_t input_run(nnom_layer_t* layer);
|
||||
nnom_status_t output_run(nnom_layer_t* layer);
|
||||
nnom_status_t flatten_run(nnom_layer_t* layer);
|
||||
nnom_status_t default_run(nnom_layer_t* layer); // simply copy data from input to output
|
||||
|
||||
nnom_status_t dw_conv2d_run(nnom_layer_t* layer);
|
||||
nnom_status_t conv2d_run(nnom_layer_t* layer);
|
||||
nnom_status_t conv2d_trans_run(nnom_layer_t* layer);
|
||||
nnom_status_t dense_run(nnom_layer_t* layer);
|
||||
nnom_status_t rnn_run(nnom_layer_t* layer);
|
||||
|
||||
nnom_status_t upsample_run(nnom_layer_t* layer);
|
||||
nnom_status_t zero_padding_run(nnom_layer_t* layer);
|
||||
nnom_status_t cropping_run(nnom_layer_t* layer);
|
||||
|
||||
nnom_status_t activation_run(nnom_layer_t* layer);
|
||||
nnom_status_t softmax_run(nnom_layer_t* layer);
|
||||
|
||||
nnom_status_t maxpool_run(nnom_layer_t* layer);
|
||||
nnom_status_t avgpool_run(nnom_layer_t* layer);
|
||||
nnom_status_t sumpool_run(nnom_layer_t* layer);
|
||||
|
||||
nnom_status_t concat_run(nnom_layer_t* layer);
|
||||
nnom_status_t add_run(nnom_layer_t* layer);
|
||||
nnom_status_t sub_run(nnom_layer_t* layer);
|
||||
nnom_status_t mult_run(nnom_layer_t* layer);
|
||||
|
||||
// Activation APIs
|
||||
// Softmax is not considered an activation in NNoM; Softmax is in the layer API.
|
||||
nnom_activation_t* act_relu(void);
|
||||
nnom_activation_t* act_leaky_relu(float alpha);
|
||||
nnom_activation_t* act_sigmoid(int32_t dec_bit);
|
||||
nnom_activation_t* act_tanh(int32_t dec_bit);
|
||||
|
||||
// direct API
|
||||
nnom_status_t act_tensor_run(nnom_activation_t* act, nnom_tensor_t* tensor);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_LAYERS_H__ */
|
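For the functional path, the layer constructors above are chained through the model's hook/merge methods and then compiled from input to output. A rough, illustrative skeleton of the shape of such code, following the idiom of the NNoM examples (buffers and layer choices here are placeholders, not a meaningful network):

#include "nnom.h"

static int8_t input_buffer[28 * 28];   /* illustrative input buffer */

static void build_functional_example(void)
{
    static nnom_model_t model;
    nnom_layer_t *input_layer, *x;

    new_model(&model);
    input_layer = Input(shape(28, 28, 1), input_buffer);
    x = model.hook(MaxPool(kernel(2, 2), stride(2, 2), PADDING_VALID), input_layer);
    x = model.hook(Flatten(), x);
    x = model.hook(Softmax(), x);
    model_compile(&model, input_layer, x);   /* compile the graph from input to output */
}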
974
components/ai/nnom/inc/nnom_local.h
Normal file
@@ -0,0 +1,974 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Notice:
|
||||
* Code in this file includes derivative works from CMSIS, which is released under an alternative license.
|
||||
* Please check the LICENSE file for details.
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-02-05 Jianjia Ma The first version
|
||||
* 2019-03-19 Jianjia Ma Local C implementation partly from CMSIS-NN
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_LOCAL_H__
|
||||
#define __NNOM_LOCAL_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
#include "stdint.h"
|
||||
#include "nnom_port.h"
|
||||
|
||||
#ifdef ARM_NN_TRUNCATE
|
||||
#define NNOM_TRUNCATE
|
||||
#endif
|
||||
|
||||
// SSAT implementation with C code
|
||||
#ifndef __NNOM_SSAT
|
||||
static inline int __NNOM_SSAT(int32_t value, int32_t bit) {
|
||||
int32_t min = -(1<<(bit-1));
|
||||
int32_t max = (1<<(bit-1)) - 1;
|
||||
if (value < min)
|
||||
return min;
|
||||
else if (value > max)
|
||||
return max;
|
||||
else
|
||||
return value;
|
||||
}
|
||||
#endif
|
||||
|
||||
// USAT implementation with C code
|
||||
#ifndef __NNOM_USAT
|
||||
static inline int __NNOM_USAT(int32_t value, int32_t bit) {
|
||||
int32_t max = (1<<(bit-1)) - 1;
|
||||
if (value < 0)
|
||||
return 0;
|
||||
else if (value > max)
|
||||
return max;
|
||||
else
|
||||
return value;
|
||||
}
|
||||
#endif
|
||||
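A few worked values for the saturation helpers above, following the code exactly as written here (names below are illustrative):

/* illustrative: worked examples for __NNOM_SSAT / __NNOM_USAT */
static void saturation_examples(void)
{
    int a = __NNOM_SSAT( 300, 8);  /* ->  127, clamped to the q7 maximum  */
    int b = __NNOM_SSAT(-300, 8);  /* -> -128, clamped to the q7 minimum  */
    int c = __NNOM_USAT(  -5, 8);  /* ->    0, negative inputs clamp to 0 */
    (void)a; (void)b; (void)c;
}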
|
||||
#define MAX(A, B) ((A) > (B) ? (A) : (B))
|
||||
#define MIN(A, B) ((A) < (B) ? (A) : (B))
|
||||
|
||||
|
||||
// The functions/tables below are partially modified from the CMSIS-NN lib
|
||||
// https://github.com/ARM-software/CMSIS_5
|
||||
//
|
||||
void local_avepool_q7_HWC(const q7_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
const uint16_t output_shift, // output right shift
|
||||
q7_t *bufferA, // a buffer for local storage, NULL by now
|
||||
q7_t *Im_out);
|
||||
|
||||
void local_avepool_q7_CHW(const q7_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
const uint16_t output_shift, // output right shift
|
||||
q7_t *bufferA, // a buffer for local storage, NULL by now
|
||||
q7_t *Im_out);
|
||||
|
||||
// modified from CMSIS-NN test_ref
|
||||
void local_maxpool_q7_HWC(const q7_t * Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
q7_t * bufferA, // a buffer for local storage, NULL by now
|
||||
q7_t * Im_out);
|
||||
|
||||
void local_maxpool_q7_CHW(const q7_t * Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
q7_t * bufferA, // a buffer for local storage, NULL by now
|
||||
q7_t * Im_out);
|
||||
|
||||
void local_sumpool_q7_HWC(const q7_t * Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
q7_t * bufferA, // a buffer for local storage, size = 4*output_size
|
||||
q7_t * Im_out);
|
||||
|
||||
void local_sumpool_q7_CHW(const q7_t * Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
q7_t * bufferA, // a buffer for local storage, size = 4*output_size
|
||||
q7_t * Im_out);
|
||||
|
||||
// customised up sample pooling
|
||||
void local_up_sampling_q7_HWC(const q7_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
q7_t *bufferA, // NULL
|
||||
q7_t *Im_out);
|
||||
|
||||
void local_up_sampling_q7_CHW(const q7_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
q7_t *bufferA, // NULL
|
||||
q7_t *Im_out);
|
||||
|
||||
void local_convolve_HWC_q7_nonsquare(const q7_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const q7_t *wt, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const uint16_t dilation_x, // dilation x
|
||||
const uint16_t dilation_y, // dilation y
|
||||
const q7_t *bias, // bias
|
||||
const nnom_qformat_param_t *bias_shift, // bias shifts
|
||||
const nnom_qformat_param_t *out_shift, // output shift
|
||||
const nnom_qtype_t q_type, // per channel or per tensor
|
||||
q7_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y, // output image dimension y
|
||||
q15_t *bufferA, //buffer space for input
|
||||
q7_t *bufferB //buffer space for output
|
||||
);
|
||||
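The *_nonsquare convolution kernels above take both the input and the output dimensions, so callers are expected to have computed the output size beforehand. A hedged sketch of the usual relationship for one axis; this mirrors the standard dilated-convolution arithmetic and is not a function exported by this header:

/* illustrative: output size along one axis from input size, kernel, padding, stride, dilation */
static uint16_t conv_out_dim(uint16_t in, uint16_t kernel, uint16_t pad,
                             uint16_t stride, uint16_t dilation)
{
    uint16_t eff_k = (uint16_t)(dilation * (kernel - 1) + 1); /* effective kernel size */
    return (uint16_t)((in + 2 * pad - eff_k) / stride + 1);
}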
|
||||
void local_convolve_CHW_q7_nonsquare(const q7_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const q7_t *wt, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const uint16_t dilation_x, // dilation x
|
||||
const uint16_t dilation_y, // dilation y
|
||||
const q7_t *bias, // bias
|
||||
const nnom_qformat_param_t *bias_shift, // bias shifts
|
||||
const nnom_qformat_param_t *out_shift, // output shift
|
||||
const nnom_qtype_t q_type, // per channel or per tensor
|
||||
q7_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y, // output image dimension y
|
||||
q15_t *bufferA, //buffer space for input
|
||||
q7_t *bufferB //buffer space for output
|
||||
);
|
||||
|
||||
void local_conv_trans_HWC_q7_nonsquare(const int8_t * Im_in,
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const q7_t *wt, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const uint16_t dilation_x, // dilation x
|
||||
const uint16_t dilation_y, // dilation y
|
||||
const q7_t *bias, // bias
|
||||
const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y, // output image dimension y
|
||||
q15_t *bufferA, //buffer space for input
|
||||
q7_t *bufferB //buffer space for output
|
||||
);
|
||||
|
||||
void local_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,// input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const q7_t *wt, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const uint16_t dilation_x, // dilation x
|
||||
const uint16_t dilation_y, // dilation y
|
||||
const q7_t *bias, // bias
|
||||
const nnom_qformat_param_t *bias_shift, // bias shifts
|
||||
const nnom_qformat_param_t *out_shift, // output shift
|
||||
const nnom_qtype_t q_type, // per channel or per tensor
|
||||
q7_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y, // output image dimension y
|
||||
q15_t *bufferA, //buffer space for input
|
||||
q7_t *bufferB //buffer space for output
|
||||
);
|
||||
|
||||
void local_depthwise_separable_conv_CHW_q7_nonsquare(const q7_t *Im_in,// input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const q7_t *wt, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const uint16_t dilation_x, // dilation x
|
||||
const uint16_t dilation_y, // dilation y
|
||||
const q7_t *bias, // bias
|
||||
const nnom_qformat_param_t *bias_shift, // bias shifts
|
||||
const nnom_qformat_param_t *out_shift, // output shift
|
||||
const nnom_qtype_t q_type, // per channel or per tensor
|
||||
q7_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y, // output image dimension y
|
||||
q15_t *bufferA, //buffer space for input
|
||||
q7_t *bufferB //buffer space for output
|
||||
);
|
||||
|
||||
void local_zero_padding_HWC_q7(const q7_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t padding_top, // padding sizes y
|
||||
const uint16_t padding_bottom, // padding sizes y
|
||||
const uint16_t padding_left, // padding sizes x
|
||||
const uint16_t padding_right, // padding sizes x
|
||||
q7_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y); // output image dimension y
|
||||
|
||||
void local_zero_padding_CHW_q7(const q7_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t padding_top, // padding sizes y
|
||||
const uint16_t padding_bottom, // padding sizes y
|
||||
const uint16_t padding_left, // padding sizes x
|
||||
const uint16_t padding_right, // padding sizes x
|
||||
q7_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y); // output image dimension y
|
||||
|
||||
void local_cropping_HWC_q7(const q7_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t padding_top, // padding sizes y
|
||||
const uint16_t padding_bottom, // padding sizes y
|
||||
const uint16_t padding_left, // padding sizes x
|
||||
const uint16_t padding_right, // padding sizes x
|
||||
q7_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y); // output image dimension y
|
||||
|
||||
void local_cropping_CHW_q7(const q7_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t padding_top, // padding sizes y
|
||||
const uint16_t padding_bottom, // padding sizes y
|
||||
const uint16_t padding_left, // padding sizes x
|
||||
const uint16_t padding_right, // padding sizes x
|
||||
q7_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y); // output image dimension y
|
||||
|
||||
void local_fully_connected_q7_opt(const q7_t * pV, // pointer to vector
|
||||
const q7_t * pM, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
const uint16_t num_of_rows, // numCol of A
|
||||
const uint16_t bias_shift, // amount of left-shift for bias
|
||||
const uint16_t out_shift, // amount of right-shift for output
|
||||
const q7_t * bias, q7_t * pOut, // output operand
|
||||
q15_t * vec_buffer);
|
||||
|
||||
|
||||
void local_fully_connected_q7(const q7_t * pV, // pointer to vector
|
||||
const q7_t * pM, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
const uint16_t num_of_rows, // numCol of A
|
||||
const uint16_t bias_shift, // amount of left-shift for bias
|
||||
const uint16_t out_shift, // amount of right-shift for output
|
||||
const q7_t * bias, q7_t * pOut, // output operand
|
||||
q15_t * vec_buffer);
|
||||
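To make the bias_shift/out_shift parameters above concrete, here is a hedged scalar sketch of the usual CMSIS-style q7 fully-connected arithmetic for a single output neuron (no weight reordering; the real routines are optimised and may differ in detail). It reuses NNOM_ROUND from nnom.h and __NNOM_SSAT from earlier in this header; the function name is illustrative.

/* illustrative reference: acc = (bias << bias_shift) + sum(w*x), then round, shift, saturate */
static q7_t dense_ref_one_output(const q7_t *x, const q7_t *w, uint16_t dim_vec,
                                 q7_t bias, uint16_t bias_shift, uint16_t out_shift)
{
    q31_t acc = ((q31_t)bias << bias_shift) + NNOM_ROUND(out_shift);
    for (uint16_t i = 0; i < dim_vec; i++)
        acc += (q31_t)x[i] * w[i];
    return (q7_t)__NNOM_SSAT(acc >> out_shift, 8);
}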
|
||||
// matrix dot,
|
||||
// it takes reordered weights as input (see the dense layer for details; this is basically a dense opt without bias)
|
||||
void local_dot_q7_opt(const q7_t *pV, // pointer to vector
|
||||
const q7_t *pM, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
const uint16_t num_of_rows, // numCol of A
|
||||
const uint16_t out_shift, // amount of right-shift for output
|
||||
q7_t *pOut); // result buffer
|
||||
|
||||
void local_dot_q7(const q7_t *pV, // pointer to vector
|
||||
const q7_t *pM, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
const uint16_t num_of_rows, // numCol of A
|
||||
const uint16_t out_shift, // amount of right-shift for output
|
||||
q7_t *pOut); // output operand)
|
||||
|
||||
|
||||
|
||||
// softmax
|
||||
void local_softmax_q7(const q7_t * vec_in, const uint32_t dim_vec, q7_t * p_out);
|
||||
|
||||
// sigmoid
|
||||
void local_sigmoid_q7(q7_t * data, uint32_t size, int16_t int_width);
|
||||
|
||||
// tanh
|
||||
void local_tanh_q7(q7_t * data, uint32_t size, int16_t int_width);
|
||||
|
||||
// relu
|
||||
void local_relu_q7(q7_t * data, uint32_t size);
|
||||
|
||||
// leaky relu
|
||||
void local_leaky_relu_q7(q7_t *data, q7_t alpha, uint32_t size);
|
||||
|
||||
// alpha in q7 format with dec_bit=7
|
||||
// max and threshold have the same Q format as the activation
|
||||
void local_adv_relu_q7(q7_t *data, q7_t alpha, q7_t max, q7_t threshold, uint32_t size);
|
||||
|
||||
// hard sigmoid,
|
||||
// y=-1 if x < -2.5
|
||||
// y=1 if x > 2.5
|
||||
// otherwise y = 0.2 * x + 0.5 (y=0.20315 * x + 0.5)
|
||||
void local_hard_sigmoid_q7(q7_t *data, uint32_t size, int16_t dec_bit);
|
||||
|
||||
// hard tanh
|
||||
// y=-1 if x < -1
|
||||
// y=1 if x > 1
|
||||
// otherwise y = x
|
||||
void local_hard_tanh_q7(q7_t *data, uint32_t size, int16_t dec_bit);
|
||||
|
||||
// matrix ops
|
||||
void local_mult_q7(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst, const uint16_t out_shift, uint32_t blockSize);
|
||||
|
||||
// add
|
||||
void local_add_q7(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst, const uint16_t out_shift, uint32_t blockSize);
|
||||
|
||||
// sub
|
||||
void local_sub_q7(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst, const uint16_t out_shift, uint32_t blockSize);
|
||||
|
||||
// take multiple blocks (>2) as input
|
||||
void local_multiple_add_q7( q7_t *p_dst,
|
||||
const int16_t out_shift,
|
||||
uint32_t block_size,
|
||||
uint32_t num_block,
|
||||
q7_t **p_src);
|
||||
|
||||
void local_multiple_mult_q7( q7_t *p_dst,
|
||||
const int16_t out_shift,
|
||||
uint32_t block_size,
|
||||
uint32_t num_block,
|
||||
q7_t **p_src);
|
||||
|
||||
void local_multiple_sub_q7( q7_t *p_dst,
|
||||
const int16_t out_shift,
|
||||
uint32_t block_size,
|
||||
uint32_t num_block,
|
||||
q7_t **p_src);
|
||||
|
||||
|
||||
// Below tables credit to CMSIS
|
||||
// For more info. check CMSIS-NN lib
|
||||
// https://github.com/ARM-software/CMSIS_5/blob/develop/CMSIS/NN/Source/NNSupportFunctions/arm_nntables.c
|
||||
static const q7_t nnom_sigmoid_table_q7[256] = {
|
||||
0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e,
|
||||
0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c,
|
||||
0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67,
|
||||
0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70,
|
||||
0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76,
|
||||
0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a,
|
||||
0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c,
|
||||
0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e,
|
||||
0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||
0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04,
|
||||
0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06,
|
||||
0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09,
|
||||
0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e,
|
||||
0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
|
||||
0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21,
|
||||
0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e,
|
||||
0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
|
||||
};
|
||||
|
||||
|
||||
static const q7_t nnom_tanh_table_q7[256] = {
|
||||
0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35,
|
||||
0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e,
|
||||
0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72,
|
||||
0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b,
|
||||
0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e,
|
||||
0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81,
|
||||
0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82,
|
||||
0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84,
|
||||
0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b,
|
||||
0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b,
|
||||
0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf,
|
||||
0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8,
|
||||
};
|
||||
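Both tables are laid out so that the raw two's-complement bit pattern of a q7 input can serve as an unsigned index: entries 0..127 cover positive inputs and 128..255 the negative ones. A minimal sketch of that indexing, deliberately ignoring the int_width pre-scaling that the real local_sigmoid_q7/local_tanh_q7 routines apply (so treat it as an assumption about the table layout, not the library's algorithm):

/* illustrative only: direct table lookup without the int_width scaling step */
static q7_t sigmoid_lookup_example(q7_t x)
{
    return nnom_sigmoid_table_q7[(uint8_t)x]; /* negative x wraps to indices 128..255 */
}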
|
||||
|
||||
// ------------ 16bit ops --------------------
|
||||
|
||||
void local_avepool_q15_HWC(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
const uint16_t output_shift, // output right shift
|
||||
q7_t *bufferA, // a buffer for local storage, NULL by now
|
||||
q15_t *Im_out);
|
||||
|
||||
void local_avepool_q15_CHW(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
const uint16_t output_shift, // output right shift
|
||||
q7_t *bufferA, // a buffer for local storage, NULL by now
|
||||
q15_t *Im_out);
|
||||
|
||||
void local_maxpool_q15_HWC(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
q7_t *bufferA, // a buffer for local storage, NULL by now
|
||||
q15_t *Im_out);
|
||||
|
||||
void local_maxpool_q15_CHW(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
q7_t *bufferA, // a buffer for local storage, NULL by now
|
||||
q15_t *Im_out);
|
||||
|
||||
void local_sumpool_q15_HWC(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
const uint16_t output_shift, // output right shift
|
||||
q7_t *bufferA, // a buffer for local storage, size = 4*output_size
|
||||
q15_t *Im_out);
|
||||
|
||||
void local_sumpool_q15_CHW(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
const uint16_t output_shift, // output right shift
|
||||
q7_t *bufferA, // a buffer for local storage, size = 4*output_size
|
||||
q15_t *Im_out);
|
||||
|
||||
void local_up_sampling_q15_HWC(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
q7_t *bufferA, // a buffer for local storage, NULL by now
|
||||
q15_t *Im_out);
|
||||
|
||||
void local_up_sampling_q15_CHW(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
q7_t *bufferA, // a buffer for local storage, NULL by now
|
||||
q15_t *Im_out);
|
||||
|
||||
void local_convolve_HWC_q15_nonsquare(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const q7_t *wt, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const uint16_t dilation_x, // dilation x
|
||||
const uint16_t dilation_y, // dilation y
|
||||
const q7_t *bias, // bias
|
||||
const nnom_qformat_param_t *bias_shift, // bias shifts
|
||||
const nnom_qformat_param_t *out_shift, // output shift
|
||||
const nnom_qtype_t q_type, // per channel or per tensor
|
||||
q15_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y, // output image dimension y
|
||||
q15_t *bufferA, //buffer space for input
|
||||
q7_t *bufferB //buffer space for output
|
||||
);
|
||||
void local_convolve_CHW_q15_nonsquare(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const q7_t *wt, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const uint16_t dilation_x, // dilation x
|
||||
const uint16_t dilation_y, // dilation y
|
||||
const q7_t *bias, // bias
|
||||
const nnom_qformat_param_t *bias_shift, // bias shifts
|
||||
const nnom_qformat_param_t *out_shift, // output shift
|
||||
const nnom_qtype_t q_type, // per channel or per tensor
|
||||
q15_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y, // output image dimension y
|
||||
q15_t *bufferA, //buffer space for input
|
||||
q7_t *bufferB //buffer space for output
|
||||
);
|
||||
|
||||
void local_conv_trans_HWC_q15_nonsquare(const int8_t * Im_in,
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const q7_t *wt, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const uint16_t dilation_x, // dilation x
|
||||
const uint16_t dilation_y, // dilation y
|
||||
const q7_t *bias, // bias
|
||||
const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y, // output image dimension y
|
||||
q15_t *bufferA, //buffer space for input
|
||||
q7_t *bufferB //buffer space for output
|
||||
);
|
||||
|
||||
void local_depthwise_separable_conv_HWC_q15_nonsquare(const q15_t *Im_in,// input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const q7_t *wt, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const uint16_t dilation_x, // dilation x
|
||||
const uint16_t dilation_y, // dilation y
|
||||
const q7_t *bias, // bias
|
||||
const nnom_qformat_param_t *bias_shift, // bias shifts
|
||||
const nnom_qformat_param_t *out_shift, // output shift
|
||||
const nnom_qtype_t q_type, // per channel or per tensor
|
||||
q15_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y, // output image dimension y
|
||||
q15_t *bufferA, //buffer space for input
|
||||
q7_t *bufferB //buffer space for output
|
||||
);
|
||||
|
||||
void local_depthwise_separable_conv_CHW_q15_nonsquare(const q15_t *Im_in,// input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const q7_t *wt, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const uint16_t dilation_x, // dilation x
|
||||
const uint16_t dilation_y, // dilation y
|
||||
const q7_t *bias, // bias
|
||||
const nnom_qformat_param_t *bias_shift, // bias shifts
|
||||
const nnom_qformat_param_t *out_shift, // output shift
|
||||
const nnom_qtype_t q_type, // per channel or per tensor
|
||||
q15_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y, // output image dimension y
|
||||
q15_t *bufferA, //buffer space for input
|
||||
q7_t *bufferB //buffer space for output
|
||||
);
|
||||
|
||||
void local_zero_padding_HWC_q15(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t padding_top, // padding sizes y
|
||||
const uint16_t padding_bottom, // padding sizes y
|
||||
const uint16_t padding_left, // padding sizes x
|
||||
const uint16_t padding_right, // padding sizes x
|
||||
q15_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y); // output image dimension y
|
||||
|
||||
void local_zero_padding_CHW_q15(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t padding_top, // padding sizes y
|
||||
const uint16_t padding_bottom, // padding sizes y
|
||||
const uint16_t padding_left, // padding sizes x
|
||||
const uint16_t padding_right, // padding sizes x
|
||||
q15_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y); // output image dimension y
|
||||
|
||||
void local_cropping_HWC_q15(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t padding_top, // padding sizes y
|
||||
const uint16_t padding_bottom, // padding sizes y
|
||||
const uint16_t padding_left, // padding sizes x
|
||||
const uint16_t padding_right, // padding sizes x
|
||||
q15_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y); // output image dimension y
|
||||
|
||||
void local_cropping_CHW_q15(const q15_t *Im_in, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t padding_top, // padding sizes y
|
||||
const uint16_t padding_bottom, // padding sizes y
|
||||
const uint16_t padding_left, // padding sizes x
|
||||
const uint16_t padding_right, // padding sizes x
|
||||
q15_t *Im_out, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y); // output image dimension y
|
||||
|
||||
|
||||
void local_dot_q15(const q15_t *pV, // pointer to vector
|
||||
const q15_t *pM, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
const uint16_t num_of_rows, // numCol of A
|
||||
const uint16_t out_shift, // amount of right-shift for output
|
||||
q15_t *pOut); // output operand
|
||||
|
||||
void local_dot_q15_opt(const q15_t * pV,
|
||||
const q15_t * pM,
|
||||
const uint16_t dim_vec,
|
||||
const uint16_t num_of_rows,
|
||||
const uint16_t out_shift,
|
||||
q15_t * pOut);
|
||||
|
||||
// original implementation
// this supports running without a bias; it then behaves like a plain dot product.
// set `bias=NULL` to use it this way
|
||||
void local_fully_connected_mat_q7_vec_q15(const q15_t * pV, // pointer to vector
|
||||
const q7_t * pM, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
const uint16_t num_of_rows, // numCol of A
|
||||
const uint16_t bias_shift, // amount of left-shift for bias
|
||||
const uint16_t out_shift, // amount of right-shift for output
|
||||
const q7_t * bias, // bias
|
||||
q15_t * pOut, // output
|
||||
q15_t * vec_buffer); // not used, but kept so the interface matches the ARM (CMSIS-NN) version
|
||||
|
||||
// works on a reordered weight matrix (the _opt layout)
// supports running without a bias; set bias=NULL to do so
|
||||
void local_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
|
||||
const q7_t * pM,
|
||||
const uint16_t dim_vec,
|
||||
const uint16_t num_of_rows,
|
||||
const uint16_t bias_shift,
|
||||
const uint16_t out_shift,
|
||||
const q7_t * bias,
|
||||
q15_t * pOut,
|
||||
q15_t * vec_buffer);
|
||||
|
||||
// matrix operation Q15
|
||||
void local_multiple_add_q15( q15_t *p_dst,
|
||||
const int16_t out_shift,
|
||||
uint32_t block_size,
|
||||
uint32_t num_block,
|
||||
q15_t **p_src);
|
||||
|
||||
void local_multiple_mult_q15( q15_t *p_dst,
|
||||
const int16_t out_shift,
|
||||
uint32_t block_size,
|
||||
uint32_t num_block,
|
||||
q15_t **p_src);
|
||||
|
||||
void local_multiple_sub_q15( q15_t *p_dst,
|
||||
const int16_t out_shift,
|
||||
uint32_t block_size,
|
||||
uint32_t num_block,
|
||||
q15_t **p_src);
|
||||
|
||||
void local_mult_q15(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, const uint16_t out_shift, uint32_t blockSize);
|
||||
|
||||
// add
|
||||
void local_add_q15(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, const uint16_t out_shift, uint32_t blockSize);
|
||||
|
||||
// sub
|
||||
void local_sub_q15(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, const uint16_t out_shift, uint32_t blockSize);
|
||||
|
||||
// Convert Q7 to Q15
|
||||
void local_q7_to_q15_no_shift(const q7_t *src, q15_t *des, uint32_t size);
|
||||
void local_q7_to_q15(const q7_t *src, q15_t *des, uint32_t size);
|
||||
|
||||
// q15 shift to q7
|
||||
void local_q15_to_q7(const q15_t *src, q7_t *des, uint32_t shift, uint32_t size);
|
||||
|
||||
// y = 1 - x
|
||||
void local_1_minor_z_q15(q15_t *src, q15_t *des, uint16_t dec_bit, uint32_t size);
|
||||
|
||||
void local_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
|
||||
void local_hard_sigmoid_q15(q15_t *data, uint32_t size, int16_t dec_bit);
|
||||
void local_hard_tanh_q15(q15_t *data, uint32_t size, int16_t dec_bit);
|
||||
void local_relu_q15(q15_t *data, uint32_t size);
|
||||
void local_leaky_relu_q15(q15_t *data, q7_t alpha, uint32_t size);
|
||||
void local_adv_relu_q15(q15_t *data, q7_t negative_slope, q15_t max, q15_t threshold, uint32_t size);
|
||||
void local_sigmoid_q15(q15_t * data, uint32_t size, uint16_t int_width);
|
||||
void local_tanh_q15(q15_t * data, uint32_t size, uint16_t int_width);
|
||||
|
||||
|
||||
static const q15_t nnom_sigmoid_table_q15[256] = {
|
||||
0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8,
|
||||
0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, 0x5a57, 0x5bfb,
|
||||
0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f,
|
||||
0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2,
|
||||
0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7,
|
||||
0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f,
|
||||
0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03,
|
||||
0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d,
|
||||
0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81,
|
||||
0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17,
|
||||
0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72,
|
||||
0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa,
|
||||
0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc,
|
||||
0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0,
|
||||
0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed,
|
||||
0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4,
|
||||
0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011,
|
||||
0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c,
|
||||
0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e,
|
||||
0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c,
|
||||
0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d,
|
||||
0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce,
|
||||
0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152,
|
||||
0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a,
|
||||
0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388,
|
||||
0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8,
|
||||
0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a,
|
||||
0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70,
|
||||
0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e,
|
||||
0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0,
|
||||
0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76,
|
||||
0x3053, 0x3238, 0x3424, 0x3615, 0x380b, 0x3a04, 0x3c01, 0x3e00,
|
||||
};
|
||||
|
||||
|
||||
static const q15_t nnom_tanh_table_q15[256] = {
|
||||
0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae,
|
||||
0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, 0x5a1a, 0x5df6,
|
||||
0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254,
|
||||
0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb,
|
||||
0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f,
|
||||
0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48,
|
||||
0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc,
|
||||
0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7,
|
||||
0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7,
|
||||
0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd,
|
||||
0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff,
|
||||
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
|
||||
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
|
||||
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
|
||||
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
|
||||
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
|
||||
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
|
||||
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
|
||||
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
|
||||
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
|
||||
0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001,
|
||||
0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003,
|
||||
0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007,
|
||||
0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013,
|
||||
0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035,
|
||||
0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f,
|
||||
0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183,
|
||||
0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412,
|
||||
0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6,
|
||||
0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50,
|
||||
0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe,
|
||||
0xc4d9, 0xcb52, 0xd221, 0xd941, 0xe0a7, 0xe847, 0xf015, 0xf803,
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __NNOM_LOCAL_H__ */
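For reference, the q15 backend kernels declared above can also be invoked directly, outside the layer API. A minimal sketch (the 10-class buffers below are illustrative assumptions, not part of the header):

#include "nnom_local.h"

static q15_t logits[10];   // hypothetical Q15 logits written by a previous layer
static q15_t probs[10];

void classify_tail(void)
{
    local_relu_q15(logits, 10);           // in-place ReLU over the vector
    local_softmax_q15(logits, 10, probs); // normalised result goes into probs
}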
|
54
components/ai/nnom/inc/nnom_tensor.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-02-05 Jianjia Ma The first version
|
||||
* 2019-02-10 Jianjia Ma Compiler supports dense net connection
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_TENSOR_H__
|
||||
#define __NNOM_TENSOR_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "nnom.h"
|
||||
|
||||
|
||||
void delete_tensor(nnom_tensor_t* t);
|
||||
nnom_tensor_t* new_tensor(nnom_qtype_t type, uint32_t num_dim, uint32_t num_channel);
|
||||
// set tensor by value
|
||||
// for tensor with quantized type NNOM_QTYPE_PER_TENSOR
|
||||
nnom_tensor_t* tensor_set_attr_v(nnom_tensor_t* t,
|
||||
nnom_qformat_param_t dec_bit, nnom_qformat_param_t offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth);
|
||||
nnom_tensor_t* tensor_set_attr(nnom_tensor_t* t,
|
||||
nnom_qformat_param_t*dec_bit, nnom_qformat_param_t *offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth);
|
||||
nnom_tensor_t* tensor_cpy_attr(nnom_tensor_t* des, nnom_tensor_t* src);
|
||||
size_t tensor_get_num_channel(nnom_tensor_t* t);
|
||||
size_t tensor_size(nnom_tensor_t* t);
|
||||
size_t tensor_size_byte(nnom_tensor_t* t);
|
||||
|
||||
// only supports 3d tensors
// change format from HWC to CHW
// arguments: destination tensor, source tensor
|
||||
void tensor_hwc2chw_q7(nnom_tensor_t* des, nnom_tensor_t* src);
|
||||
|
||||
// change format from CHW to HWC
// arguments: destination tensor, source tensor
|
||||
void tensor_chw2hwc_q7(nnom_tensor_t* des, nnom_tensor_t* src);
|
||||
|
||||
// deprecated.
|
||||
void hwc2chw_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out);
|
||||
void chw2hwc_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /*__NNOM_TENSOR_H__ */
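A hedged sketch of how the tensor helpers above fit together; the 28x28x1 shape and Q0.7 format are illustrative assumptions, and the NNoM memory interface is assumed to be initialised already:

void tensor_demo(void)
{
    nnom_shape_data_t dim[3] = {28, 28, 1};                  // H, W, C
    nnom_tensor_t *t = new_tensor(NNOM_QTYPE_PER_TENSOR, 3, 1);
    if (t == NULL)
        return;
    tensor_set_attr_v(t, 7, 0, dim, 3, 8);                   // dec_bit=7, offset=0, 8-bit data
    size_t bytes = tensor_size_byte(t);                      // 28*28*1 bytes for q7 data
    (void)bytes;                                             // a data buffer would be attached here
    delete_tensor(t);
}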
|
91
components/ai/nnom/inc/nnom_utils.h
Normal file
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-02-05 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_UTILS_H__
|
||||
#define __NNOM_UTILS_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
|
||||
typedef struct _nnom_predict_t
|
||||
{
|
||||
uint16_t *confusion_mat; // confusion matrix
uint32_t *top_k; // stores the number of predictions at each rank k, e.g. Top-2 = top_k[0] + top_k[1]
|
||||
nnom_model_t *model; // the model to run
|
||||
int8_t *buf_prediction; // pointer to the output of the softmax layer (normally the end of the classifier).
|
||||
|
||||
// setting
|
||||
uint32_t label_num; // number of classes in the classification
uint32_t top_k_size; // number of top-k ranks to record.
|
||||
|
||||
// running
|
||||
uint32_t predict_count; // how many predictions have been done
|
||||
|
||||
//timing
|
||||
uint32_t t_run_total; // total model running time
uint32_t t_predict_start; // when the prediction test started
uint32_t t_predict_total; // total time of the whole test
|
||||
} nnom_predict_t;
|
||||
|
||||
// create a prediction instance
// inputs: the model, the buffer pointer to the softmax output (temporary; this can be extracted from the model),
// the size of the softmax output (the number of labels),
// and the top-k ranks to record.
|
||||
nnom_predict_t *prediction_create(nnom_model_t *m, int8_t *buf_prediction, size_t label_num, size_t top_k_size); // currently int8_t
|
||||
|
||||
// after new data has been set in the input
// feed the data to the prediction
// pass in the current true label (range: 0 to number of labels - 1)
// (the current input data should be copied by the user manually into the model's input buffer.)
// returns NN_ARGUMENT_ERROR if a parameter is invalid
|
||||
nnom_status_t prediction_run(nnom_predict_t *pre, uint32_t true_label, uint32_t* predict_label, float* prob);
|
||||
|
||||
// to mark prediction finished
|
||||
void prediction_end(nnom_predict_t *pre);
|
||||
|
||||
// free all resources
|
||||
void prediction_delete(nnom_predict_t *pre);
|
||||
|
||||
// print matrix
|
||||
void prediction_matrix(nnom_predict_t *pre);
|
||||
|
||||
// print top-k
|
||||
void prediction_top_k(nnom_predict_t *pre);
|
||||
|
||||
// this function prints a summary
|
||||
void prediction_summary(nnom_predict_t *pre);
|
||||
|
||||
// -------------------------------
|
||||
|
||||
// stand-alone prediction API
// this API tests one set of data and returns the prediction
// returns the predicted label
// returns NN_ARGUMENT_ERROR if a parameter is invalid
|
||||
nnom_status_t nnom_predict(nnom_model_t *m, uint32_t *label, float *prob);
|
||||
|
||||
void model_stat(nnom_model_t *m);
|
||||
|
||||
void model_io_format(nnom_model_t *m);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /*__NNOM_UTILS_H__ */
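A hedged sketch of a typical evaluation loop built on the prediction API above; the model handle, the softmax output buffer, the label count (10), the top-k depth (4) and the way each test frame reaches the model's input buffer are assumptions outside this header:

void evaluate(nnom_model_t *m, int8_t *softmax_out,
              const uint32_t *test_labels, uint32_t test_count)
{
    uint32_t predicted;
    float prob;
    nnom_predict_t *pre = prediction_create(m, softmax_out, 10, 4); // 10 labels, record top-4
    if (pre == NULL)
        return;
    for (uint32_t i = 0; i < test_count; i++)
    {
        // the user copies test frame i into the model's input buffer here
        prediction_run(pre, test_labels[i], &predicted, &prob);
    }
    prediction_end(pre);
    prediction_summary(pre);  // prints timing, top-k accuracy and the confusion matrix
    prediction_delete(pre);
}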
|
61
components/ai/nnom/port/nnom_port.h
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-02-05 Jianjia Ma The first version
|
||||
* 2021-09-08 derekduke add tos support
|
||||
*/
|
||||
|
||||
#ifndef __NNOM_PORT_H__
|
||||
#define __NNOM_PORT_H__
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* use static memory */
|
||||
#define NNOM_USING_STATIC_MEMORY // enable to use the built-in memory allocation on a large static memory block
// the buffer must be set using "nnom_set_static_buf()" before creating a model.
|
||||
|
||||
/* dynamic memory interfaces */
|
||||
/* when libc is not available, you shall implement the below memory interfaces (libc equivalents). */
|
||||
#ifndef NNOM_USING_STATIC_MEMORY
|
||||
//#define nnom_malloc(n) malloc(n)
|
||||
//#define nnom_free(p) free(p)
|
||||
#define nnom_malloc(n) tos_mmheap_alloc(n)
|
||||
#define nnom_free(n) tos_mmheap_free(n)
|
||||
#endif
|
||||
|
||||
/* memory interface */
|
||||
/* when libc is not available, you shall implement your equivalent functions here */
|
||||
#define nnom_memset(p,v,s) memset(p,v,s)
|
||||
#define nnom_memcpy(dst,src,len) memcpy(dst,src,len)
|
||||
|
||||
/* runtime & debug */
|
||||
#define nnom_us_get() 0 // return a microsecond timestamp
|
||||
#define nnom_ms_get() 0 // return a millisecond timestamp
|
||||
#define NNOM_LOG(...) printf(__VA_ARGS__)
|
||||
|
||||
/* NNoM configuration */
|
||||
#define NNOM_BLOCK_NUM (16) // maximum number of memory blocks; increase it if the log requests more.
#define DENSE_WEIGHT_OPT (1) // set to 1 if the fully connected layers use optimized (reordered) weights.
|
||||
|
||||
//#define NNOM_TRUNCATE // disable: backend ops use round to the nearest int (default). enable: floor
|
||||
|
||||
/* Backend format configuration */
|
||||
//#define NNOM_USING_CHW // uncomment to use the CHW format; otherwise the default HWC format is used.
// Note: CHW is incompatible with CMSIS-NN.
// CHW must be used with hardware accelerators such as the KPU in the K210 chip.
|
||||
|
||||
/* Backend selection */
|
||||
//#define NNOM_USING_CMSIS_NN // uncomment to use CMSIS-NN for optimization
|
||||
|
||||
|
||||
#endif
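With NNOM_USING_STATIC_MEMORY enabled, the application has to hand NNoM its memory pool before any model is created. A minimal sketch; the (buffer, size) signature of nnom_set_static_buf() and the 32 KB pool size are assumptions, not taken from this header:

#include "nnom.h"

static uint8_t nnom_pool[32 * 1024];   // static arena for all NNoM allocations

void ai_init(void)
{
    nnom_set_static_buf(nnom_pool, sizeof(nnom_pool)); // must run before model creation
    // model creation (e.g. from the generated weights file) follows here
}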
|
||||
|
||||
|
||||
|
1689
components/ai/nnom/src/backends/nnom_local.c
Normal file
File diff suppressed because it is too large
1602
components/ai/nnom/src/backends/nnom_local_q15.c
Normal file
File diff suppressed because it is too large
1110
components/ai/nnom/src/core/nnom.c
Normal file
File diff suppressed because it is too large
83
components/ai/nnom/src/core/nnom_layers.c
Normal file
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-02-05 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
|
||||
size_t shape_size(nnom_3d_shape_t *s)
|
||||
{
|
||||
if (s == NULL)
|
||||
return 0;
|
||||
return s->h * s->w * s->c;
|
||||
}
|
||||
|
||||
nnom_3d_shape_t shape(size_t h, size_t w, size_t c)
|
||||
{
|
||||
nnom_3d_shape_t s;
|
||||
s.h = h;
|
||||
s.w = w;
|
||||
s.c = c;
|
||||
return s;
|
||||
}
|
||||
nnom_3d_shape_t kernel(size_t h, size_t w)
|
||||
{
|
||||
return shape(h, w, 1);
|
||||
}
|
||||
nnom_3d_shape_t stride(size_t h, size_t w)
|
||||
{
|
||||
return shape(h, w, 1);
|
||||
}
|
||||
nnom_3d_shape_t dilation(size_t h, size_t w)
|
||||
{
|
||||
return shape(h, w, 1);
|
||||
}
|
||||
|
||||
nnom_border_t border(size_t top, size_t bottom, size_t left, size_t right)
|
||||
{
|
||||
nnom_border_t b;
|
||||
b.top = top;
|
||||
b.bottom = bottom;
|
||||
b.left = left;
|
||||
b.right = right;
|
||||
return b;
|
||||
}
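These helpers mainly make layer constructors read naturally. A brief sketch; AvgPool is declared elsewhere in this pack, and PADDING_VALID is assumed to be one of the nnom_padding_t values:

void build_example(void)
{
    // a 2x2 average pooling layer with stride 2 and no padding
    nnom_layer_t *pool = AvgPool(kernel(2, 2), stride(2, 2), PADDING_VALID);
    (void)pool; // would normally be hooked into the model graph
}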
|
||||
|
||||
// this function has to be used while assigning an io to a layer,
// because the io needs to know which layer is its owner.
|
||||
nnom_layer_io_t *io_init(void *owner_layer, nnom_layer_io_t *io)
|
||||
{
|
||||
io->owner = (nnom_layer_t *)owner_layer;
|
||||
return io;
|
||||
}
|
||||
|
||||
// this function adds a new IO to a currently initialised IO
// input: the targeted IO that the new IO will be added to
// output: the new IO
|
||||
nnom_layer_io_t *io_add_aux(nnom_layer_io_t *targeted_io)
|
||||
{
|
||||
nnom_layer_io_t *new_io;
|
||||
// check if the targeted io is inited, and its aux = NULL
|
||||
if (targeted_io == NULL || targeted_io->owner == NULL || targeted_io->aux != NULL)
|
||||
return NULL;
|
||||
// create new io, init it
|
||||
new_io = nnom_mem(sizeof(nnom_layer_io_t));
|
||||
if (new_io == NULL)
|
||||
return NULL;
|
||||
// add to aux
|
||||
targeted_io->aux = new_io;
|
||||
return io_init(targeted_io->owner, new_io);
|
||||
}
|
245
components/ai/nnom/src/core/nnom_tensor.c
Normal file
@@ -0,0 +1,245 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-02-05 Jianjia Ma The first version
|
||||
* 2019-02-14 Jianjia Ma Add layer.free() method.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdarg.h>
|
||||
#include "nnom.h"
|
||||
#include "nnom_tensor.h"
|
||||
|
||||
// tensor size
|
||||
size_t tensor_size(nnom_tensor_t* t)
|
||||
{
|
||||
size_t size = 0;
|
||||
if (t != NULL)
|
||||
{
|
||||
size = t->dim[0];
|
||||
for (int i = 1; i < t->num_dim; i++)
|
||||
size *= t->dim[i];
|
||||
}
|
||||
return size;
|
||||
}
|
||||
size_t tensor_size_byte(nnom_tensor_t* t)
|
||||
{
|
||||
return tensor_size(t)*t->bitwidth/8;
|
||||
}
|
||||
|
||||
|
||||
size_t tensor_get_num_channel(nnom_tensor_t* t)
|
||||
{
|
||||
// this will need to be changed to support batch.
|
||||
#ifdef NNOM_USING_CHW
|
||||
// channel first
|
||||
//return t->dim[0];
|
||||
return t->dim[t->num_dim -1]; // we are always using hwc to describe even our data is in CHW
|
||||
#else
|
||||
// channel last
|
||||
return t->dim[t->num_dim -1];
|
||||
#endif
|
||||
}
|
||||
|
||||
// initialise/create new tensor
|
||||
nnom_tensor_t* new_tensor(nnom_qtype_t type, uint32_t num_dim, uint32_t num_channel)
|
||||
{
|
||||
nnom_tensor_t* t = NULL;
|
||||
uint32_t q_len;
|
||||
if(type == NNOM_QTYPE_PER_AXIS)
|
||||
{
|
||||
q_len = num_channel;
|
||||
}
|
||||
else if (type == NNOM_QTYPE_PER_TENSOR)
|
||||
{
|
||||
q_len = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
NNOM_LOG("ERROR: tensor type not specified\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
t = nnom_mem(nnom_alignto(sizeof(nnom_tensor_t), NNOM_ALIGN)
|
||||
+ num_dim*sizeof(nnom_shape_data_t)
|
||||
+ q_len*sizeof(nnom_qformat_param_t)*2);
|
||||
if(t == NULL)
|
||||
return t;
|
||||
t->dim = (nnom_shape_data_t*)((uint8_t*)t + sizeof(nnom_tensor_t)); // should add alignment
|
||||
t->q_dec = (nnom_qformat_param_t*)((uint8_t*)t->dim + num_dim*sizeof(nnom_shape_data_t));
|
||||
t->q_offset = (nnom_qformat_param_t*)((uint8_t*)t->q_dec + q_len*sizeof(nnom_qformat_param_t));
|
||||
t->num_dim = num_dim;
|
||||
t->qtype = type;
|
||||
|
||||
return t;
|
||||
}
|
||||
|
||||
void delete_tensor(nnom_tensor_t* t)
|
||||
{
|
||||
if (t)
|
||||
nnom_free(t);
|
||||
}
|
||||
|
||||
// set tensor by value
|
||||
// for tensor with quantized type NNOM_QTYPE_PER_TENSOR
|
||||
nnom_tensor_t* tensor_set_attr_v(nnom_tensor_t* t,
|
||||
nnom_qformat_param_t dec_bit, nnom_qformat_param_t offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth)
|
||||
{
|
||||
// copy dim
|
||||
t->num_dim = num_dim;
|
||||
nnom_memcpy(t->dim, dim, sizeof(nnom_shape_data_t) * num_dim);
|
||||
|
||||
// bitwidth
|
||||
t->bitwidth = bitwidth;
|
||||
// copy the offset and q format
|
||||
*(t->q_dec) = dec_bit;
|
||||
*(t->q_offset) = offset;
|
||||
return t;
|
||||
}
|
||||
|
||||
|
||||
// set tensor by pointer
|
||||
// for tensor with quantized type NNOM_QTYPE_PER_AXIS
|
||||
nnom_tensor_t* tensor_set_attr(nnom_tensor_t* t,
|
||||
nnom_qformat_param_t*dec_bit, nnom_qformat_param_t *offset, nnom_shape_data_t* dim, uint32_t num_dim, uint8_t bitwidth)
|
||||
{
|
||||
size_t size;
|
||||
|
||||
// copy dim
|
||||
t->num_dim = num_dim;
|
||||
nnom_memcpy(t->dim, dim, sizeof(nnom_shape_data_t) * num_dim);
|
||||
|
||||
// get the q format data size
|
||||
if(t->qtype == NNOM_QTYPE_PER_AXIS)
|
||||
size = sizeof(nnom_qformat_param_t) * tensor_get_num_channel(t);
|
||||
else
|
||||
size = sizeof(nnom_qformat_param_t);
|
||||
|
||||
// bitwidth
|
||||
t->bitwidth = bitwidth;
|
||||
// copy the offset and q format
|
||||
nnom_memcpy(t->q_dec, dec_bit, size);
|
||||
nnom_memcpy(t->q_offset, offset, size);
|
||||
return t;
|
||||
}
|
||||
|
||||
// this method copies the attributes of a tensor to a new tensor
// before calling it, the src and des tensors must already have their qtype and num_dim set.
// Note: the tensors must have the same length. this method won't copy the data pointer (memory is assigned later, after building)
|
||||
nnom_tensor_t* tensor_cpy_attr(nnom_tensor_t* des, nnom_tensor_t* src)
|
||||
{
|
||||
size_t size;
|
||||
if(src->qtype != des->qtype || src->num_dim != des->num_dim)
|
||||
return NULL;
|
||||
|
||||
if(src->qtype == NNOM_QTYPE_PER_AXIS)
|
||||
size = sizeof(nnom_qformat_param_t) * tensor_get_num_channel(src);
|
||||
else
|
||||
size = sizeof(nnom_qformat_param_t);
|
||||
|
||||
// bit
|
||||
des->bitwidth = src->bitwidth;
|
||||
// copy quantisation parameters
|
||||
nnom_memcpy(des->q_dec, src->q_dec, size);
|
||||
nnom_memcpy(des->q_offset, src->q_offset, size);
|
||||
|
||||
// copy number of dimension
|
||||
des->num_dim = src->num_dim;
|
||||
nnom_memcpy(des->dim, src->dim, src->num_dim * sizeof(nnom_shape_data_t));
|
||||
return des;
|
||||
}
|
||||
|
||||
// change format from HWC to CHW
// arguments: destination tensor, source tensor
|
||||
void tensor_hwc2chw_q7(nnom_tensor_t* des, nnom_tensor_t* src)
|
||||
{
|
||||
q7_t* p_out = des->p_data;
|
||||
q7_t* p_in = src->p_data;
|
||||
|
||||
for (int c = 0; c < src->dim[2]; c++)
|
||||
{
|
||||
for (int h = 0; h < src->dim[0]; h++)
|
||||
{
|
||||
for (int w = 0; w < src->dim[1]; w++)
|
||||
{
|
||||
*p_out = p_in[(h * src->dim[1] + w) * src->dim[2] + c];
|
||||
p_out++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// only support 3d tensor
|
||||
// change format from CHW to HWC
|
||||
void tensor_chw2hwc_q7(nnom_tensor_t* des, nnom_tensor_t* src)
|
||||
{
|
||||
q7_t* p_out = des->p_data;
|
||||
q7_t* p_in = src->p_data;
|
||||
int im_size;
|
||||
int h_step;
|
||||
|
||||
im_size = src->dim[0] * src->dim[1]; // H*W
|
||||
|
||||
for (int h = 0; h < src->dim[0]; h++)
|
||||
{
|
||||
h_step = src->dim[1] * h;
|
||||
for (int w = 0; w < src->dim[1]; w++)
|
||||
{
|
||||
for (int c = 0; c < src->dim[2]; c++)
|
||||
{
|
||||
*p_out = p_in[im_size * c + h_step + w];
|
||||
p_out++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// (deprecated by the tensor_hwc2chw version)
// change format from HWC to CHW
// arguments: the shape of the data, input data, output data
|
||||
void hwc2chw_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out)
|
||||
{
|
||||
for (int c = 0; c < shape.c; c++)
|
||||
{
|
||||
for (int h = 0; h < shape.h; h++)
|
||||
{
|
||||
for (int w = 0; w < shape.w; w++)
|
||||
{
|
||||
*p_out = p_in[(h * shape.w + w) * shape.c + c];
|
||||
p_out++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// (deprecated)
|
||||
// change format from CHW to HWC
|
||||
// the shape of the data, input data, output data
|
||||
void chw2hwc_q7(nnom_3d_shape_t shape, q7_t* p_in, q7_t* p_out)
|
||||
{
|
||||
int im_size = shape.w * shape.h;
|
||||
int h_step;
|
||||
|
||||
for (int h = 0; h < shape.h; h++)
|
||||
{
|
||||
h_step = shape.w * h;
|
||||
for (int w = 0; w < shape.w; w++)
|
||||
{
|
||||
for (int c = 0; c < shape.c; c++)
|
||||
{
|
||||
*p_out = p_in[im_size * c + h_step + w];
|
||||
p_out++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
417
components/ai/nnom/src/core/nnom_utils.c
Normal file
@@ -0,0 +1,417 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-02-05 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
#include "nnom.h"
|
||||
#include "nnom_utils.h"
|
||||
|
||||
static nnom_predict_t *_predict_create_instance(nnom_model_t *m, size_t label_num, size_t top_k_size)
|
||||
{
|
||||
nnom_predict_t *pre;
|
||||
// allocate memory
|
||||
pre = (nnom_predict_t *)nnom_malloc(sizeof(nnom_predict_t));
|
||||
if(pre == NULL)
|
||||
return NULL;
|
||||
pre->top_k = (uint32_t *)nnom_malloc(top_k_size * sizeof(uint32_t));
|
||||
pre->confusion_mat = (uint16_t *)nnom_malloc(label_num * label_num * sizeof(uint16_t));
|
||||
if(pre->top_k == NULL || pre->confusion_mat == NULL)
|
||||
{
|
||||
nnom_free(pre->top_k); nnom_free(pre->confusion_mat); nnom_free(pre);
|
||||
return NULL;
|
||||
}
|
||||
nnom_memset(pre->top_k, 0, top_k_size * sizeof(uint32_t));
|
||||
nnom_memset(pre->confusion_mat, 0, label_num * label_num * sizeof(uint16_t));
|
||||
|
||||
// config
|
||||
pre->label_num = label_num;
|
||||
pre->top_k_size = top_k_size;
|
||||
pre->predict_count = 0;
|
||||
|
||||
// run
|
||||
pre->model = m;
|
||||
pre->t_run_total = 0; // model running time in total
|
||||
pre->t_predict_start = 0; // when the prediction test started
|
||||
pre->t_predict_total = 0; // total time of the whole test
|
||||
|
||||
return pre;
|
||||
}
|
||||
|
||||
static void _predict_delete_instance(nnom_predict_t *pre)
|
||||
{
|
||||
if(pre == NULL)
|
||||
return;
|
||||
nnom_free(pre->top_k);
|
||||
nnom_free(pre->confusion_mat);
|
||||
nnom_free(pre);
|
||||
}
|
||||
|
||||
// create a prediction instance
// inputs: the model, the buffer pointer to the softmax output (temporary; this can be extracted from the model),
// the size of the softmax output (the number of labels),
// and the top-k ranks to record.
|
||||
nnom_predict_t *prediction_create(nnom_model_t *m, int8_t *buf_prediction, size_t label_num, size_t top_k_size)
|
||||
{
|
||||
nnom_predict_t *pre = _predict_create_instance(m, label_num, top_k_size);
|
||||
if (!pre)
|
||||
return NULL;
|
||||
if (!m)
|
||||
{
|
||||
_predict_delete_instance(pre);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// set the output buffer of model to the prediction instance
|
||||
pre->buf_prediction = buf_prediction;
|
||||
|
||||
// mark start time.
|
||||
pre->t_predict_start = nnom_ms_get();
|
||||
|
||||
return pre;
|
||||
}
|
||||
|
||||
// after new data has been set in the input
// feed the data to the prediction
// pass in the current true label (range: 0 to number of labels - 1)
// (the current input data should be copied by the user manually into the model's input buffer.)
|
||||
nnom_status_t prediction_run(nnom_predict_t *pre, uint32_t true_label, uint32_t*predict_label, float* prob)
|
||||
{
|
||||
int max_val;
|
||||
int max_index;
|
||||
uint32_t true_ranking = 0;
|
||||
uint32_t start;
|
||||
uint32_t sum = 0;
|
||||
|
||||
if (!pre)
|
||||
return NN_ARGUMENT_ERROR;
|
||||
|
||||
// now run model
|
||||
start = nnom_ms_get();
|
||||
model_run(pre->model);
|
||||
pre->t_run_total += nnom_ms_get() - start;
|
||||
|
||||
// only draw matrix and top k when number of label > 1
|
||||
if (pre->label_num > 1)
|
||||
{
|
||||
// find how many predictions are larger than the ground truth.
// Ranking rules are the same as TensorFlow; however, on an MCU predictions are more likely to have equal probability since fixed-point is used.
// if the ranking is 1, 2, =2(true), 4, 5, 6, the result is top 3.
// if the ranking is 1, 2(true), =2, 4, 5, 6, the result is top 2.
// find the ranking of the predicted label.
|
||||
for (uint32_t j = 0; j < pre->label_num; j++)
|
||||
{
|
||||
if (j == true_label)
|
||||
continue;
|
||||
if (pre->buf_prediction[true_label] < pre->buf_prediction[j])
|
||||
true_ranking++;
|
||||
// when value[true_label] == value[j], the true label is only ranked after j if j < true_label
|
||||
else if (pre->buf_prediction[true_label] == pre->buf_prediction[j] && j < true_label)
|
||||
true_ranking++;
|
||||
}
|
||||
|
||||
if (true_ranking < pre->top_k_size)
|
||||
pre->top_k[true_ranking]++;
|
||||
|
||||
// Find top 1 and return the current prediction.
|
||||
// If there are several maximum prediction, return the first one.
|
||||
max_val = pre->buf_prediction[0];
|
||||
max_index = 0;
|
||||
for (uint32_t j = 1; j < pre->label_num; j++)
|
||||
{
|
||||
if (pre->buf_prediction[j] > max_val)
|
||||
{
|
||||
max_val = pre->buf_prediction[j];
|
||||
max_index = j;
|
||||
}
|
||||
sum += pre->buf_prediction[j];
|
||||
}
|
||||
// result
|
||||
if (max_val != 0)
|
||||
*prob = (float)max_val / 127.f;
|
||||
else
|
||||
*prob = 0;
|
||||
*predict_label = max_index;
|
||||
|
||||
// fill confusion matrix
|
||||
pre->confusion_mat[true_label * pre->label_num + max_index] += 1;
|
||||
}
|
||||
// only one neuron as output.
|
||||
else
|
||||
{
|
||||
*prob = (float)pre->buf_prediction[0] / 127.f;
|
||||
if (*prob >= 0.5f)
|
||||
*predict_label = 1;
|
||||
else
|
||||
*predict_label = 0;
|
||||
}
|
||||
|
||||
// prediction count
|
||||
pre->predict_count++;
|
||||
|
||||
// return the prediction
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
void prediction_end(nnom_predict_t *pre)
|
||||
{
|
||||
if (!pre)
|
||||
return;
|
||||
pre->t_predict_total = nnom_ms_get() - pre->t_predict_start;
|
||||
}
|
||||
|
||||
void prediction_delete(nnom_predict_t *pre)
|
||||
{
|
||||
_predict_delete_instance(pre);
|
||||
}
|
||||
|
||||
void prediction_matrix(nnom_predict_t *pre)
|
||||
{
|
||||
if (!pre)
|
||||
return;
|
||||
// print titles
|
||||
NNOM_LOG("\nConfusion matrix:\n");
|
||||
NNOM_LOG("predict");
|
||||
for (int i = 0; i < pre->label_num; i++)
|
||||
{
|
||||
NNOM_LOG("%6d", i);
|
||||
}
|
||||
NNOM_LOG("\n");
|
||||
NNOM_LOG("actual\n");
|
||||
// print the matrix
|
||||
for (int i = 0; i < pre->label_num; i++)
|
||||
{
|
||||
uint32_t row_total = 0;
|
||||
|
||||
NNOM_LOG(" %3d | ", i);
|
||||
for (int j = 0; j < pre->label_num; j++)
|
||||
{
|
||||
row_total += pre->confusion_mat[i * pre->label_num + j];
|
||||
NNOM_LOG("%6d", pre->confusion_mat[i * pre->label_num + j]);
|
||||
}
|
||||
NNOM_LOG(" |%4d%%\n", pre->confusion_mat[i * pre->label_num + i] * 100 / row_total);
|
||||
row_total = 0;
|
||||
}
|
||||
NNOM_LOG("\n");
|
||||
}
|
||||
|
||||
// top-k
|
||||
void prediction_top_k(nnom_predict_t *pre)
|
||||
{
|
||||
uint32_t top = 0;
|
||||
if (!pre)
|
||||
return;
|
||||
|
||||
for (int i = 0; i < pre->top_k_size; i++)
|
||||
{
|
||||
top += pre->top_k[i];
|
||||
if (top != pre->predict_count)
|
||||
NNOM_LOG("Top %d Accuracy: %d.%02d%% \n", i + 1, (top * 100) / pre->predict_count,
|
||||
((top * 100 * 100) / pre->predict_count)%100);
|
||||
else
|
||||
NNOM_LOG("Top %d Accuracy: 100%% \n", i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
// this function prints a summary
|
||||
void prediction_summary(nnom_predict_t *pre)
|
||||
{
|
||||
if (!pre)
|
||||
return;
|
||||
// summary
|
||||
NNOM_LOG("\nPrediction summary:\n");
|
||||
NNOM_LOG("Test frames: %d\n", pre->predict_count);
|
||||
NNOM_LOG("Test running time: %d sec\n", pre->t_predict_total / 1000);
|
||||
NNOM_LOG("Model running time: %d ms\n", pre->t_run_total);
|
||||
if(pre->predict_count !=0)
|
||||
NNOM_LOG("Average prediction time: %d us\n", (pre->t_run_total * 1000) / pre->predict_count);
|
||||
if(pre->t_run_total != 0)
|
||||
NNOM_LOG("Average effeciency: %d.%02d ops/us\n", (int)(((uint64_t)pre->model->total_ops * pre->predict_count) / (pre->t_run_total * 1000)),
|
||||
(int)(((uint64_t)pre->model->total_ops * pre->predict_count)*100 / (pre->t_run_total * 1000))%100);
|
||||
if(pre->t_run_total !=0 && pre->predict_count !=0)
|
||||
NNOM_LOG("Average frame rate: %d.%d Hz\n", 1000 / (pre->t_run_total / pre->predict_count),
|
||||
(1000*10 / (pre->t_run_total / pre->predict_count))%10);
|
||||
|
||||
// only valid for multiple labels
|
||||
if(pre->label_num > 1)
|
||||
{
|
||||
// print top-k
|
||||
prediction_top_k(pre);
|
||||
|
||||
// print confusion matrix
|
||||
prediction_matrix(pre);
|
||||
}
|
||||
}
|
||||
|
||||
// stand-alone prediction API
// this API tests one set of data and returns the prediction
|
||||
nnom_status_t nnom_predict(nnom_model_t *m, uint32_t *label, float *prob)
|
||||
{
|
||||
int32_t max_val, max_index, sum;
|
||||
int8_t *output;
|
||||
|
||||
if (!m)
|
||||
return NN_ARGUMENT_ERROR;
|
||||
|
||||
model_run(m);
|
||||
|
||||
// get the output memory
|
||||
output = m->tail->out->tensor->p_data;
|
||||
|
||||
// multiple neuron outputs
|
||||
if (tensor_size(m->tail->out->tensor) > 1)
|
||||
{
|
||||
// Top 1
|
||||
max_val = output[0];
|
||||
max_index = 0;
|
||||
sum = max_val;
|
||||
for (uint32_t i = 1; i < tensor_size(m->tail->out->tensor); i++)
|
||||
{
|
||||
if (output[i] > max_val)
|
||||
{
|
||||
max_val = output[i];
|
||||
max_index = i;
|
||||
}
|
||||
sum += output[i];
|
||||
}
|
||||
// send results
|
||||
*label = max_index;
|
||||
if(max_val !=0)
|
||||
*prob = (float)max_val/127.f;
|
||||
else
|
||||
*prob = 0;
|
||||
}
|
||||
// single neuron output
|
||||
else
|
||||
{
|
||||
*prob = (float)output[0] / 127.f;
|
||||
if (*prob >= 0.5f)
|
||||
*label = 1;
|
||||
else
|
||||
*label = 0;
|
||||
}
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
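A short usage sketch for the stand-alone API above; the caller is assumed to have already filled the model's input buffer:

void run_once(nnom_model_t *m)
{
    uint32_t label;
    float prob;
    if (nnom_predict(m, &label, &prob) == NN_SUCCESS)
        NNOM_LOG("class %u, confidence %d%%\n", (unsigned)label, (int)(prob * 100.f));
}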
|
||||
|
||||
static void layer_stat(nnom_layer_t *layer)
|
||||
{
|
||||
// layer stat
|
||||
if(layer->type != NNOM_RNN)
|
||||
NNOM_LOG("%-10s - ", default_layer_names[layer->type]);
|
||||
else
|
||||
{
|
||||
NNOM_LOG("%-3s/", default_layer_names[layer->type]);
|
||||
NNOM_LOG("%-6s - ", default_cell_names[((nnom_rnn_layer_t*)layer)->cell->type]);
|
||||
}
|
||||
NNOM_LOG(" %8d ", layer->stat.time);
|
||||
|
||||
// MAC operation
|
||||
if(layer->stat.macc == 0)
|
||||
NNOM_LOG(" ");
|
||||
else if (layer->stat.macc < 10000)
|
||||
NNOM_LOG("%7d ", (uint32_t)layer->stat.macc);
|
||||
else if (layer->stat.macc < 1000*1000)
|
||||
NNOM_LOG("%6dk ", (uint32_t)(layer->stat.macc/1000));
|
||||
else if (layer->stat.macc < 1000*1000*1000)
|
||||
NNOM_LOG("%3d.%02dM ", (uint32_t)(layer->stat.macc/(1000*1000)), (uint32_t)(layer->stat.macc%(1000*1000)/(10*1000))); // xxx.xx M
|
||||
else
|
||||
NNOM_LOG("%3d.%02dG ", (uint32_t)(layer->stat.macc/(1000*1000*1000)), (uint32_t)(layer->stat.macc%(1000*1000*1000)/(10*1000*1000))); // xxx.xx G
|
||||
|
||||
// layer efficiency
|
||||
if (layer->stat.macc != 0 && layer->stat.time != 0)
|
||||
NNOM_LOG("%d.%02d\n", (uint32_t)(layer->stat.macc / layer->stat.time), (uint32_t)((layer->stat.macc * 100) / (layer->stat.time) % 100));
|
||||
else
|
||||
NNOM_LOG("\n");
|
||||
}
|
||||
|
||||
void model_stat(nnom_model_t *m)
|
||||
{
|
||||
size_t total_ops = 0;
|
||||
size_t total_time = 0;
|
||||
nnom_layer_t *layer;
|
||||
uint32_t run_num = 0;
|
||||
|
||||
if (!m)
|
||||
return;
|
||||
|
||||
layer = m->head;
|
||||
|
||||
NNOM_LOG("\nPrint running stat..\n");
|
||||
NNOM_LOG("Layer(#) - Time(us) ops(MACs) ops/us \n");
|
||||
NNOM_LOG("--------------------------------------------------------\n");
|
||||
while (layer)
|
||||
{
|
||||
run_num++;
|
||||
NNOM_LOG("#%-3d", run_num);
|
||||
total_ops += layer->stat.macc;
|
||||
total_time += layer->stat.time;
|
||||
layer_stat(layer);
|
||||
if (layer->shortcut == NULL)
|
||||
break;
|
||||
layer = layer->shortcut;
|
||||
}
|
||||
NNOM_LOG("\nSummary:\n");
|
||||
NNOM_LOG("Total ops (MAC): %d", (uint32_t)(total_ops));
|
||||
NNOM_LOG("(%d.%02dM)\n", (uint32_t) (total_ops/(1000*1000)), (uint32_t)(total_ops%(1000*1000)/(10000)));
|
||||
NNOM_LOG("Prediction time :%dus\n", (uint32_t)total_time);
|
||||
if(total_time != 0)
|
||||
NNOM_LOG("Efficiency %d.%02d ops/us\n",
|
||||
(uint32_t)(total_ops / total_time),
|
||||
(uint32_t)((total_ops * 100) / (total_time) % 100));
|
||||
|
||||
NNOM_LOG("Total memory:%d\n", (uint32_t)nnom_mem_stat());
|
||||
}
|
||||
|
||||
void model_io_format(nnom_model_t *m)
|
||||
{
|
||||
nnom_layer_t *layer;
|
||||
uint32_t run_num = 0;
|
||||
|
||||
if (!m)
|
||||
return;
|
||||
|
||||
layer = m->head;
|
||||
|
||||
NNOM_LOG("\nPrint layer input/output..\n");
|
||||
NNOM_LOG("Layer(#) - Input(Qnm) Output(Qnm) Oshape \n");
|
||||
NNOM_LOG("----------------------------------------------------------\n");
|
||||
while (layer)
|
||||
{
|
||||
run_num++;
|
||||
NNOM_LOG("#%-3d", run_num);
|
||||
if(layer->type != NNOM_RNN)
|
||||
NNOM_LOG("%-10s - ", default_layer_names[layer->type]);
|
||||
else
|
||||
{
|
||||
NNOM_LOG("%-3s/", default_layer_names[layer->type]);
|
||||
NNOM_LOG("%-6s - ", default_cell_names[((nnom_rnn_layer_t*)layer)->cell->type]);
|
||||
}
|
||||
NNOM_LOG(" %2d.%2d", 7-layer->in->tensor->q_dec[0], layer->in->tensor->q_dec[0]);
|
||||
NNOM_LOG(" %2d.%2d", 7-layer->out->tensor->q_dec[0], layer->out->tensor->q_dec[0]);
|
||||
NNOM_LOG(" (");
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
if (layer->out->tensor->num_dim > i)
|
||||
NNOM_LOG("%4d,", layer->out->tensor->dim[i]);
|
||||
else
|
||||
NNOM_LOG(" ");
|
||||
}
|
||||
NNOM_LOG(")\n");
|
||||
|
||||
if (layer->shortcut == NULL)
|
||||
break;
|
||||
layer = layer->shortcut;
|
||||
}
|
||||
|
||||
}
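A hedged usage sketch: after at least one inference, the per-layer statistics collected in layer->stat can be printed with the two functions above (the model handle is an assumption):

void report(nnom_model_t *m)
{
    model_run(m);        // populate per-layer timing and MAC counters
    model_stat(m);       // per-layer time, MACs and ops/us
    model_io_format(m);  // per-layer input/output Q formats and output shapes
}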
|
369
components/ai/nnom/src/layers/nnom_activation.c
Normal file
@@ -0,0 +1,369 @@
|
||||
|
||||
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_activation.h"
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
#include "arm_math.h"
|
||||
#include "arm_nnfunctions.h"
|
||||
#endif
|
||||
|
||||
nnom_layer_t *Activation(nnom_activation_t *act)
|
||||
{
|
||||
nnom_activation_layer_t *layer;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate one block of memory for all the sub handles.
|
||||
size_t mem_size = sizeof(nnom_activation_layer_t) + sizeof(nnom_layer_io_t) * 2;
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_activation_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_ACTIVATION;
|
||||
layer->super.run = activation_run;
|
||||
layer->super.build = default_build;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_NULL; // when a layer's io is set to NULL, both will point to same mem.
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
|
||||
// set activation to layer
|
||||
layer->act = act;
|
||||
|
||||
// set free method
|
||||
layer->super.free = activation_free;
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *ReLU(void)
|
||||
{
|
||||
nnom_layer_t *layer = Activation(act_relu());
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// set type in layer parent
|
||||
layer->type = NNOM_RELU;
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *LeakyReLU(float alpha)
|
||||
{
|
||||
nnom_layer_t *layer = Activation(act_leaky_relu(alpha));
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// set type in layer parent
|
||||
layer->type = NNOM_LEAKY_RELU;
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *AdvReLU(float alpha, float max, float threshold)
|
||||
{
|
||||
nnom_layer_t *layer = Activation(act_adv_relu(alpha, max, threshold));
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// set type in layer parent
|
||||
layer->type = NNOM_ADV_RELU;
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *Sigmoid(int32_t dec_bit)
|
||||
{
|
||||
nnom_layer_t *layer = Activation(act_sigmoid(dec_bit));
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// set type in layer parent
|
||||
layer->type = NNOM_SIGMOID;
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *TanH(int32_t dec_bit)
|
||||
{
|
||||
nnom_layer_t *layer = Activation(act_tanh(dec_bit));
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
// set type in layer parent
|
||||
layer->type = NNOM_TANH;
|
||||
return layer;
|
||||
}
|
||||
|
||||
void act_delete(nnom_activation_t* act){
|
||||
nnom_free(act);
|
||||
}
|
||||
|
||||
// the activation layer takes the act instance that was created for it; therefore, the instance must be freed when the activation layer is deleted.
|
||||
// this is the callback in layer->free
|
||||
nnom_status_t activation_free(nnom_layer_t *layer)
|
||||
{
|
||||
if(layer)
|
||||
act_delete(((nnom_activation_layer_t *)layer)->act);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
nnom_status_t activation_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_activation_layer_t *cl = (nnom_activation_layer_t *)layer;
|
||||
return act_tensor_run(cl->act, layer->in->tensor);
|
||||
}
|
||||
|
||||
// porting
|
||||
static nnom_status_t relu_run(nnom_activation_t* act)
|
||||
{
|
||||
if(act->tensor->bitwidth == 16)
|
||||
{
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
arm_relu_q15(act->tensor->p_data, tensor_size(act->tensor));
|
||||
#else
|
||||
local_relu_q15(act->tensor->p_data, tensor_size(act->tensor));
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
arm_relu_q7(act->tensor->p_data, tensor_size(act->tensor));
|
||||
#else
|
||||
local_relu_q7(act->tensor->p_data, tensor_size(act->tensor));
|
||||
#endif
|
||||
}
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
// leaky relu
|
||||
static nnom_status_t leaky_relu_run(nnom_activation_t* act)
|
||||
{
|
||||
nnom_activation_leaky_relu_t* a = (nnom_activation_leaky_relu_t*) act;
|
||||
if(act->tensor->bitwidth == 16)
|
||||
local_leaky_relu_q15(act->tensor->p_data, a->alpha, tensor_size(act->tensor));
|
||||
else
|
||||
local_leaky_relu_q7(act->tensor->p_data, a->alpha, tensor_size(act->tensor));
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
// advance relu
|
||||
static nnom_status_t adv_relu_run(nnom_activation_t* act)
|
||||
{
|
||||
nnom_activation_adv_relu_t* a = (nnom_activation_adv_relu_t*) act;
|
||||
|
||||
// we need to convert float to fixed-point at runtime, where the tensor's q format is known
|
||||
if(act->tensor->bitwidth == 16)
|
||||
{
|
||||
q15_t max = 32767;
|
||||
q15_t threshold = MIN(a->threshold * (1 << (15 - act->tensor->q_dec[0])), 32767);
|
||||
q7_t max_scale = (1 << (15 - act->tensor->q_dec[0]));
|
||||
if(a->max != INFINITY && a->max != 0x7fc00000)
|
||||
if(a->max * max_scale < max)
|
||||
max = a->max * max_scale;
|
||||
local_adv_relu_q15(act->tensor->p_data, a->negative_slope, max, threshold, tensor_size(act->tensor));
|
||||
}
|
||||
// 8bit
|
||||
else
|
||||
{
|
||||
q7_t max = 127;
|
||||
q7_t threshold = MIN(a->threshold * (1 << (7 - act->tensor->q_dec[0])), 127);
|
||||
q7_t max_scale = (1 << (7 - act->tensor->q_dec[0]));
|
||||
if(a->max != INFINITY && a->max != 0x7fc00000) // QNAN 0x7fc00000 also represent infinity in script 0.4.1
|
||||
if(a->max * max_scale < max)
|
||||
max = a->max * max_scale;
|
||||
local_adv_relu_q7(act->tensor->p_data, a->negative_slope, max, threshold, tensor_size(act->tensor));
|
||||
}
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
static nnom_status_t tanh_run(nnom_activation_t* act)
|
||||
{
|
||||
nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act;
|
||||
// 16 bit
|
||||
if(act->tensor->bitwidth == 16)
|
||||
{
|
||||
uint8_t int_bit = 15 - a->dec_bit;
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
arm_nn_activations_direct_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_TANH);
|
||||
#else
|
||||
local_tanh_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit);
|
||||
#endif
|
||||
}
|
||||
else // 8bit
|
||||
{
|
||||
uint8_t int_bit = 7 - a->dec_bit;
|
||||
// arm version cannot handle int_bit > 3
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
if(act->tensor->q_dec[0] <= 3)
|
||||
arm_nn_activations_direct_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_TANH);
|
||||
else
|
||||
#endif
|
||||
local_tanh_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit);
|
||||
}
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
static nnom_status_t sigmoid_run( nnom_activation_t* act)
|
||||
{
|
||||
nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act;
|
||||
// 16 bit
|
||||
if(act->tensor->bitwidth == 16)
|
||||
{
|
||||
uint8_t int_bit = 15 - a->dec_bit;
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
arm_nn_activations_direct_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_SIGMOID);
|
||||
#else
|
||||
local_sigmoid_q15(act->tensor->p_data, tensor_size(act->tensor), int_bit);
|
||||
#endif
|
||||
}
|
||||
else // 8bit
|
||||
{
|
||||
uint8_t int_bit = 7 - a->dec_bit;
|
||||
// arm version cannot handle int_bit > 3
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
if(act->tensor->q_dec[0] <= 3)
|
||||
arm_nn_activations_direct_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit, ARM_TANH);
|
||||
else
|
||||
#endif
|
||||
local_sigmoid_q7(act->tensor->p_data, tensor_size(act->tensor), int_bit);
|
||||
}
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
static nnom_status_t hard_tanh_run( nnom_activation_t* act)
|
||||
{
|
||||
nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act;
|
||||
if(act->tensor->bitwidth == 16)
|
||||
local_hard_tanh_q15(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit + 8); // a->dec is based on 8 bit.
|
||||
else
|
||||
local_hard_tanh_q7(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
static nnom_status_t hard_sigmoid_run( nnom_activation_t* act)
|
||||
{
|
||||
nnom_activation_fixed_q_t * a = (nnom_activation_fixed_q_t*)act;
|
||||
if(act->tensor->bitwidth == 16)
|
||||
local_hard_sigmoid_q15(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit + 8); // a->dec is based on 8 bit.
|
||||
else
|
||||
local_hard_sigmoid_q7(act->tensor->p_data, tensor_size(act->tensor), a->dec_bit);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
//
|
||||
nnom_activation_t* act_relu(void)
|
||||
{
|
||||
nnom_activation_t* act = nnom_mem(sizeof(nnom_activation_t));
|
||||
act->run = relu_run;
|
||||
act->type = ACT_RELU;
|
||||
return act;
|
||||
}
|
||||
|
||||
nnom_activation_t* act_leaky_relu(float alpha)
|
||||
{
|
||||
nnom_activation_leaky_relu_t* act = nnom_mem(sizeof(nnom_activation_leaky_relu_t));
|
||||
act->super.run = leaky_relu_run;
|
||||
act->super.type = ACT_LEAKY_RELU;
|
||||
act->alpha = (q7_t)(alpha*128);
|
||||
return (nnom_activation_t* )act;
|
||||
}
|
||||
|
||||
nnom_activation_t* act_adv_relu(float negative_slope, float max, float threshold)
|
||||
{
|
||||
nnom_activation_adv_relu_t* act = nnom_mem(sizeof(nnom_activation_adv_relu_t));
|
||||
act->super.run = adv_relu_run;
|
||||
act->super.type = ACT_ADV_RELU;
|
||||
act->negative_slope = (q7_t)(negative_slope*128);
|
||||
act->max = max;
|
||||
act->threshold = threshold;
|
||||
return (nnom_activation_t* )act;
|
||||
}
|
||||
|
||||
nnom_activation_t* act_tanh(int32_t dec_bit)
|
||||
{
|
||||
nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t));
|
||||
act->super.run = tanh_run;
|
||||
act->super.type = ACT_TANH;
|
||||
act->dec_bit = dec_bit;
|
||||
return (nnom_activation_t*)act;
|
||||
}
|
||||
|
||||
nnom_activation_t* act_sigmoid(int32_t dec_bit)
|
||||
{
|
||||
nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t));
|
||||
|
||||
act->super.run = sigmoid_run;
|
||||
act->super.type = ACT_SIGMOID;
|
||||
act->dec_bit = dec_bit;
|
||||
return (nnom_activation_t*)act;
|
||||
}
|
||||
|
||||
nnom_activation_t* act_hard_tanh(int32_t dec_bit)
|
||||
{
|
||||
nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t));
|
||||
|
||||
act->super.run = hard_tanh_run;
|
||||
act->super.type = ACT_HARD_TANH;
|
||||
act->dec_bit = dec_bit;
|
||||
return (nnom_activation_t*)act;
|
||||
}
|
||||
|
||||
nnom_activation_t* act_hard_sigmoid(int32_t dec_bit)
|
||||
{
|
||||
nnom_activation_fixed_q_t* act = nnom_mem(sizeof(nnom_activation_fixed_q_t));
|
||||
|
||||
act->super.run = hard_sigmoid_run;
|
||||
act->super.type = ACT_HARD_SIGMOID;
|
||||
act->dec_bit = dec_bit;
|
||||
return (nnom_activation_t*)act;
|
||||
}
|
||||
|
||||
// return the decimal bit if the activation will change the q format of the layer.
|
||||
int32_t act_get_dec_bit(nnom_activation_type_t type, int32_t dec_bit)
|
||||
{
|
||||
switch(type)
|
||||
{
|
||||
case ACT_RELU:
|
||||
case ACT_LEAKY_RELU:
|
||||
case ACT_ADV_RELU:
|
||||
break;
|
||||
case ACT_TANH:
|
||||
case ACT_HARD_TANH:
|
||||
case ACT_SIGMOID:
|
||||
case ACT_HARD_SIGMOID:
|
||||
dec_bit = 7;
|
||||
default:break;
|
||||
}
|
||||
return dec_bit;
|
||||
}
|
||||
|
||||
// a direct api to run activate a tensor
|
||||
nnom_status_t act_tensor_run(nnom_activation_t* act, nnom_tensor_t* tensor)
|
||||
{
|
||||
act->tensor = tensor;
|
||||
return act->run(act);
|
||||
}
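A brief sketch of applying an activation directly to a tensor with the function above, outside any layer; the tensor is assumed to be a valid q7 tensor with data attached:

void relu_tensor(nnom_tensor_t *t)
{
    nnom_activation_t *act = act_relu();  // allocated from the NNoM memory pool
    if (act == NULL)
        return;
    act_tensor_run(act, t);               // applies ReLU in place on t->p_data
    act_delete(act);
}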
|
167
components/ai/nnom/src/layers/nnom_avgpool.c
Normal file
@@ -0,0 +1,167 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_avgpool.h"
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
#include "arm_math.h"
|
||||
#include "arm_nnfunctions.h"
|
||||
#endif
|
||||
|
||||
nnom_layer_t *avgpool_s(const nnom_pool_config_t * config)
|
||||
{
|
||||
nnom_avgpool_layer_t *cl;
|
||||
|
||||
if(config->num_dim == 1)
|
||||
{
|
||||
cl = (nnom_avgpool_layer_t *)AvgPool(kernel(1, config->kernel_size[0]),
|
||||
stride(1, config->stride_size[0]),
|
||||
config->padding_type);
|
||||
}
|
||||
else
|
||||
{
|
||||
cl = (nnom_avgpool_layer_t *)AvgPool(kernel(config->kernel_size[0], config->kernel_size[1]),
|
||||
stride(config->stride_size[0], config->stride_size[1]),
|
||||
config->padding_type);
|
||||
}
|
||||
|
||||
if(cl)
|
||||
{
|
||||
cl->super.config = (void*) config;
|
||||
cl->output_shift = config->output_shift; // no idea if we need it
|
||||
}
|
||||
return (nnom_layer_t *)cl;
|
||||
}
|
||||
|
||||
nnom_layer_t *AvgPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type)
|
||||
{
|
||||
nnom_layer_t *layer = MaxPool(k, s, pad_type);
|
||||
|
||||
if (layer != NULL)
|
||||
{
|
||||
layer->type = NNOM_AVGPOOL;
|
||||
layer->run = avgpool_run;
|
||||
layer->build = avgpool_build;
|
||||
}
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
nnom_status_t avgpool_build(nnom_layer_t *layer)
|
||||
{
|
||||
uint32_t size;
|
||||
// avg pooling shares the same output shape, stride and padding settings.
|
||||
maxpool_build(layer);
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
// however, avg pooling requires a computational buffer.
|
||||
// bufferA size: 2*dim_im_out*ch_im_in
|
||||
size = layer->out->tensor->dim[1] > layer->out->tensor->dim[0] ?
|
||||
layer->out->tensor->dim[1] : layer->out->tensor->dim[0];
|
||||
layer->comp->size = 2 * size * layer->in->tensor->dim[2];
|
||||
#endif
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
nnom_status_t avgpool_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_avgpool_layer_t *cl = (nnom_avgpool_layer_t *)(layer);
|
||||
uint16_t out_x, out_y;
|
||||
// if global pooling
|
||||
if(layer->out->tensor->num_dim == 1)
|
||||
{
|
||||
out_x = 1; out_y = 1;
|
||||
}
|
||||
else // normal pooling.
|
||||
{
|
||||
out_x = layer->out->tensor->dim[1]; //W
|
||||
out_y = layer->out->tensor->dim[0]; //h
|
||||
}
|
||||
|
||||
// 16 bit
|
||||
if(layer->in->tensor->bitwidth == 16)
|
||||
{
|
||||
#ifdef NNOM_USING_CHW
|
||||
local_avepool_q15_CHW(layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h,
|
||||
cl->pad.w, cl->pad.h,
|
||||
cl->stride.w, cl->stride.h,
|
||||
out_x, out_y,
|
||||
cl->output_shift,
|
||||
NULL,
|
||||
layer->out->tensor->p_data);
|
||||
#else
|
||||
local_avepool_q15_HWC(layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h,
|
||||
cl->pad.w, cl->pad.h,
|
||||
cl->stride.w, cl->stride.h,
|
||||
out_x, out_y,
|
||||
cl->output_shift,
|
||||
NULL,
|
||||
layer->out->tensor->p_data);
|
||||
#endif
|
||||
}
|
||||
// 8bit
|
||||
else{
|
||||
#ifdef NNOM_USING_CHW
|
||||
local_avepool_q7_CHW(layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h,
|
||||
cl->pad.w, cl->pad.h,
|
||||
cl->stride.w, cl->stride.h,
|
||||
out_x, out_y,
|
||||
cl->output_shift,
|
||||
NULL,
|
||||
layer->out->tensor->p_data);
|
||||
#else //end of CHW
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
// 2D, square
|
||||
if (layer->in->tensor->dim[1] == layer->in->tensor->dim[0] &&
|
||||
layer->out->tensor->dim[1] == layer->out->tensor->dim[0] &&
|
||||
cl->output_shift == 0)
|
||||
{
|
||||
arm_avepool_q7_HWC(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[2],
|
||||
cl->kernel.w, cl->pad.w, cl->stride.w,
|
||||
layer->out->tensor->dim[1],
|
||||
layer->comp->mem->blk,
|
||||
layer->out->tensor->p_data);
|
||||
}
|
||||
// non-square 2D, or 1D
|
||||
else
|
||||
#endif
|
||||
{
|
||||
// CMSIS-NN does not support non-square pooling, so fall back to the local implementation
|
||||
local_avepool_q7_HWC(layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h,
|
||||
cl->pad.w, cl->pad.h,
|
||||
cl->stride.w, cl->stride.h,
|
||||
out_x, out_y,
|
||||
cl->output_shift,
|
||||
NULL,
|
||||
layer->out->tensor->p_data);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return NN_SUCCESS;
|
||||
}
|
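As a usage sketch (editorial, not from the commit), a plain 2x2 average pooling layer can be built with the high-level constructor above; kernel() and stride() are the shape helpers used throughout this pack.
// Editor's sketch: 2x2 average pooling, stride 2, no padding.
nnom_layer_t *pool = AvgPool(kernel(2, 2), stride(2, 2), PADDING_VALID);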
90
components/ai/nnom/src/layers/nnom_baselayer.c
Normal file
90
components/ai/nnom/src/layers/nnom_baselayer.c
Normal file
@@ -0,0 +1,90 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_baselayer.h"
|
||||
|
||||
// this layer copies the input to the output
|
||||
|
||||
nnom_layer_t *baselayer_s(const nnom_layer_config_t * config)
|
||||
{
|
||||
nnom_layer_t *layer = BaseLayer();
|
||||
if(layer)
|
||||
layer->config = (void*) config;
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *BaseLayer()
|
||||
{
|
||||
nnom_io_layer_t *layer;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate one block of memory for all the sub-handles.
|
||||
size_t mem_size = sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2;
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub-handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_io_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_BASE;
|
||||
layer->super.run = default_run;
|
||||
layer->super.build = default_build;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_NULL;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
// this is called when the output shape is not defined.
|
||||
// this will set the output shape to be the same as the input shape, and it sets only the primary IO
|
||||
// this cannot be used as first layer, of course...
|
||||
nnom_status_t default_build(nnom_layer_t *layer)
|
||||
{
|
||||
// get the last layer's output as input shape
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
// output tensor
|
||||
// 1. allocate a new tensor for output
|
||||
// 2. set the same dim, qfmt to the new tensor.
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR,layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
|
||||
// see if the activation will change the q format
|
||||
if(layer->actail)
|
||||
layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
|
||||
|
||||
// now this build has passed the input tensors (shapes, formats) to the new tensors.
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
// simply copy input to output
|
||||
nnom_status_t default_run(nnom_layer_t *layer)
|
||||
{
|
||||
if(layer->out->type != NNOM_TENSOR_BUF_NULL)
|
||||
{
|
||||
nnom_memcpy(layer->out->tensor->p_data, layer->in->tensor->p_data, tensor_size_byte(layer->in->tensor));
|
||||
}
|
||||
return NN_SUCCESS;
|
||||
}
|
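The single-allocation layout used by BaseLayer() is repeated by most constructors in this pack; as an editorial aid, the pointer arithmetic above amounts to carving one nnom_mem() block into three consecutive regions.
/* Editor's sketch of the layout produced by BaseLayer():
 *
 *   [ nnom_io_layer_t ][ nnom_layer_io_t (in) ][ nnom_layer_io_t (out) ]
 *   ^ layer            ^ layer + sizeof(nnom_io_layer_t)
 *                                               ^ in + sizeof(nnom_layer_io_t)
 */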
223
components/ai/nnom/src/layers/nnom_concat.c
Normal file
223
components/ai/nnom/src/layers/nnom_concat.c
Normal file
@@ -0,0 +1,223 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_concat.h"
|
||||
|
||||
nnom_layer_t *concat_s(const nnom_concat_config_t *config)
|
||||
{
|
||||
nnom_layer_t* layer = Concat(config->axis);
|
||||
if(layer)
|
||||
layer->config = (void*) config;
|
||||
return layer;
|
||||
}
|
||||
|
||||
// concatenation method
|
||||
// concatenation requires more than one input module. aux inputs will be allocated in model.merge()
|
||||
nnom_layer_t *Concat(int8_t axis)
|
||||
{
|
||||
nnom_concat_layer_t *layer;
|
||||
nnom_layer_io_t *in, *out;
|
||||
size_t mem_size;
|
||||
|
||||
// allocate one block of memory for all the sub-handles.
|
||||
mem_size = sizeof(nnom_concat_layer_t) + sizeof(nnom_layer_io_t) * 2;
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub-handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_concat_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_CONCAT;
|
||||
layer->super.run = concat_run;
|
||||
layer->super.build = concat_build;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
|
||||
// axis
|
||||
layer->axis = axis;
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
|
||||
nnom_status_t concat_build(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_concat_layer_t *cl = (nnom_concat_layer_t *)layer;
|
||||
nnom_layer_io_t *in;
|
||||
uint32_t in_num = 0;
|
||||
int32_t num_dim;
|
||||
|
||||
// for each input module, copy the shape from the output of last layer
|
||||
in = layer->in;
|
||||
while (in != NULL)
|
||||
{
|
||||
//get the last layer's output as input shape
|
||||
in->tensor = in->hook.io->tensor;
|
||||
in = in->aux;
|
||||
in_num++;
|
||||
}
|
||||
|
||||
// allocate a new tensor for the output, keeping the same number of dimensions
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
|
||||
// convert the axis.
|
||||
if (cl->axis < 0)
|
||||
cl->axis = (layer->in->tensor->num_dim + cl->axis);
|
||||
else if (cl->axis >0)
|
||||
cl->axis = cl->axis - 1; // Keras axes start from 1; internally we use 0, 1, 2
|
||||
|
||||
// find the concatenated axis
|
||||
num_dim = layer->in->tensor->num_dim;
|
||||
for (uint32_t i = 0; i < num_dim; i ++)
|
||||
{
|
||||
// the concat axis itself: sum this dimension over all inputs
|
||||
if (i == cl->axis)
|
||||
{
|
||||
layer->out->tensor->dim[i] = 0;
|
||||
|
||||
// sum this dimension over all inputs.
|
||||
in = layer->in;
|
||||
while (in != NULL)
|
||||
{
|
||||
layer->out->tensor->dim[i] += in->tensor->dim[i];
|
||||
in = in->aux;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// check the others: all remaining dimensions must match across inputs
|
||||
in = layer->in;
|
||||
while (in != NULL && in->aux != NULL)
|
||||
{
|
||||
if (in->tensor->dim[i] != in->aux->tensor->dim[i])
|
||||
return NN_ARGUMENT_ERROR;
|
||||
in = in->aux;
|
||||
}
|
||||
|
||||
// now set other axis
|
||||
layer->out->tensor->dim[i] = layer->in->tensor->dim[i];
|
||||
}
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
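To make the axis bookkeeping concrete, here is an editorial worked example (shapes are illustrative, not from the commit): two HWC inputs of 4 x 6 x 8 concatenated along the Keras channel axis.
// Editor's sketch mirroring concat_build()/concat_run(); values are illustrative.
int axis = 3 - 1;                       // Keras axis 3 -> internal axis 2 (the `axis - 1` step)
int dim[3] = {4, 6, 8};                 // H, W, C of each input
uint32_t n_block = 1;
for (int i = 0; i < axis; i++)
    n_block *= dim[i];                  // 4 * 6 = 24 copy blocks
uint32_t block_size = 1;                // q7 data: 1 byte per element
for (int j = axis; j < 3; j++)
    block_size *= dim[j];               // 8 bytes per block, per input
// 24 blocks x 8 bytes taken from each input in turn -> output of 4 x 6 x 16.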
|
||||
|
||||
#ifdef NNOM_USING_CHW
|
||||
// axis index converter between HWC and CHW
|
||||
static inline int chw_i(int hwc, int num_dim)
|
||||
{
|
||||
num_dim = num_dim -1;
|
||||
hwc = hwc + 1;
|
||||
if(hwc>num_dim)
|
||||
hwc = 0;
|
||||
return hwc;
|
||||
}
|
||||
static inline int hwc_i(int chw, int num_dim)
|
||||
{
|
||||
num_dim = num_dim -1;
|
||||
chw = chw - 1;
|
||||
if(chw<num_dim)
|
||||
chw = num_dim;
|
||||
return chw;
|
||||
}
|
||||
#endif
|
||||
|
||||
nnom_status_t concat_run(nnom_layer_t *layer)
|
||||
{
|
||||
// by default, the concat layer has multiple (>=2) inputs and 1 output.
|
||||
nnom_concat_layer_t *cl = (nnom_concat_layer_t *)layer;
|
||||
nnom_layer_io_t *in;
|
||||
uint32_t dwidth = layer->in->tensor->bitwidth/8; // data width in byte
|
||||
|
||||
#ifdef NNOM_USING_CHW
|
||||
// Concatenate for CHW
|
||||
uint8_t *pin;
|
||||
uint8_t *pout = layer->out->tensor->p_data;
|
||||
uint32_t block_size;
|
||||
uint32_t n_block;
|
||||
uint8_t num_dim = layer->in->tensor->num_dim;
|
||||
|
||||
// calculate the number of blocks to concatenate (the product of the dimensions before the concat axis)
|
||||
n_block = 1;
|
||||
for(int i= 0; i< chw_i(cl->axis, num_dim); i++)
|
||||
{
|
||||
n_block *= layer->in->tensor->dim[hwc_i(i, num_dim)];
|
||||
}
|
||||
|
||||
// concat all input layers
|
||||
for(int i=0; i<n_block; i++)
|
||||
{
|
||||
in = layer->in;
|
||||
while (in != NULL)
|
||||
{
|
||||
// the block size of concat data in this layer
|
||||
block_size = dwidth;
|
||||
for(int j= num_dim-1; j >= chw_i(cl->axis, num_dim); j--)
|
||||
block_size *= in->tensor->dim[hwc_i(j, num_dim)];
|
||||
// concat
|
||||
pin = (uint8_t *)in->tensor->p_data + i * block_size;
|
||||
nnom_memcpy(pout, pin, block_size);
|
||||
pout += block_size;
|
||||
in = in->aux;
|
||||
}
|
||||
}
|
||||
|
||||
#else // end of CHW concatenation
|
||||
|
||||
// Concatenate for HWC
|
||||
uint8_t* pin;
|
||||
uint8_t* pout = layer->out->tensor->p_data;
|
||||
uint32_t block_size;
|
||||
uint32_t n_block;
|
||||
uint8_t num_dim = layer->in->tensor->num_dim;
|
||||
|
||||
// calculate the number of blocks to concatenate (the product of the dimensions before the concat axis)
|
||||
n_block = 1;
|
||||
for (int i = 0; i < cl->axis; i++)
|
||||
n_block *= layer->in->tensor->dim[i];
|
||||
|
||||
// concat all input layers
|
||||
for (int i = 0; i < n_block; i++)
|
||||
{
|
||||
in = layer->in;
|
||||
while (in != NULL)
|
||||
{
|
||||
// the block size of concat data in this layer
|
||||
block_size = dwidth;
|
||||
for (int j = cl->axis; j < num_dim; j++)
|
||||
block_size *= in->tensor->dim[j];
|
||||
// concat
|
||||
pin = (uint8_t*)in->tensor->p_data + i * block_size;
|
||||
nnom_memcpy(pout, pin, block_size);
|
||||
pout += block_size;
|
||||
in = in->aux;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
434
components/ai/nnom/src/layers/nnom_conv2d.c
Normal file
434
components/ai/nnom/src/layers/nnom_conv2d.c
Normal file
@@ -0,0 +1,434 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_conv2d.h"
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
#include "arm_math.h"
|
||||
#include "arm_nnfunctions.h"
|
||||
#endif
|
||||
|
||||
// a machine friendly api, with suffix _s for structured configuration.
|
||||
nnom_layer_t *conv2d_s(const nnom_conv2d_config_t *config)
|
||||
{
|
||||
nnom_conv2d_layer_t *layer;
|
||||
nnom_buf_t *comp;
|
||||
nnom_layer_io_t *in, *out;
|
||||
size_t mem_size;
|
||||
|
||||
// allocate one block of memory for all the sub-handles and shifts.
|
||||
mem_size = sizeof(nnom_conv2d_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t);
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub-handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_conv2d_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_CONV_2D;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
comp->type = NNOM_TENSOR_BUF_TEMP;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
layer->super.comp = comp;
|
||||
#endif
|
||||
// set run method & output shape
|
||||
layer->super.run = conv2d_run;
|
||||
layer->super.build = conv2d_build;
|
||||
layer->super.free = conv2d_free;
|
||||
|
||||
// save the config
|
||||
layer->super.config = (void*) config;
|
||||
|
||||
// get the private parameters
|
||||
// test: for 1D input, expand h = 1
|
||||
if(config->weight->num_dim == 3)
|
||||
{
|
||||
layer->kernel = kernel(1, config->kernel_size[0]);
|
||||
layer->stride = stride(1, config->stride_size[0]);
|
||||
layer->dilation = dilation(1, config->dilation_size[0]);
|
||||
}
|
||||
else
|
||||
{
|
||||
layer->kernel = kernel(config->kernel_size[0], config->kernel_size[1]);
|
||||
layer->stride = stride(config->stride_size[0], config->stride_size[1]);
|
||||
layer->dilation = dilation(config->dilation_size[0], config->dilation_size[1]);
|
||||
}
|
||||
|
||||
layer->filter_mult = config->filter_size; // for convs, this means filter number
|
||||
layer->padding_type = config->padding_type;
|
||||
|
||||
// get bias and weight tensor, this should be created by script.
|
||||
layer->weight = config->weight;
|
||||
layer->bias = config->bias;
|
||||
|
||||
// get shifts
|
||||
layer->output_rshift = (nnom_qformat_param_t *)config->output_shift;
|
||||
layer->bias_lshift = (nnom_qformat_param_t *)config->bias_shift;
|
||||
|
||||
// padding
|
||||
if (layer->padding_type == PADDING_SAME)
|
||||
{
|
||||
layer->pad.h = layer->dilation.h * (layer->kernel.h - 1) / 2;
|
||||
layer->pad.w = layer->dilation.w * (layer->kernel.w - 1) / 2;
|
||||
layer->pad.c = (1 - 1) / 2;
|
||||
}
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
|
||||
// Conv2D
|
||||
// multiplier of (output/input channel),
|
||||
// shape of kernel, shape of strides, weight struct, bias struct
|
||||
nnom_layer_t *Conv2D(uint32_t filters, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type,
|
||||
const nnom_weight_t *w, const nnom_bias_t *b)
|
||||
{
|
||||
nnom_conv2d_layer_t *layer;
|
||||
nnom_buf_t *comp;
|
||||
nnom_layer_io_t *in, *out;
|
||||
// allocate one block of memory for all the sub-handles.
|
||||
size_t mem_size = sizeof(nnom_conv2d_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t);
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub-handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_conv2d_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_CONV_2D;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
comp->type = NNOM_TENSOR_BUF_TEMP;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
layer->super.comp = comp;
|
||||
#endif
|
||||
// set run method & output shape
|
||||
layer->super.run = conv2d_run;
|
||||
layer->super.build = conv2d_build;
|
||||
|
||||
// get the private parameters
|
||||
layer->kernel = k;
|
||||
layer->stride = s;
|
||||
layer->dilation = d;
|
||||
layer->filter_mult = filters; // for convs, this means filter number
|
||||
layer->padding_type = pad_type;
|
||||
|
||||
// create weight and bias tensor
|
||||
layer->weight = new_tensor(NNOM_QTYPE_PER_TENSOR, 4, filters);
|
||||
layer->bias = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, filters);
|
||||
|
||||
// configure weight tensor manually to support new tensor based backends.
|
||||
// needs to be very careful
|
||||
{
|
||||
// config weight
|
||||
nnom_shape_data_t dim[4] = {k.h, k.w, k.c, filters};
|
||||
*(layer->weight->q_offset) = 0; // we have no support of offset here
|
||||
*(layer->weight->q_dec) = 0; // not using it
|
||||
layer->weight->p_data = (void*)w->p_value;
|
||||
layer->weight->bitwidth = 8;
|
||||
layer->weight->qtype = NNOM_QTYPE_PER_TENSOR;
|
||||
nnom_memcpy(layer->weight->dim, dim, layer->weight->num_dim * sizeof(nnom_shape_data_t));
|
||||
|
||||
// config bias
|
||||
dim[0] = filters;
|
||||
*(layer->bias->q_offset) = 0; // we have no support of offset here
|
||||
*(layer->bias->q_dec) = 0; // not using it
|
||||
layer->bias->p_data = (void*) b->p_value;
|
||||
layer->bias->bitwidth = 8;
|
||||
layer->bias->qtype = NNOM_QTYPE_PER_TENSOR; // this block configures the bias tensor
|
||||
nnom_memcpy(layer->bias->dim, dim, layer->bias->num_dim * sizeof(nnom_shape_data_t));
|
||||
|
||||
// output shift and bias shift
|
||||
layer->output_rshift = (nnom_qformat_param_t *)&w->shift;
|
||||
layer->bias_lshift = (nnom_qformat_param_t *)&b->shift;
|
||||
}
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
// keras's implementation.
|
||||
// source: https://github.com/keras-team/keras/blob/7a39b6c62d43c25472b2c2476bd2a8983ae4f682/keras/utils/conv_utils.py#L85
|
||||
uint32_t conv_output_length(uint32_t input_length, uint32_t filter_size, nnom_padding_t padding, uint32_t stride, uint32_t dilation)
|
||||
{
|
||||
if (input_length == 0)
|
||||
return 0;
|
||||
uint32_t dilated_filter_size = (filter_size - 1) * dilation + 1;
|
||||
uint32_t output_length;
|
||||
if(padding == PADDING_SAME)
|
||||
output_length = input_length;
|
||||
else
|
||||
output_length = input_length - dilated_filter_size + 1;
|
||||
return (output_length + stride - 1) / stride;
|
||||
}
|
||||
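A quick editorial check of conv_output_length() with illustrative numbers (not from the commit): input length 28, kernel 3, stride 2, dilation 1.
// Editor's sketch, not part of the commit.
uint32_t same  = conv_output_length(28, 3, PADDING_SAME,  2, 1); // ceil(28 / 2)          = 14
uint32_t valid = conv_output_length(28, 3, PADDING_VALID, 2, 1); // ceil((28 - 3 + 1) / 2) = 13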
|
||||
nnom_status_t conv2d_build(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer;
|
||||
|
||||
// get the tensor from last layer's output
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
|
||||
// create new tensor for the output
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, cl->filter_mult);
|
||||
// copy then change later.
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
|
||||
// calculate the output tensor q format, only support per tensor quantise now
|
||||
layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0]; // need some modification for 16bit.
|
||||
// see if the activation will change the q format
|
||||
if(layer->actail)
|
||||
layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
|
||||
|
||||
// now we set up the tensor shape, always HWC format
|
||||
layer->out->tensor->dim[0] = conv_output_length(layer->in->tensor->dim[0], cl->kernel.h, cl->padding_type, cl->stride.h, cl->dilation.h);
|
||||
layer->out->tensor->dim[1] = conv_output_length(layer->in->tensor->dim[1], cl->kernel.w, cl->padding_type, cl->stride.w, cl->dilation.w);
|
||||
layer->out->tensor->dim[2] = cl->filter_mult; // output channels = number of filters
|
||||
|
||||
// fill padding
|
||||
if (cl->padding_type == PADDING_SAME)
|
||||
{
|
||||
cl->pad.w = cl->dilation.w * (cl->kernel.w - 1) / 2;
|
||||
cl->pad.h = cl->dilation.h * (cl->kernel.h - 1) / 2;
|
||||
cl->pad.c = 0;
|
||||
}
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
// bufferA size: (1D shape)
|
||||
// 2*ch_im_in*dim_kernel*dim_kernel
|
||||
layer->comp->size = 2 * 2 * layer->in->tensor->dim[2] * cl->kernel.w * cl->kernel.h;
|
||||
#endif
|
||||
// computational cost: K x K x Cin x Hout x Wout x Cout
|
||||
layer->stat.macc = cl->kernel.w * cl->kernel.h * layer->in->tensor->dim[2] * tensor_size(layer->out->tensor);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
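The fixed-point bookkeeping in conv2d_build() is plain shift arithmetic; an editorial numeric sketch follows, with all Q-format values assumed purely for illustration.
// Editor's sketch: assumed Q formats, mirroring the q_dec computation above.
int32_t in_dec = 5, weight_dec = 7, output_rshift = 5;
int32_t out_dec = in_dec + weight_dec - output_rshift;    // = 7
out_dec = act_get_dec_bit(ACT_SIGMOID, out_dec);           // sigmoid clamps dec_bit to 7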
|
||||
nnom_status_t conv2d_free(nnom_layer_t *layer)
|
||||
{
|
||||
// free weight and bias tensor when we are not initialised from structured configuration.
|
||||
if(!layer->config)
|
||||
{
|
||||
nnom_conv2d_layer_t* cl = (nnom_conv2d_layer_t*)layer;
|
||||
delete_tensor(cl->weight);
|
||||
delete_tensor(cl->bias);
|
||||
}
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
nnom_status_t conv2d_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer;
|
||||
|
||||
#ifdef NNOM_USING_CHW
|
||||
// CHW format
|
||||
if(layer->in->tensor->bitwidth == 16)
|
||||
local_convolve_CHW_q15_nonsquare(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data, layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h,
|
||||
cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype,
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL);
|
||||
else
|
||||
local_convolve_CHW_q7_nonsquare(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data, layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h,
|
||||
cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype,
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL);
|
||||
return NN_SUCCESS;
|
||||
#else
|
||||
// HWC format
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
// current cmsis nn does not support dilation
|
||||
if(cl->dilation.w == 1 && cl->dilation.h == 1 && cl->weight->qtype == NNOM_QTYPE_PER_TENSOR)
|
||||
{
|
||||
// 8 bit cmsis nn
|
||||
if(layer->in->tensor->bitwidth == 8)
|
||||
{
|
||||
//RGB
|
||||
// ch_im_in = 3, w = h
|
||||
if (layer->in->tensor->dim[2] == 3 && layer->in->tensor->dim[0] == layer->in->tensor->dim[1])
|
||||
// squared
|
||||
if((cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h))
|
||||
return (nnom_status_t)arm_convolve_HWC_q7_RGB(
|
||||
layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data,
|
||||
layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->pad.w, cl->stride.w,
|
||||
cl->bias->p_data, cl->bias_lshift[0],
|
||||
cl->output_rshift[0], layer->out->tensor->p_data, layer->out->tensor->dim[1],
|
||||
(q15_t *)(layer->comp->mem->blk), NULL);
|
||||
|
||||
// check if can use optimized function
|
||||
// ch_im_in is multiple of 4
|
||||
// ch_im_out is multiple of 2
|
||||
if ((layer->in->tensor->dim[2] % 4 == 0) && (layer->out->tensor->dim[2] % 2 == 0))
|
||||
{
|
||||
// squared
|
||||
if((layer->in->tensor->dim[0] == layer->in->tensor->dim[1])
|
||||
&& (layer->out->tensor->dim[0] == layer->out->tensor->dim[1])
|
||||
&& (cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h))
|
||||
{
|
||||
// 1x1 fast
|
||||
if (cl->kernel.w == 1 && cl->kernel.h == 1 && cl->stride.w == 1 && cl->stride.h == 1 && cl->pad.w == 0 && cl->pad.h == 0)
|
||||
return (nnom_status_t)arm_convolve_1x1_HWC_q7_fast_nonsquare(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data,
|
||||
layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h,
|
||||
cl->bias->p_data, cl->bias_lshift[0],
|
||||
cl->output_rshift[0], layer->out->tensor->p_data, layer->out->tensor->dim[1], layer->out->tensor->dim[0],
|
||||
(q15_t *)(layer->comp->mem->blk), NULL);
|
||||
// opt square shape
|
||||
else
|
||||
return (nnom_status_t)arm_convolve_HWC_q7_fast(
|
||||
layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data,
|
||||
layer->out->tensor->dim[2], cl->kernel.w, cl->pad.w, cl->stride.w,
|
||||
cl->bias->p_data, cl->bias_lshift[0],
|
||||
cl->output_rshift[0], layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], (q15_t *)(layer->comp->mem->blk), NULL);
|
||||
}
|
||||
// opt non-square shape
|
||||
else
|
||||
return (nnom_status_t)arm_convolve_HWC_q7_fast_nonsquare(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data, layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h,
|
||||
cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0],
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL);
|
||||
}
|
||||
// non-optimized
|
||||
else
|
||||
{
|
||||
// non-opt square shape
|
||||
if ((layer->in->tensor->dim[0] == layer->in->tensor->dim[1] &&
|
||||
layer->out->tensor->dim[0] == layer->out->tensor->dim[1]) &&
|
||||
(cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h))
|
||||
return (nnom_status_t)arm_convolve_HWC_q7_basic(
|
||||
layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data,
|
||||
layer->out->tensor->dim[2], cl->kernel.w, cl->pad.w, cl->stride.w,
|
||||
cl->bias->p_data, cl->bias_lshift[0],
|
||||
cl->output_rshift[0], layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], (q15_t *)(layer->comp->mem->blk), NULL);
|
||||
// non-opt non-square shape
|
||||
else
|
||||
return (nnom_status_t)arm_convolve_HWC_q7_basic_nonsquare(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data, layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h,
|
||||
cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0],
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL);
|
||||
} //end of cmsis-nn none-opt
|
||||
} //end of 8 bit cmsis-nn
|
||||
else if (layer->in->tensor->bitwidth == 16)
|
||||
{
|
||||
// fast opt
|
||||
if ((layer->in->tensor->dim[2] % 2 == 0) && (layer->out->tensor->dim[2] % 2 == 0))
|
||||
{
|
||||
if((layer->in->tensor->dim[0] == layer->in->tensor->dim[1])
|
||||
&& (layer->out->tensor->dim[0] == layer->out->tensor->dim[1])
|
||||
&& (cl->kernel.w == cl->kernel.h) && (cl->pad.w == cl->pad.h) && (cl->stride.w == cl->stride.h))
|
||||
return (nnom_status_t)arm_convolve_HWC_q15_fast(
|
||||
layer->in->tensor->p_data, layer->in->tensor->dim[1], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data,
|
||||
layer->out->tensor->dim[2], cl->kernel.w, cl->pad.w, cl->stride.w,
|
||||
cl->bias->p_data, cl->bias_lshift[0],
|
||||
cl->output_rshift[0], layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], (q15_t *)(layer->comp->mem->blk), NULL);
|
||||
else
|
||||
return (nnom_status_t)arm_convolve_HWC_q15_fast_nonsquare(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data, layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h,
|
||||
cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0],
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL);
|
||||
}
|
||||
// non-opt basic
|
||||
else
|
||||
{
|
||||
local_convolve_HWC_q7_nonsquare(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data, layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h,
|
||||
cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype,
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
} // end of 16 bit cmsis-nn
|
||||
} // end of dilation == 1
|
||||
else
|
||||
#endif // NNOM_USING_CMSIS_NN
|
||||
{
|
||||
|
||||
if(layer->in->tensor->bitwidth == 16)
|
||||
local_convolve_HWC_q15_nonsquare(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data, layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h,
|
||||
cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype,
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL);
|
||||
else
|
||||
local_convolve_HWC_q7_nonsquare(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data, layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h,
|
||||
cl->bias->p_data, cl->bias_lshift, cl->output_rshift, cl->weight->qtype,
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
#endif // end of CHW/HWC
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
131
components/ai/nnom/src/layers/nnom_conv2d_trans.c
Normal file
131
components/ai/nnom/src/layers/nnom_conv2d_trans.c
Normal file
@@ -0,0 +1,131 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-05-31 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_conv2d_trans.h"
|
||||
|
||||
nnom_layer_t *conv2d_trans_s(const nnom_conv2d_config_t *config)
|
||||
{
|
||||
nnom_layer_t *layer;
|
||||
layer = conv2d_s(config);
|
||||
if (layer)
|
||||
{
|
||||
layer->type = NNOM_CONV2D_TRANS;
|
||||
layer->run = conv2d_trans_run;
|
||||
layer->build = conv2d_trans_build;
|
||||
}
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *Conv2DTrans(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type,
|
||||
const nnom_weight_t *w, const nnom_bias_t *b)
|
||||
{
|
||||
nnom_layer_t *layer = Conv2D(multiplier, k, s, d, pad_type, w, b);
|
||||
if (layer != NULL)
|
||||
{
|
||||
layer->type = NNOM_CONV2D_TRANS;
|
||||
layer->run = conv2d_trans_run;
|
||||
layer->build = conv2d_trans_build;
|
||||
}
|
||||
return layer;
|
||||
}
|
||||
|
||||
// utils, keras method
|
||||
// https://github.com/keras-team/keras/blob/7a39b6c62d43c25472b2c2476bd2a8983ae4f682/keras/utils/conv_utils.py#L114
|
||||
// https://github.com/tensorflow/tensorflow/blob/2b96f3662bd776e277f86997659e61046b56c315/tensorflow/python/layers/utils.py#L156
|
||||
uint32_t conv_trans_output_length(uint32_t input_length, uint32_t kernel_size, nnom_padding_t padding, uint32_t stride_size, uint32_t dilation)
|
||||
{
|
||||
input_length *= stride_size;
|
||||
if (padding == PADDING_VALID)
|
||||
input_length += MAX(kernel_size - stride_size, 0);
|
||||
return input_length;
|
||||
}
|
||||
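An editorial worked example of conv_trans_output_length() with illustrative numbers (not from the commit): input length 14, kernel 3, stride 2.
// Editor's sketch, not part of the commit.
uint32_t same  = conv_trans_output_length(14, 3, PADDING_SAME,  2, 1); // 14 * 2                 = 28
uint32_t valid = conv_trans_output_length(14, 3, PADDING_VALID, 2, 1); // 14 * 2 + max(3 - 2, 0) = 29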
|
||||
nnom_status_t conv2d_trans_build(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_conv2d_trans_layer_t *cl = (nnom_conv2d_trans_layer_t *)layer;
|
||||
|
||||
// get the tensor from last layer's output
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
|
||||
// create new tensor for the output
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, cl->filter_mult);
|
||||
// copy then change later.
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
|
||||
// calculate the output tensor q format, only support per tensor quantise now
|
||||
layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0];
|
||||
// see if the activation will change the q format
|
||||
if(layer->actail)
|
||||
layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
|
||||
|
||||
// now we set up the tensor shape, always HWC format
|
||||
layer->out->tensor->dim[0] = conv_trans_output_length(layer->in->tensor->dim[0], cl->kernel.h, cl->padding_type, cl->stride.h, cl->dilation.h);
|
||||
layer->out->tensor->dim[1] = conv_trans_output_length(layer->in->tensor->dim[1], cl->kernel.w, cl->padding_type, cl->stride.w, cl->dilation.w);
|
||||
layer->out->tensor->dim[2] = cl->filter_mult; // output channels = number of filters
|
||||
|
||||
// fill the correct padding
|
||||
if(cl->padding_type == PADDING_SAME)
|
||||
{
|
||||
cl->pad.h = (cl->kernel.h - cl->stride.h) / 2; // the padding to the output.
|
||||
cl->pad.w = (cl->kernel.w - cl->stride.w) / 2;
|
||||
// cl->pad.h = (cl->kernel.h - 1)/2; // the padding to the output.
|
||||
// cl->pad.w = (cl->kernel.w - 1)/2;
|
||||
cl->pad.c = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->pad.h = 0;
|
||||
cl->pad.w = 0;
|
||||
cl->pad.c = 0;
|
||||
}
|
||||
|
||||
// bufferA size: (1D shape)
|
||||
// 2*ch_im_in*dim_kernel*dim_kernel
|
||||
//layer->comp->size = 2 * 2 * layer->in->tensor->dim[2] * cl->kernel.w * cl->kernel.h;
|
||||
// computational cost: K x K x Cin x Hout x Wout x Cout
|
||||
layer->stat.macc = cl->kernel.w * cl->kernel.h * layer->in->tensor->dim[2] * tensor_size(layer->out->tensor);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
nnom_status_t conv2d_trans_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_conv2d_trans_layer_t *cl = (nnom_conv2d_trans_layer_t *)layer;
|
||||
|
||||
#ifdef NNOM_USING_CHW
|
||||
// no support for CHW yet
|
||||
return NN_ARGUMENT_ERROR;
|
||||
#else
|
||||
|
||||
//return conv2d_run(layer);
|
||||
|
||||
local_conv_trans_HWC_q7_nonsquare(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data, layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h, cl->pad.w, cl->pad.h, cl->stride.w, cl->stride.h, cl->dilation.w, cl->dilation.h,
|
||||
cl->bias->p_data, cl->bias_lshift[0], cl->output_rshift[0],
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL);
|
||||
return NN_SUCCESS;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
88
components/ai/nnom/src/layers/nnom_cropping.c
Normal file
88
components/ai/nnom/src/layers/nnom_cropping.c
Normal file
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_cropping.h"
|
||||
|
||||
nnom_layer_t * cropping_s(const nnom_cropping_config_t *config)
|
||||
{
|
||||
nnom_layer_t *layer = Cropping(config->pad);
|
||||
if(layer)
|
||||
layer->config = (void*) config;
|
||||
return layer;
|
||||
}
|
||||
|
||||
// Cropping layer
|
||||
nnom_layer_t *Cropping(nnom_border_t pad)
|
||||
{
|
||||
nnom_layer_t *layer;
|
||||
// most settings are the same as zero padding
|
||||
layer = ZeroPadding(pad);
|
||||
|
||||
// now change to cropping
|
||||
layer->type = NNOM_CROPPING;
|
||||
layer->run = cropping_run;
|
||||
layer->build = cropping_build;
|
||||
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_status_t cropping_build(nnom_layer_t* layer)
|
||||
{
|
||||
nnom_cropping_layer_t *cl = (nnom_cropping_layer_t *)layer;
|
||||
|
||||
// get the tensor from last layer's output
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
|
||||
// create new tensor for output
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
|
||||
// copy then change later.
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
|
||||
// output shape
|
||||
if(layer->in->tensor->dim[1] <= (cl->pad.left + cl->pad.right) ||
|
||||
layer->in->tensor->dim[0] <= (cl->pad.top + cl->pad.bottom))
|
||||
return NN_ARGUMENT_ERROR;
|
||||
|
||||
layer->out->tensor->dim[0] = layer->in->tensor->dim[0] - (cl->pad.top + cl->pad.bottom);
|
||||
layer->out->tensor->dim[1] = layer->in->tensor->dim[1] - (cl->pad.left + cl->pad.right);
|
||||
layer->out->tensor->dim[2] = layer->in->tensor->dim[2];
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
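As a usage sketch (editorial; the border values and the struct initialisation style are illustrative assumptions), cropping 2 rows top/bottom and 4 columns left/right from a 32 x 32 x 3 HWC input yields a 28 x 24 x 3 output.
// Editor's sketch, not part of the commit.
nnom_border_t pad = { .top = 2, .bottom = 2, .left = 4, .right = 4 };
nnom_layer_t *crop = Cropping(pad);   // output dims become 28 (H) x 24 (W) x 3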
|
||||
|
||||
nnom_status_t cropping_run(nnom_layer_t * layer)
|
||||
{
|
||||
nnom_cropping_layer_t *cl = (nnom_cropping_layer_t*)layer;
|
||||
|
||||
#ifdef NNOM_USING_CHW
|
||||
local_cropping_CHW_q7(
|
||||
#else
|
||||
local_cropping_HWC_q7(
|
||||
#endif
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->pad.top,
|
||||
cl->pad.bottom,
|
||||
cl->pad.left,
|
||||
cl->pad.right,
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0]);
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
207
components/ai/nnom/src/layers/nnom_dense.c
Normal file
207
components/ai/nnom/src/layers/nnom_dense.c
Normal file
@@ -0,0 +1,207 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_dense.h"
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
#include "arm_math.h"
|
||||
#include "arm_nnfunctions.h"
|
||||
#endif
|
||||
|
||||
nnom_layer_t *dense_s(const nnom_dense_config_t *config)
|
||||
{
|
||||
nnom_dense_layer_t *layer;
|
||||
nnom_buf_t *comp;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate one block of memory for all the sub-handles.
|
||||
size_t mem_size = sizeof(nnom_dense_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t);
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub-handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_dense_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_DENSE;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
comp->type = NNOM_TENSOR_BUF_TEMP;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
layer->super.comp = comp;
|
||||
// set run and outshape methods
|
||||
layer->super.run = dense_run;
|
||||
layer->super.build = dense_build;
|
||||
layer->super.free = dense_free;
|
||||
|
||||
// set parameters
|
||||
layer->output_unit = tensor_get_num_channel(config->weight);
|
||||
layer->bias = config->bias;
|
||||
layer->weight = config->weight;
|
||||
// set shifts
|
||||
layer->output_rshift = (nnom_qformat_param_t *)config->output_shift;
|
||||
layer->bias_lshift = (nnom_qformat_param_t *)config->bias_shift;
|
||||
// set config
|
||||
layer->super.config = (void*) config;
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *Dense(size_t output_unit, const nnom_weight_t *w, const nnom_bias_t *b)
|
||||
{
|
||||
nnom_dense_layer_t *layer;
|
||||
nnom_buf_t *comp;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate one block of memory for all the sub-handles.
|
||||
size_t mem_size = sizeof(nnom_dense_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t);
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub-handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_dense_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_DENSE;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
comp->type = NNOM_TENSOR_BUF_TEMP;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
layer->super.comp = comp;
|
||||
// set run and outshape methods
|
||||
layer->super.run = dense_run;
|
||||
layer->super.build = dense_build;
|
||||
|
||||
// set parameters
|
||||
layer->output_unit = output_unit; // this is no longer needed. the information is contained in the weight tensor.
|
||||
|
||||
layer->weight = new_tensor(NNOM_QTYPE_PER_TENSOR, 2, output_unit);
|
||||
layer->bias = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, output_unit);
|
||||
|
||||
// configure weight tensor manually to support new tensor-based backends.
|
||||
// needs to be very careful
|
||||
{
|
||||
// config weight
|
||||
nnom_shape_data_t dim[2] = {0, output_unit}; // the first dim doesn't matter here; it will be filled in later.
|
||||
*(layer->weight->q_offset) = 0; // we have no support of offset here
|
||||
*(layer->weight->q_dec) = 0; // this is not even correct
|
||||
layer->weight->p_data = (void*)w->p_value;
|
||||
layer->weight->bitwidth = 8;
|
||||
layer->weight->qtype = NNOM_QTYPE_PER_TENSOR;
|
||||
nnom_memcpy(layer->weight->dim, dim, layer->weight->num_dim * sizeof(nnom_shape_data_t));
|
||||
|
||||
// config bias
|
||||
dim[0] = output_unit;
|
||||
*(layer->bias->q_offset) = 0; // we have no support of offset here
|
||||
*(layer->bias->q_dec) = 0; // this is not even correct
|
||||
layer->bias->p_data = (void*)b->p_value;
|
||||
layer->bias->bitwidth = 8;
|
||||
layer->bias->qtype = NNOM_QTYPE_PER_TENSOR; // this block configures the bias tensor
|
||||
nnom_memcpy(layer->bias->dim, dim, layer->bias->num_dim * sizeof(nnom_shape_data_t));
|
||||
}
|
||||
|
||||
// set output shifts
|
||||
layer->output_rshift = (nnom_qformat_param_t *)&w->shift;
|
||||
layer->bias_lshift = (nnom_qformat_param_t *)&b->shift;
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
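A usage sketch of the old-style Dense() constructor (editorial; the weight/bias arrays and shift values below are placeholders, not trained data).
// Editor's sketch, not part of the commit. Placeholder 8-bit parameters
// for a 10-unit fully connected layer fed by 64 inputs.
static int8_t fc_w[10 * 64];
static int8_t fc_b[10];
static const nnom_weight_t fc_w_cfg = { .p_value = fc_w, .shift = 5 };
static const nnom_bias_t   fc_b_cfg = { .p_value = fc_b, .shift = 2 };
nnom_layer_t *fc = Dense(10, &fc_w_cfg, &fc_b_cfg);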
|
||||
nnom_status_t dense_build(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_dense_layer_t *cl = (nnom_dense_layer_t *)layer;
|
||||
|
||||
// get the tensor from last layer's output
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
|
||||
// create new tensor for output
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, tensor_get_num_channel(layer->in->tensor));
|
||||
// setup new tensor
|
||||
nnom_shape_data_t dim[1] = {cl->output_unit};
|
||||
tensor_set_attr(layer->out->tensor, cl->weight->q_dec, cl->weight->q_offset, dim, 1, 8); // test, this is not correct
|
||||
|
||||
// calculate the output tensor q format, only support per tensor quantise now
|
||||
layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0];
|
||||
// see if the activation will change the q format
|
||||
if(layer->actail)
|
||||
layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
|
||||
|
||||
// vec_buffer size: dim_vec (*2, q7->q15) ? I am not sure this is right
|
||||
layer->comp->size = tensor_size(layer->in->tensor)*2;
|
||||
|
||||
// computational cost: In * out
|
||||
layer->stat.macc = tensor_size(layer->in->tensor) * tensor_size(layer->out->tensor);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
nnom_status_t dense_free(nnom_layer_t *layer)
|
||||
{
|
||||
// free weight and bias tensor when we are not initialised from structured configuration.
|
||||
if(!layer->config)
|
||||
{
|
||||
nnom_dense_layer_t* cl = (nnom_dense_layer_t*)layer;
|
||||
delete_tensor(cl->weight);
|
||||
delete_tensor(cl->bias);
|
||||
}
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
nnom_status_t dense_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_status_t result = NN_SUCCESS;
|
||||
nnom_dense_layer_t *cl = (nnom_dense_layer_t *)(layer);
|
||||
nnom_qformat_param_t bias_shift = cl->bias_lshift[0]; // this is not correct but a temporary fix solution for backward compatibility.
|
||||
nnom_qformat_param_t output_shift = cl->output_rshift[0];
|
||||
|
||||
|
||||
#if !(DENSE_WEIGHT_OPT)
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
result = (nnom_status_t)arm_fully_connected_q7(
|
||||
#else
|
||||
local_fully_connected_q7(
|
||||
#endif
|
||||
#else
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
result = (nnom_status_t)arm_fully_connected_q7_opt(
|
||||
#else
|
||||
local_fully_connected_q7_opt(
|
||||
#endif
|
||||
#endif
|
||||
layer->in->tensor->p_data,
|
||||
cl->weight->p_data,
|
||||
tensor_size(layer->in->tensor), layer->out->tensor->dim[0],
|
||||
bias_shift, output_shift,
|
||||
cl->bias->p_data,
|
||||
layer->out->tensor->p_data, (q15_t *)(layer->comp->mem->blk));
|
||||
return result;
|
||||
}
|
||||
|
140
components/ai/nnom/src/layers/nnom_dw_conv2d.c
Normal file
140
components/ai/nnom/src/layers/nnom_dw_conv2d.c
Normal file
@@ -0,0 +1,140 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_dw_conv2d.h"
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
#include "arm_math.h"
|
||||
#include "arm_nnfunctions.h"
|
||||
#endif
|
||||
|
||||
nnom_layer_t *dw_conv2d_s(const nnom_conv2d_config_t *config)
|
||||
{
|
||||
nnom_layer_t *layer;
|
||||
layer = conv2d_s(config);
|
||||
if (layer)
|
||||
{
|
||||
layer->type = NNOM_DW_CONV_2D;
|
||||
layer->run = dw_conv2d_run;
|
||||
layer->build = dw_conv2d_build;
|
||||
}
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *DW_Conv2D(uint32_t multiplier, nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_3d_shape_t d, nnom_padding_t pad_type,
|
||||
const nnom_weight_t *w, const nnom_bias_t *b)
|
||||
{
|
||||
nnom_layer_t *layer = Conv2D(multiplier, k, s, d, pad_type, w, b); // passing the multiplier in.
|
||||
if (layer != NULL)
|
||||
{
|
||||
layer->type = NNOM_DW_CONV_2D;
|
||||
layer->run = dw_conv2d_run;
|
||||
layer->build = dw_conv2d_build;
|
||||
}
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_status_t dw_conv2d_build(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer;
|
||||
|
||||
// get the tensor from last layer's output
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
|
||||
// create new tensor for output
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor) * cl->filter_mult);
|
||||
// copy then change later.
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
|
||||
// calculate the output tensor q format, only support per tensor quantise now
|
||||
layer->out->tensor->q_dec[0] = layer->in->tensor->q_dec[0] + cl->weight->q_dec[0] - cl->output_rshift[0];
|
||||
// see if the activation will change the q format
|
||||
if(layer->actail)
|
||||
layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
|
||||
|
||||
// now we set up the tensor shape, always HWC format
|
||||
layer->out->tensor->dim[0] = conv_output_length(layer->in->tensor->dim[0], cl->kernel.h, cl->padding_type, cl->stride.h, cl->dilation.h);
|
||||
layer->out->tensor->dim[1] = conv_output_length(layer->in->tensor->dim[1], cl->kernel.w, cl->padding_type, cl->stride.w, cl->dilation.w);
|
||||
layer->out->tensor->dim[2] = layer->in->tensor->dim[2] * cl->filter_mult; // output channels = input channels x multiplier
|
||||
|
||||
// fill padding
|
||||
if (cl->padding_type == PADDING_SAME)
|
||||
{
|
||||
cl->pad.w = cl->dilation.w * (cl->kernel.w - 1) / 2;
|
||||
cl->pad.h = cl->dilation.h * (cl->kernel.h - 1) / 2;
|
||||
cl->pad.c = 0;
|
||||
}
|
||||
|
||||
// bufferA size:
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
layer->comp->size = 2 * 2 * (layer->in->tensor->dim[2] / cl->filter_mult) * cl->kernel.w * cl->kernel.h;
|
||||
#endif
|
||||
|
||||
// computational cost: K x K x Cin x Hout x Wout x Multiplier
|
||||
// or : K x K x Cout x Hout x Wout
|
||||
layer->stat.macc = cl->kernel.w * cl->kernel.h * tensor_size(layer->out->tensor);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
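For intuition, an editorial cost sketch mirroring dw_conv2d_build() (all numbers are made up): a 3x3 depthwise kernel over 16 input channels with multiplier 2 and a 28 x 28 output.
// Editor's sketch, not part of the commit.
uint32_t k = 3, in_ch = 16, mult = 2, out_h = 28, out_w = 28;
uint32_t out_ch = in_ch * mult;                    // 32 output channels
uint32_t macc   = k * k * out_h * out_w * out_ch;  // 225792 multiply-accumulates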
|
||||
nnom_status_t dw_conv2d_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_status_t result = NN_SUCCESS;
|
||||
nnom_conv2d_layer_t *cl = (nnom_conv2d_layer_t *)layer;
|
||||
|
||||
#ifndef NNOM_USING_CHW
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
// Current CMSIS-NN does not support dilation
|
||||
if(cl->dilation.w ==1 && cl->dilation.h == 1 && cl->weight->qtype == NNOM_QTYPE_PER_TENSOR && cl->filter_mult == 1)
|
||||
{
|
||||
// CMSIS-NN only supports a multiplier of 1 in depthwise conv
|
||||
if (layer->in->tensor->dim[2] % 2 != 0 || layer->out->tensor->dim[2] % 2)
|
||||
return NN_ARGUMENT_ERROR;
|
||||
result = (nnom_status_t)arm_depthwise_separable_conv_HWC_q7_nonsquare(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data,
|
||||
layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h,
|
||||
cl->pad.w, cl->pad.h,
|
||||
cl->stride.w, cl->stride.h,
|
||||
cl->bias->p_data,
|
||||
cl->bias_lshift[0], cl->output_rshift[0],
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0], (q15_t *)(layer->comp->mem->blk), NULL);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
local_depthwise_separable_conv_HWC_q7_nonsquare(
|
||||
#else
|
||||
local_depthwise_separable_conv_CHW_q7_nonsquare(
|
||||
#endif
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->weight->p_data,
|
||||
layer->out->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h,
|
||||
cl->pad.w, cl->pad.h,
|
||||
cl->stride.w, cl->stride.h,
|
||||
cl->dilation.w, cl->dilation.h,
|
||||
cl->bias->p_data,
|
||||
cl->bias_lshift, cl->output_rshift, cl->weight->qtype,
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0], NULL, NULL);
|
||||
return result;
|
||||
}
|
84
components/ai/nnom/src/layers/nnom_flatten.c
Normal file
84
components/ai/nnom/src/layers/nnom_flatten.c
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_flatten.h"
|
||||
|
||||
nnom_layer_t *flatten_s(const nnom_flatten_config_t *config)
|
||||
{
|
||||
nnom_layer_t *layer = Flatten();
|
||||
if(layer)
|
||||
layer->config = (void*) config;
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *Flatten(void)
|
||||
{
|
||||
nnom_layer_t *layer;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate one block of memory for all the sub-handles.
|
||||
size_t mem_size = sizeof(nnom_layer_t) + sizeof(nnom_layer_io_t) * 2;
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub-handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->type = NNOM_FLATTEN;
|
||||
layer->run = flatten_run;
|
||||
layer->build = flatten_build;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
#ifdef NNOM_USING_CHW
|
||||
out->type = NNOM_TENSOR_BUF_TEMP; // test for CHW format
|
||||
#else
|
||||
out->type = NNOM_TENSOR_BUF_NULL;
|
||||
#endif
|
||||
// put in & out on the layer.
|
||||
layer->in = io_init(layer, in);
|
||||
layer->out = io_init(layer, out);
|
||||
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_status_t flatten_build(nnom_layer_t *layer)
|
||||
{
|
||||
// get the tensor from last layer's output
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
|
||||
// create new tensor for output
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
|
||||
// setup new tensor
|
||||
nnom_shape_data_t dim[1] = {tensor_size(layer->in->tensor)};
|
||||
tensor_set_attr(layer->out->tensor, layer->in->tensor->q_dec, layer->in->tensor->q_offset, dim, 1, 8);
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
nnom_status_t flatten_run(nnom_layer_t *layer)
|
||||
{
|
||||
#ifdef NNOM_USING_CHW
|
||||
// CHW format must be reordered to HWC for the dense layer and all other 1D layers
|
||||
tensor_chw2hwc_q7(layer->out->tensor, layer->in->tensor);
|
||||
#endif
|
||||
return NN_SUCCESS;
|
||||
}
|
145
components/ai/nnom/src/layers/nnom_global_pool.c
Normal file
145
components/ai/nnom/src/layers/nnom_global_pool.c
Normal file
@@ -0,0 +1,145 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_global_pool.h"
|
||||
|
||||
nnom_layer_t * global_maxpool_s(const nnom_global_pool_config_t *config)
|
||||
{
|
||||
nnom_maxpool_layer_t * cl = (nnom_maxpool_layer_t *)GlobalMaxPool();
|
||||
if(cl)
|
||||
{
|
||||
cl->super.config = (void*) config;
|
||||
cl->output_shift = config->output_shift;
|
||||
}
|
||||
return (nnom_layer_t *)cl;
|
||||
}
|
||||
nnom_layer_t * global_avgpool_s(const nnom_global_pool_config_t *config)
|
||||
{
|
||||
nnom_maxpool_layer_t * cl = (nnom_maxpool_layer_t *)GlobalAvgPool();
|
||||
if(cl)
|
||||
{
|
||||
cl->super.config = (void*) config;
|
||||
cl->output_shift = config->output_shift;
|
||||
}
|
||||
return (nnom_layer_t *)cl;
|
||||
}
|
||||
|
||||
nnom_layer_t * global_sumpool_s(const nnom_global_pool_config_t *config)
|
||||
{
|
||||
nnom_maxpool_layer_t * cl = (nnom_maxpool_layer_t *)GlobalSumPool();
|
||||
if(cl)
|
||||
{
|
||||
cl->super.config = (void*) config;
|
||||
cl->output_shift = config->output_shift;
|
||||
}
|
||||
return (nnom_layer_t *)cl;
|
||||
}
|
||||
|
||||
|
||||
nnom_layer_t *GlobalMaxPool(void)
|
||||
{
|
||||
// create the normal pooling layer, the parameters are left empty to fill in later.
|
||||
// parameters will be filled in by global_pool_build()
|
||||
nnom_layer_t *layer = MaxPool(kernel(0, 0), stride(0, 0), PADDING_VALID);
|
||||
|
||||
// change to global max pool
|
||||
if (layer != NULL)
|
||||
{
|
||||
layer->type = NNOM_GLOBAL_MAXPOOL;
|
||||
layer->build = global_pool_build;
|
||||
}
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *GlobalAvgPool(void)
|
||||
{
|
||||
// create the normal pooling layer, the parameters are left empty to fill in later.
|
||||
// parameters will be filled in by global_pool_build() later
|
||||
nnom_layer_t *layer = MaxPool(kernel(0, 0), stride(0, 0), PADDING_VALID);
|
||||
|
||||
// change some parameters to be recognised as avg pooling
|
||||
if (layer != NULL)
|
||||
{
|
||||
layer->type = NNOM_GLOBAL_AVGPOOL;
|
||||
layer->run = avgpool_run; // global and basic pooling share the same runner
|
||||
layer->build = global_pool_build;
|
||||
}
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *GlobalSumPool(void)
|
||||
{
|
||||
// create the normal pooling layer, the parameters are left empty to fill in later.
|
||||
// parameters will be filled in by global_pool_build() later
|
||||
nnom_layer_t *layer = MaxPool(kernel(0, 0), stride(0, 0), PADDING_VALID);
|
||||
|
||||
// change some parameters to be recognised as sum pooling
|
||||
if (layer != NULL)
|
||||
{
|
||||
layer->type = NNOM_GLOBAL_SUMPOOL;
|
||||
layer->run = sumpool_run; // global and basic pooling share the same runner
|
||||
layer->build = global_pool_build;
|
||||
}
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
nnom_status_t global_pool_build(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_maxpool_layer_t *cl = (nnom_maxpool_layer_t *)layer;
|
||||
|
||||
// get the tensor from last layer's output
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
|
||||
// create new tensor for output
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, tensor_get_num_channel(layer->in->tensor));
|
||||
|
||||
nnom_shape_data_t dim[1] = {tensor_get_num_channel(layer->in->tensor)}; // fill the first 2 dim later
|
||||
tensor_set_attr_v(layer->out->tensor, layer->in->tensor->q_dec[0], 0, dim, sizeof(dim)/sizeof(nnom_shape_data_t), 8);
|
||||
|
||||
// see if the activation will change the q format
|
||||
if(layer->actail)
|
||||
layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
|
||||
|
||||
// different from other *_build(), the kernel/stride/padding left empty by the layer API need to be set here,
// because the *_run() methods of global pooling reuse the normal pooling runners.
// fill in the parameters left by the layer APIs (GlobalMaxPool, GlobalAvgPool and GlobalSumPool)
|
||||
cl->kernel = shape(layer->in->tensor->dim[0], layer->in->tensor->dim[1], 1);
|
||||
cl->stride = shape(1, 1, 1);
|
||||
cl->pad = shape(0, 0, 0);
|
||||
cl->padding_type = PADDING_VALID;
|
||||
|
||||
// additionally, avg pooling require computational buffer, which is 2*dim_im_out*ch_im_in
|
||||
if (layer->type == NNOM_AVGPOOL || layer->type == NNOM_GLOBAL_AVGPOOL)
|
||||
{
|
||||
// bufferA size: 2*dim_im_out*ch_im_in
|
||||
layer->comp->size = 2 * layer->out->tensor->dim[0] * layer->in->tensor->dim[2];
|
||||
}
|
||||
|
||||
// additional for sumpool
|
||||
if (layer->type == NNOM_SUMPOOL || layer->type == NNOM_GLOBAL_SUMPOOL)
|
||||
layer->comp->size = 4 * tensor_size(layer->out->tensor);
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
|
338
components/ai/nnom/src/layers/nnom_gru_cell.c
Normal file
@@ -0,0 +1,338 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-08-24 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_gru_cell.h"
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
#include "arm_math.h"
|
||||
#include "arm_nnfunctions.h"
|
||||
#endif
|
||||
|
||||
nnom_rnn_cell_t *gru_cell_s(const nnom_gru_cell_config_t* config)
|
||||
{
|
||||
nnom_gru_cell_t *cell;
|
||||
cell = nnom_mem(sizeof(nnom_gru_cell_t));
|
||||
if (cell == NULL)
|
||||
return NULL;
|
||||
// set methods
|
||||
cell->super.run = gru_cell_run;
|
||||
cell->super.build = gru_cell_build;
|
||||
cell->super.free = gru_cell_free;
|
||||
cell->super.config = (void*) config;
|
||||
cell->super.units = config->units;
|
||||
cell->super.type = NNOM_GRU_CELL;
|
||||
|
||||
// set parameters
|
||||
cell->bias = config->bias;
|
||||
cell->weights = config->weights;
|
||||
cell->recurrent_weights = config->recurrent_weights;
|
||||
|
||||
// q format for intermediate calculation
|
||||
cell->q_dec_h = config->q_dec_h;
|
||||
cell->q_dec_z = config->q_dec_z;
|
||||
|
||||
return (nnom_rnn_cell_t *)cell;
|
||||
}
|
||||
|
||||
nnom_status_t gru_cell_free(nnom_rnn_cell_t* cell)
|
||||
{
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
// the state buffer and computational buffer shape of the cell
|
||||
nnom_status_t gru_cell_build(nnom_rnn_cell_t* cell)
|
||||
{
|
||||
nnom_layer_t *layer = cell->layer;
|
||||
nnom_gru_cell_t *c = (nnom_gru_cell_t *)cell;
|
||||
|
||||
// calculate output shift for the 2 calculation.
|
||||
// hw = the product of hidden x weight, iw = the product of input x weight
|
||||
// due to the addition of them, they must have same q format.
|
||||
// that is -> c->q_dec_z;
|
||||
|
||||
// for the dots in cell: output shift = input_dec + weight_dec - output_dec
|
||||
c->oshift_hw = c->q_dec_h + c->recurrent_weights->q_dec[0] - c->q_dec_z;
|
||||
c->oshift_iw = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->q_dec_z;
|
||||
|
||||
// bias left shift = (input_dec + weight_dec) - bias_dec, aligning the bias with the accumulator's q format
|
||||
c->bias_shift = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->bias->q_dec[0];
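// Worked example with hypothetical q formats (illustrative only): if the input is Q0.7 (dec = 7),
// the kernel weights are Q0.7 (dec = 7) and q_dec_z = 3, then oshift_iw = 7 + 7 - 3 = 11, i.e. the
// 32-bit accumulator is shifted right by 11 bits to land in the z format; a bias stored as Q0.7
// would get bias_shift = 7 + 7 - 7 = 7 bits of left shift to align it with the accumulator.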
|
||||
|
||||
// state size = one timestamp output size.
|
||||
cell->state_size = cell->units * 2; // Q15
|
||||
|
||||
// computational buffer size: 3 scratch buffers of units*3 (q15) plus the q7->q15 input buffer
|
||||
cell->comp_buf_size = cell->units * (3*3) * 2 + cell->feature_size * 2; //q15 + input q7->q15 buffer.
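// Buffer sketch assuming units = 16 and feature_size = 8 (hypothetical numbers): three q15
// scratch buffers of units*3 elements each plus the q7->q15 input buffer give
// 16*9*2 + 8*2 = 304 bytes of computational memory for this cell.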
|
||||
|
||||
// finally, calculate the MAC for info for each timestamp
|
||||
cell->macc = cell->feature_size * cell->units *3 // input: feature * state * 3 gates
|
||||
+ cell->units * cell->units *8 // recurrent, state * output_unit * (5 gate + 3 mult)
|
||||
+ cell->units * (3 + 3 + 5); // 3 gates, 3 mult, 5 addition
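// Example MACC count (hypothetical shapes): with feature_size = 8 and units = 16 this gives
// 8*16*3 + 16*16*8 + 16*(3+3+5) = 384 + 2048 + 176 = 2608 MACs per timestep.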
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
// keras implementation as below.
|
||||
/*
|
||||
def step(cell_inputs, cell_states):
|
||||
"""Step function that will be used by Keras RNN backend."""
|
||||
h_tm1 = cell_states[0]
|
||||
|
||||
# inputs projected by all gate matrices at once
|
||||
matrix_x = K.dot(cell_inputs, kernel)
|
||||
matrix_x = K.bias_add(matrix_x, input_bias)
|
||||
|
||||
x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1)
|
||||
|
||||
# hidden state projected by all gate matrices at once
|
||||
matrix_inner = K.dot(h_tm1, recurrent_kernel)
|
||||
matrix_inner = K.bias_add(matrix_inner, recurrent_bias)
|
||||
|
||||
recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3,
|
||||
axis=1)
|
||||
z = nn.sigmoid(x_z + recurrent_z)
|
||||
r = nn.sigmoid(x_r + recurrent_r)
|
||||
hh = nn.tanh(x_h + r * recurrent_h)
|
||||
|
||||
# previous and candidate state mixed by update gate
|
||||
h = z * h_tm1 + (1 - z) * hh
|
||||
return h, [h]
|
||||
*/
|
||||
|
||||
//
|
||||
nnom_status_t gru_cell_run(nnom_rnn_cell_t* cell)
|
||||
{
|
||||
nnom_layer_t *layer = cell->layer;
|
||||
nnom_gru_cell_t* c = (nnom_gru_cell_t*) cell;
|
||||
int act_int_bit = 7 - c->q_dec_z;
|
||||
// gate data
|
||||
q15_t* x_z, *x_r, *x_h;
|
||||
q15_t* recurrent_z, *recurrent_r, *recurrent_h;
|
||||
q15_t* temp[3];
|
||||
|
||||
// bias
|
||||
q7_t* bias = (q7_t*)c->bias->p_data;
|
||||
q7_t* recurrent_bias = (q7_t*)c->bias->p_data + cell->units*3;
|
||||
|
||||
// state buffer
|
||||
q15_t* h_tm1 = (q15_t*)cell->in_state;
|
||||
q15_t* h_t = (q15_t*)cell->out_state;
|
||||
|
||||
// computing buffer
|
||||
// low |-- buf0 --|-- buf1 --|-- buf2 --|-- input_q15 --|
|
||||
q15_t *buf[3];
|
||||
buf[0] = (q15_t*)layer->comp->mem->blk;
|
||||
buf[1] = (q15_t*)layer->comp->mem->blk + cell->units*3;
|
||||
buf[2] = (q15_t*)layer->comp->mem->blk + cell->units*6;
|
||||
q15_t *in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*9;
|
||||
|
||||
// input q7 cast to q15
|
||||
local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size);
|
||||
|
||||
// matrix_x = K.dot(cell_inputs, kernel) + bias --> buf0
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
arm_fully_connected_mat_q7_vec_q15_opt
|
||||
#else
|
||||
local_fully_connected_mat_q7_vec_q15_opt
|
||||
#endif
|
||||
(in_q15_buf, c->weights->p_data, cell->feature_size,
|
||||
cell->units*3, c->bias_shift + 8, c->oshift_iw, bias, buf[0], NULL);
|
||||
|
||||
// matrix_inner = K.dot(h_tm1, recurrent_kernel) + recurrent_bias -> buf1
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
arm_fully_connected_mat_q7_vec_q15_opt
|
||||
#else
|
||||
local_fully_connected_mat_q7_vec_q15_opt
|
||||
#endif
|
||||
(h_tm1, c->recurrent_weights->p_data, cell->units,
|
||||
cell->units*3, c->bias_shift + 8, c->oshift_hw, recurrent_bias, buf[1], NULL);
|
||||
|
||||
// split to each gate
|
||||
x_z = buf[0];
|
||||
x_r = buf[0] + cell->units;
|
||||
x_h = buf[0] + cell->units*2;
|
||||
recurrent_z = buf[1];
|
||||
recurrent_r = buf[1] + cell->units;
|
||||
recurrent_h = buf[1] + cell->units*2;
|
||||
// buffers
|
||||
temp[0] = buf[2];
|
||||
temp[1] = buf[2] + cell->units;
|
||||
temp[2] = buf[2] + cell->units*2;
|
||||
|
||||
/* z = nn.sigmoid(x_z + recurrent_z) */
|
||||
// 1. z1 = x_z + recurrent_z ---> temp[0]
|
||||
local_add_q15(x_z, recurrent_z, temp[0], 0, cell->units);
|
||||
// 2. z = sigmoid(z1)
|
||||
local_sigmoid_q15(temp[0], cell->units, act_int_bit);
|
||||
|
||||
/* r = nn.sigmoid(x_r + recurrent_r) */
|
||||
// 1. r1 = x_r + recurrent_r ---> temp[1]
|
||||
local_add_q15(x_r, recurrent_r, temp[1], 0, cell->units);
|
||||
// 2. r = sigmoid(r1)
|
||||
local_sigmoid_q15(temp[1], cell->units, act_int_bit);
|
||||
|
||||
/* hh = nn.tanh(x_h + r * recurrent_h) */
|
||||
// 1. hh1 = r * recurrent_h ---> temp[2]
|
||||
local_mult_q15(temp[1], recurrent_h, temp[2], 15, cell->units);
|
||||
// 2. hh2 = x_h + hh1 ---> temp[1]
|
||||
local_add_q15(x_h, temp[2], temp[1], 0, cell->units);
|
||||
// 3. hh = tanh(h2) ---> temp[1]
|
||||
local_tanh_q15(temp[1], cell->units, act_int_bit);
|
||||
|
||||
/* h = z * h_tm1 + (1 - z) * hh */
|
||||
// 1. h1 = z*h_tm1 ---> temp[2]
|
||||
local_mult_q15(temp[0], h_tm1, temp[2], 15, cell->units);
|
||||
// 2. h2 = 1 - z ---> h_t state buff
|
||||
local_1_minor_z_q15(temp[0], h_t, 15, cell->units);
|
||||
// 3. h3 = h2 * hh ---> temp[0]
|
||||
local_mult_q15(h_t, temp[1], temp[0], 15, cell->units);
|
||||
// h = h1 + h3
|
||||
local_add_q15(temp[2], temp[0], h_t, 0, cell->units);
|
||||
|
||||
// finally, copy and convert state to output
|
||||
local_q15_to_q7(h_t, cell->out_data, 8, cell->units);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
// Reserved for debugging: prints the intermediate variables/data.
|
||||
#if 0
|
||||
// delete after testing completed
|
||||
static void print_variable_q15(q15_t *data,char*name, int dec_bit, int size)
|
||||
{
|
||||
printf("\n\n");
|
||||
printf("%s", name);
|
||||
for(int i = 0; i < size; i++)
|
||||
{
|
||||
if(i%8==0)
|
||||
printf("\n");
|
||||
printf("%f\t", (float) data[i] / (1 << dec_bit));
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
//
|
||||
nnom_status_t gru_cell_run(nnom_rnn_cell_t* cell)
|
||||
{
|
||||
nnom_layer_t *layer = cell->layer;
|
||||
nnom_gru_cell_t* c = (nnom_gru_cell_t*) cell;
|
||||
int act_int_bit = 7 - c->q_dec_z;
|
||||
// gate data
|
||||
q15_t* x_z, *x_r, *x_h;
|
||||
q15_t* recurrent_z, *recurrent_r, *recurrent_h;
|
||||
q15_t* temp[3];
|
||||
|
||||
// test
|
||||
//nnom_memset(cell->in_data, 5 * (1<<layer->in->tensor->q_dec[0]), cell->feature_size);
|
||||
|
||||
// bias
|
||||
q7_t* bias = (q7_t*)c->bias->p_data;
|
||||
q7_t* recurrent_bias = (q7_t*)c->bias->p_data + cell->units*3;
|
||||
|
||||
// state buffer
|
||||
q15_t* h_tm1 = (q15_t*)cell->in_state;
|
||||
q15_t* h_t = (q15_t*)cell->out_state;
|
||||
|
||||
// computing buffer
|
||||
// low |-- buf0 --|-- buf1 --|-- buf2 --|-- input_q15 --|
|
||||
q15_t *buf[3];
|
||||
buf[0] = (q15_t*)layer->comp->mem->blk;
|
||||
buf[1] = (q15_t*)layer->comp->mem->blk + cell->units*3;
|
||||
buf[2] = (q15_t*)layer->comp->mem->blk + cell->units*6;
|
||||
q15_t *in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*9;
|
||||
|
||||
// input q7 cast to q15
|
||||
local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size);
|
||||
|
||||
// matrix_x = K.dot(cell_inputs, kernel) + bias --> buf0
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
arm_fully_connected_mat_q7_vec_q15_opt
|
||||
#else
|
||||
local_fully_connected_mat_q7_vec_q15_opt
|
||||
#endif
|
||||
(in_q15_buf, c->weights->p_data, cell->feature_size,
|
||||
cell->units*3, c->bias_shift + 8, c->oshift_iw, bias, buf[0], NULL);
|
||||
|
||||
// matrix_inner = K.dot(h_tm1, recurrent_kernel) + recurrent_bias -> buf1
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
arm_fully_connected_mat_q7_vec_q15_opt
|
||||
#else
|
||||
local_fully_connected_mat_q7_vec_q15_opt
|
||||
#endif
|
||||
(h_tm1, c->recurrent_weights->p_data, cell->units,
|
||||
cell->units*3, c->bias_shift + 8, c->oshift_hw, recurrent_bias, buf[1], NULL);
|
||||
|
||||
print_variable_q15(in_q15_buf, "input", layer->in->tensor->q_dec[0]+8, cell->feature_size);
|
||||
print_variable_q15(buf[0], "matrix_x", c->q_dec_z+8, cell->units*3);
|
||||
print_variable_q15(buf[1], "matrix_recurrent", c->q_dec_z+8, cell->units*3);
|
||||
|
||||
// split to each gate
|
||||
x_z = buf[0];
|
||||
x_r = buf[0] + cell->units;
|
||||
x_h = buf[0] + cell->units*2;
|
||||
recurrent_z = buf[1];
|
||||
recurrent_r = buf[1] + cell->units;
|
||||
recurrent_h = buf[1] + cell->units*2;
|
||||
// buffers
|
||||
temp[0] = buf[2];
|
||||
temp[1] = buf[2] + cell->units;
|
||||
temp[2] = buf[2] + cell->units*2;
|
||||
|
||||
// z = nn.sigmoid(x_z + recurrent_z)
|
||||
// 1. z1 = x_z + recurrent_z ---> temp[0]
|
||||
local_add_q15(x_z, recurrent_z, temp[0], 0, cell->units);
|
||||
// 2. z = sigmoid(z1)
|
||||
local_sigmoid_q15(temp[0], cell->units, act_int_bit);
|
||||
print_variable_q15(temp[0], "z", 15, cell->units);
|
||||
|
||||
// r = nn.sigmoid(x_r + recurrent_r)
|
||||
// 1. r1 = x_r + recurrent_r ---> temp[1]
|
||||
local_add_q15(x_r, recurrent_r, temp[1], 0, cell->units);
|
||||
// 2. r = sigmoid(r1)
|
||||
local_sigmoid_q15(temp[1], cell->units, act_int_bit);
|
||||
print_variable_q15(temp[1], "r", 15, cell->units);
|
||||
|
||||
// hh = nn.tanh(x_h + r * recurrent_h)
|
||||
// 1. hh1 = r * recurrent_h ---> temp[2]
|
||||
local_mult_q15(temp[1], recurrent_h, temp[2], 15, cell->units);
|
||||
// 2. hh2 = x_h + h1 ---> temp[1]
|
||||
local_add_q15(x_h, temp[2], temp[1], 0, cell->units);
|
||||
// 3. hh = tanh(h2) ---> temp[1]
|
||||
local_tanh_q15(temp[1], cell->units, act_int_bit);
|
||||
print_variable_q15(temp[1], "hh", 15, cell->units);
|
||||
|
||||
// h = z * h_tm1 + (1 - z) * hh
|
||||
// 1. h1 = z*h_tm1 ---> temp[2]
|
||||
local_mult_q15(temp[0], h_tm1, temp[2], 15, cell->units);
|
||||
print_variable_q15( temp[2], "h1", 15, cell->units);
|
||||
// 2. h2 = 1 - z ---> h_t state buff
|
||||
local_1_minor_z_q15(temp[0], h_t, 15, cell->units);
|
||||
print_variable_q15( h_t, "h2", 15, cell->units);
|
||||
// 3. h3 = h2 * hh ---> temp[0]
|
||||
local_mult_q15(h_t, temp[1], temp[0], 15, cell->units);
|
||||
print_variable_q15( temp[0], "h3", 15, cell->units);
|
||||
// h = h1 + h3
|
||||
local_add_q15(temp[2], temp[0], h_t, 0, cell->units);
|
||||
print_variable_q15(h_t, "h", 15, cell->units);
|
||||
|
||||
// finally, copy and convert state to output
|
||||
local_q15_to_q7(h_t, cell->out_data, 8, cell->units);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
#endif
|
145
components/ai/nnom/src/layers/nnom_input.c
Normal file
@@ -0,0 +1,145 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_input.h"
|
||||
|
||||
nnom_layer_t *input_s(const nnom_io_config_t* config)
|
||||
{
|
||||
nnom_io_layer_t *layer;
|
||||
nnom_layer_io_t *in, *out;
|
||||
// allocate a block of memory for all the sub handles.
|
||||
layer = nnom_mem(sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_io_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_INPUT;
|
||||
layer->super.run = input_run;
|
||||
layer->super.build = input_build;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_NULL;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
|
||||
/*
|
||||
// some other layers (Conv, pooling) do not support 1D/2D input, so we still expand 1D/2D input to 3 dimensions
|
||||
// test -> native support 1,2,3 D input.
|
||||
layer->super.in->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, config->tensor->num_dim, tensor_get_num_channel(config->tensor));
|
||||
tensor_cpy_attr(layer->super.in->tensor, config->tensor);
|
||||
layer->buf = config->tensor->p_data;
|
||||
layer->dec_bit = config->tensor->q_dec[0];
|
||||
*/
|
||||
|
||||
// set parameters
|
||||
if(config->tensor->num_dim == 1) // test for 1d input, expand h = 1
|
||||
layer->shape = shape(1, 1, config->tensor->dim[0]);
|
||||
else if (config->tensor->num_dim == 2) // test for 2d input, expand h = 1
|
||||
layer->shape = shape(1, config->tensor->dim[0], config->tensor->dim[1]);
|
||||
else
|
||||
layer->shape = shape(config->tensor->dim[0], config->tensor->dim[1], config->tensor->dim[2]);
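// Illustrative shapes (hypothetical): a 1-D input of 128 features becomes (1, 1, 128),
// a 2-D input of (64, 2) becomes (1, 64, 2), and a 3-D input keeps its (h, w, c) shape.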
|
||||
layer->buf = config->tensor->p_data;
|
||||
layer->dec_bit = config->tensor->q_dec[0];
|
||||
|
||||
// experimental: fixed input dim to 3
|
||||
// input normally doesn't have a tensor, so we create one to store the initial data.
|
||||
nnom_shape_data_t dim[3] = {layer->shape.h, layer->shape.w, layer->shape.c};
|
||||
layer->super.in->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 3, tensor_get_num_channel(config->tensor));
|
||||
tensor_set_attr_v(layer->super.in->tensor, layer->dec_bit, 0, dim, sizeof(dim)/sizeof(nnom_shape_data_t), 8);
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *Input(nnom_3d_shape_t input_shape, void *p_buf)
|
||||
{
|
||||
nnom_io_layer_t *layer;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate a block of memory for all the sub handles.
|
||||
layer = nnom_mem(sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_io_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_INPUT;
|
||||
layer->super.run = input_run;
|
||||
layer->super.build = input_build;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_NULL;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
|
||||
// set parameters
|
||||
layer->shape = input_shape;
|
||||
layer->buf = p_buf;
|
||||
layer->dec_bit = 7;
|
||||
|
||||
// experimental: fixed input dim to 3
|
||||
// input normally doesn't have a tensor, so we create one to store the initial data.
|
||||
nnom_shape_data_t dim[3] = { input_shape.h, input_shape.w, input_shape.c };
|
||||
layer->super.in->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 3, input_shape.c);
|
||||
tensor_set_attr_v(layer->super.in->tensor, layer->dec_bit, 0, dim, sizeof(dim)/sizeof(nnom_shape_data_t), 8);
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
nnom_status_t input_build(nnom_layer_t* layer)
|
||||
{
|
||||
// the input tensor of the input layer has been assigned previously
|
||||
|
||||
// output tensor
|
||||
// 1. allocate a new tensor for output
|
||||
// 2. set the same dim, qfmt to the new tensor.
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
|
||||
// now this build has passed the input tensors (shapes, formats) to the new tensors.
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
nnom_status_t input_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_io_layer_t *cl = (nnom_io_layer_t *)layer;
|
||||
#ifdef NNOM_USING_CHW
|
||||
if(layer->in->tensor->num_dim == 3)
|
||||
{
|
||||
nnom_3d_shape_t shape = {layer->in->tensor->dim[0], layer->in->tensor->dim[1], layer->in->tensor->dim[2]};
|
||||
hwc2chw_q7(shape, cl->buf, layer->in->tensor->p_data);
|
||||
}
|
||||
else if (layer->in->tensor->num_dim == 2)
|
||||
{
|
||||
nnom_3d_shape_t shape = {1, layer->in->tensor->dim[0], layer->in->tensor->dim[1]};
|
||||
hwc2chw_q7(shape, cl->buf, layer->in->tensor->p_data);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
nnom_memcpy(layer->in->tensor->p_data, cl->buf, tensor_size(layer->in->tensor));
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
81
components/ai/nnom/src/layers/nnom_lambda.c
Normal file
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_lambda.h"
|
||||
|
||||
nnom_layer_t *lambda_s(const nnom_lambda_config_t * config)
|
||||
{
|
||||
nnom_lambda_layer_t *cl = (nnom_lambda_layer_t *)Lambda(
|
||||
config->run_func_name,
|
||||
config->build_func_name,
|
||||
config->free_func_name,
|
||||
config->parameters);
|
||||
if(cl)
|
||||
cl->super.config = (void*) config;
|
||||
return (nnom_layer_t *)cl;
|
||||
}
|
||||
|
||||
// TODO: extended to multiple IO layer
|
||||
nnom_layer_t *Lambda(nnom_status_t (*run)(nnom_layer_t *),
|
||||
nnom_status_t (*build)(nnom_layer_t *),
|
||||
nnom_status_t (*free)(nnom_layer_t *),
|
||||
void *parameters)
|
||||
{
|
||||
nnom_lambda_layer_t *layer;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate a block of memory for all the sub handles.
|
||||
size_t mem_size = sizeof(nnom_io_layer_t) + sizeof(nnom_layer_io_t) * 2;
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_lambda_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set buf type.
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
|
||||
// set io modules to the layer
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
// layer type
|
||||
layer->super.type = NNOM_LAMBDA;
|
||||
|
||||
// user parameters
|
||||
layer->parameters = parameters;
|
||||
|
||||
// free method
|
||||
layer->super.free = free;
|
||||
|
||||
// build method: passing NULL will use the default build, which sets the output shape to be the same as the input shape.
|
||||
if (build == NULL)
|
||||
layer->super.build = default_build;
|
||||
else
|
||||
layer->super.build = build;
|
||||
// run method. default_run() will simply copy data from input tensor to output tensor.
|
||||
if(run == NULL)
|
||||
layer->super.run = default_run;
|
||||
else
|
||||
layer->super.run = run;
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
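// Usage sketch (hypothetical user code, not part of this file): a lambda whose run method
// halves every q7 value; build is left NULL so default_build() copies the input shape.
//
//   static nnom_status_t my_half_run(nnom_layer_t *layer)
//   {
//       q7_t *in = (q7_t *)layer->in->tensor->p_data;
//       q7_t *out = (q7_t *)layer->out->tensor->p_data;
//       for (size_t i = 0; i < tensor_size(layer->in->tensor); i++)
//           out[i] = in[i] >> 1; // arithmetic shift halves the fixed-point value
//       return NN_SUCCESS;
//   }
//
//   nnom_layer_t *l = Lambda(my_half_run, NULL, NULL, NULL);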
|
334
components/ai/nnom/src/layers/nnom_lstm_cell.c
Normal file
@@ -0,0 +1,334 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-08-24 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_lstm_cell.h"
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
#include "arm_math.h"
|
||||
#include "arm_nnfunctions.h"
|
||||
#endif
|
||||
|
||||
// LSTM RNN
|
||||
// unit = output shape
|
||||
// type of activation
|
||||
nnom_rnn_cell_t *lstm_cell_s(const nnom_lstm_cell_config_t* config)
|
||||
{
|
||||
nnom_lstm_cell_t *cell;
|
||||
cell = nnom_mem(sizeof(nnom_lstm_cell_t));
|
||||
if (cell == NULL)
|
||||
return NULL;
|
||||
// set methods
|
||||
cell->super.run = lstm_cell_q7_q15_run;
|
||||
cell->super.build = lstm_cell_q7_q15_build;
|
||||
cell->super.free = lstm_cell_free;
|
||||
cell->super.config = (void*) config;
|
||||
cell->super.units = config->units;
|
||||
cell->super.type = NNOM_LSTM_CELL;
|
||||
|
||||
// set parameters
|
||||
cell->bias = config->bias;
|
||||
cell->weights = config->weights;
|
||||
cell->recurrent_weights = config->recurrent_weights;
|
||||
|
||||
// q format for intermediate calculation
|
||||
cell->q_dec_c = config->q_dec_c;
|
||||
cell->q_dec_h = config->q_dec_h;
|
||||
cell->q_dec_z = config->q_dec_z;
|
||||
|
||||
return (nnom_rnn_cell_t *)cell;
|
||||
}
|
||||
|
||||
nnom_status_t lstm_cell_free(nnom_rnn_cell_t* cell)
|
||||
{
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
// keras implementation as below.
|
||||
/*
|
||||
def step(cell_inputs, cell_states):
|
||||
"""Step function that will be used by Keras RNN backend."""
|
||||
h_tm1 = cell_states[0] # previous memory state
|
||||
c_tm1 = cell_states[1] # previous carry state
|
||||
|
||||
z = K.dot(cell_inputs, kernel) -> q_iw
|
||||
z += K.dot(h_tm1, recurrent_kernel) -> q_hw
|
||||
z = K.bias_add(z, bias)
|
||||
|
||||
z0, z1, z2, z3 = array_ops.split(z, 4, axis=1)
|
||||
|
||||
i = nn.sigmoid(z0)
|
||||
f = nn.sigmoid(z1)
|
||||
c = f * c_tm1 + i * nn.tanh(z2)
|
||||
o = nn.sigmoid(z3)
|
||||
|
||||
h = o * nn.tanh(c)
|
||||
return h, [h, c]
|
||||
*/
|
||||
|
||||
|
||||
|
||||
// the state buffer and computational buffer shape of the cell
|
||||
nnom_status_t lstm_cell_q7_q15_build(nnom_rnn_cell_t* cell)
|
||||
{
|
||||
nnom_layer_t *layer = cell->layer;
|
||||
nnom_lstm_cell_t *c = (nnom_lstm_cell_t *)cell;
|
||||
|
||||
// calculate output shift for the 2 calculation.
|
||||
// hw = the product of hidden x weight, iw = the product of input x weight
|
||||
// due to the addition of them, they must have same q format.
|
||||
// that is -> c->q_dec_z;
|
||||
|
||||
// for the dots in cell: output shift = input_dec + weight_dec - output_dec
|
||||
c->oshift_hw = c->q_dec_h + c->recurrent_weights->q_dec[0] - c->q_dec_z;
|
||||
c->oshift_iw = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->q_dec_z;
|
||||
|
||||
// bias left shift = (input_dec + weight_dec) - bias_dec, aligning the bias with the accumulator's q format
|
||||
c->bias_shift = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->bias->q_dec[0];
|
||||
|
||||
// state size = one timestamp output size.
|
||||
cell->state_size = cell->units * 2 * 2; // Q15
|
||||
|
||||
// computational buffer size: 3 scratch buffers of units*4 (q15) plus the q7->q15 input buffer
|
||||
cell->comp_buf_size = cell->units * 12 * 2 + cell->feature_size * 2; //q15 + input q7->q15 buffer.
|
||||
|
||||
// finally, calculate the MAC for info (for each timestamp)
|
||||
cell->macc = cell->feature_size * cell->units *4 // input: feature * state * 4 gates
|
||||
+ cell->units * cell->units *4 // recurrent, state
|
||||
+ cell->units *10; // output_unit * (5 gate + 3 mult + 2 addition)
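// Example MACC count (hypothetical shapes): with feature_size = 8 and units = 16 this gives
// 8*16*4 + 16*16*4 + 16*10 = 512 + 1024 + 160 = 1696 MACs per timestep.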
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
// Q7 input output
|
||||
// Q7 weights
|
||||
// Q15 states and intermediate buffer
|
||||
nnom_status_t lstm_cell_q7_q15_run(nnom_rnn_cell_t* cell)
|
||||
{
|
||||
nnom_layer_t *layer = cell->layer;
|
||||
nnom_lstm_cell_t* c = (nnom_lstm_cell_t*) cell;
|
||||
int act_int_bit = 7 - c->q_dec_z;
|
||||
|
||||
// state buffer
|
||||
// low |-- hidden --|-- carry --| high
|
||||
q15_t* h_tm1 = (q15_t*)cell->in_state;
|
||||
q15_t* c_tm1 = (q15_t*)cell->in_state + cell->units;
|
||||
q15_t* o_state[2];
|
||||
o_state[0] = (q15_t*)cell->out_state;
|
||||
o_state[1] = (q15_t*)cell->out_state + cell->units;
|
||||
|
||||
// computing buffer
|
||||
// low |-- buf0 --|-- buf1 --|-- buf2 --|-- input q15 --|
|
||||
q15_t* z[4];
|
||||
q15_t *buf0, *buf1, *buf2, *in_q15_buf;
|
||||
buf0 = (q15_t*)layer->comp->mem->blk;
|
||||
buf1 = (q15_t*)layer->comp->mem->blk + cell->units*4;
|
||||
buf2 = (q15_t*)layer->comp->mem->blk + cell->units*8;
|
||||
in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*12;
|
||||
|
||||
// input q7 -> q15
|
||||
local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size);
|
||||
|
||||
// z1 = K.dot(cell_inputs, kernel) + bias -> buf1
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
arm_fully_connected_mat_q7_vec_q15_opt
|
||||
#else
|
||||
local_fully_connected_mat_q7_vec_q15_opt
|
||||
#endif
|
||||
(in_q15_buf, c->weights->p_data, cell->feature_size, cell->units*4, c->bias_shift + 8, c->oshift_iw, c->bias->p_data, buf1, NULL);
|
||||
|
||||
// z2 = K.dot(h_tm1, recurrent_kernel) -> buf2
|
||||
// --- arm version must use bias, so we have to use local implementation
|
||||
local_fully_connected_mat_q7_vec_q15_opt(h_tm1, c->recurrent_weights->p_data,
|
||||
cell->units, cell->units*4, 0, c->oshift_hw, NULL, buf2, NULL);
|
||||
|
||||
// z = z1 + z2 -> buf0
|
||||
local_add_q15(buf1, buf2, buf0, 0, cell->units*4);
|
||||
|
||||
// split the data to each gate
|
||||
z[0] = buf0;
|
||||
z[1] = buf0 + cell->units;
|
||||
z[2] = buf0 + cell->units*2;
|
||||
z[3] = buf0 + cell->units*3;
|
||||
|
||||
// i = nn.sigmoid(z0)
|
||||
local_sigmoid_q15(z[0], cell->units, act_int_bit);
|
||||
// f = nn.sigmoid(z1)
|
||||
local_sigmoid_q15(z[1], cell->units, act_int_bit);
|
||||
// o = nn.sigmoid(z3)
|
||||
local_sigmoid_q15(z[3], cell->units, act_int_bit);
|
||||
|
||||
/* c = f * c_tm1 + i * nn.tanh(z2) for the step 1-3. */
|
||||
// 1. i * tanh(z2) -> buf1
|
||||
local_tanh_q15(z[2], cell->units, act_int_bit);
|
||||
local_mult_q15(z[0], z[2], buf1, 30 - (c->q_dec_c+8), cell->units);
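// Shift sketch (hypothetical q format): the product of two Q0.15 values is Q0.30; shifting right
// by 30 - (q_dec_c + 8) keeps q_dec_c + 8 fractional bits, i.e. the carry format widened to 16 bit.
// With q_dec_c = 4 that is a right shift of 30 - 12 = 18 bits.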
|
||||
// 2. f * c_tm1 -> o_state[0]
|
||||
local_mult_q15(z[1], c_tm1, o_state[0], 15, cell->units);
|
||||
// 3. c = i*tanh + f*c_tm1 -> o_state[1] ** fill the upper state (carry)
|
||||
local_add_q15(buf1, o_state[0], o_state[1], 0, cell->units);
|
||||
|
||||
/* h = o * nn.tanh(c) -> o_state[0] for the step 1-2 */
|
||||
// 1. tanh(c) -> buf2 --- first copy then activate.
|
||||
nnom_memcpy(buf2, o_state[1], cell->units*2);
|
||||
local_tanh_q15(buf2, cell->units, 7 - c->q_dec_c); // this int bit is under 8bit
|
||||
// 2. h = o*tanh(c) -> o_state[0] ** fill the lower state (memory, hidden)
|
||||
local_mult_q15(z[3], buf2, o_state[0], 15, cell->units);
|
||||
|
||||
// copy and shift q15 to q7 ** (copy hidden to output)
|
||||
local_q15_to_q7(o_state[0], cell->out_data, 8, cell->units);
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
// reserved for debugging: prints the intermediate products and variables
|
||||
#if 0
|
||||
static void print_variable(q7_t* data,char*name, int dec_bit, int size)
|
||||
{
|
||||
printf("\n");
|
||||
printf("%s\n", name);
|
||||
for(int i = 0; i < size; i++)
|
||||
{
|
||||
if(i%8==0)
|
||||
printf("\n");
|
||||
printf("%f\t", (float) data[i] / (1 << dec_bit));
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void print_variable_q15(q15_t *data,char*name, int dec_bit, int size)
|
||||
{
|
||||
printf("\n\n");
|
||||
printf("%s", name);
|
||||
for(int i = 0; i < size; i++)
|
||||
{
|
||||
if(i%8==0)
|
||||
printf("\n");
|
||||
printf("%f\t", (float) data[i] / (1 << dec_bit));
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
// Q7 input output
|
||||
// Q7 weights
|
||||
// Q15 states and intermediate buffer
|
||||
nnom_status_t lstm_cell_q7_q15_run(nnom_rnn_cell_t* cell)
|
||||
{
|
||||
nnom_layer_t *layer = cell->layer;
|
||||
nnom_rnn_layer_t* cl = (nnom_rnn_layer_t *) layer;
|
||||
nnom_lstm_cell_t* c = (nnom_lstm_cell_t*) cell;
|
||||
int act_int_bit = 7 - c->q_dec_z;
|
||||
|
||||
// test
|
||||
//nnom_memset(cell->in_data, 32, cell->feature_size);
|
||||
|
||||
// state buffer
|
||||
// low |-- hidden --|-- carry --| high
|
||||
q15_t* h_tm1 = (q15_t*)cell->in_state;
|
||||
q15_t* c_tm1 = (q15_t*)cell->in_state + cell->units;
|
||||
q15_t* o_state[2];
|
||||
o_state[0] = (q15_t*)cell->out_state;
|
||||
o_state[1] = (q15_t*)cell->out_state + cell->units;
|
||||
|
||||
// computing buffer
|
||||
// low |-- buf0 --|-- buf1 --|-- buf2 --|-- input q15 --|
|
||||
q15_t* z[4];
|
||||
q15_t *buf0, *buf1, *buf2, *in_q15_buf;
|
||||
buf0 = (q15_t*)layer->comp->mem->blk;
|
||||
buf1 = (q15_t*)layer->comp->mem->blk + cell->units*4;
|
||||
buf2 = (q15_t*)layer->comp->mem->blk + cell->units*8;
|
||||
in_q15_buf = (q15_t*)layer->comp->mem->blk + cell->units*12;
|
||||
|
||||
// input q7 -> q15
|
||||
//local_q7_to_q15_no_shift(cell->in_data, in_q15_buf, cell->feature_size);
|
||||
local_q7_to_q15(cell->in_data, in_q15_buf, cell->feature_size);
|
||||
print_variable_q15(in_q15_buf, "input", layer->in->tensor->q_dec[0] + 8, cell->feature_size);
|
||||
print_variable_q15(h_tm1, "h_tml", 15, cell->units);
|
||||
print_variable_q15(c_tm1, "c_tml", c->q_dec_c + 8, cell->units);
|
||||
|
||||
// z1 = K.dot(cell_inputs, kernel) + bias -> buf1
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
arm_fully_connected_mat_q7_vec_q15_opt
|
||||
#else
|
||||
local_fully_connected_mat_q7_vec_q15_opt
|
||||
#endif
|
||||
(in_q15_buf, c->weights->p_data, cell->feature_size, cell->units*4, c->bias_shift + 8, c->oshift_iw, c->bias->p_data, buf1, NULL);
|
||||
|
||||
// z2 = K.dot(h_tm1, recurrent_kernel) -> buf2
|
||||
// arm version must use bias, so we have to use local implementation
|
||||
local_fully_connected_mat_q7_vec_q15_opt(h_tm1, c->recurrent_weights->p_data,
|
||||
cell->units, cell->units*4, 0, c->oshift_hw, NULL, buf2, NULL);
|
||||
|
||||
// z = z1 + z2 -> buf0
|
||||
local_add_q15(buf1, buf2, buf0, 0, cell->units*4);
|
||||
|
||||
print_variable_q15(buf0, "z", c->q_dec_z + 8, cell->units*4);
|
||||
print_variable_q15(buf1, "z1", c->q_dec_z + 8, cell->units*4);
|
||||
print_variable_q15(buf2, "z2", c->q_dec_z + 8, cell->units*4);
|
||||
|
||||
// split the data to each gate
|
||||
z[0] = buf0;
|
||||
z[1] = buf0 + cell->units;
|
||||
z[2] = buf0 + cell->units*2;
|
||||
z[3] = buf0 + cell->units*3;
|
||||
|
||||
// i = nn.sigmoid(z0)
|
||||
local_sigmoid_q15(z[0], cell->units, act_int_bit);
|
||||
// f = nn.sigmoid(z1)
|
||||
local_sigmoid_q15(z[1], cell->units, act_int_bit);
|
||||
// o = nn.sigmoid(z3)
|
||||
local_sigmoid_q15(z[3], cell->units, act_int_bit);
|
||||
|
||||
print_variable_q15(z[0], "z[0] - i", 15, cell->units);
|
||||
print_variable_q15(z[1], "z[1] - f", 15, cell->units);
|
||||
print_variable_q15(z[3], "z[3] - o", 15, cell->units);
|
||||
|
||||
/* c = f * c_tm1 + i * nn.tanh(z2) for the step 1-3. */
|
||||
// 1. i * tanh(z2) -> buf1
|
||||
local_tanh_q15(z[2], cell->units, act_int_bit);
|
||||
print_variable_q15(z[2], "z[2] - ?", 15, cell->units);
|
||||
|
||||
local_mult_q15(z[0], z[2], buf1, 30 - (c->q_dec_c+8), cell->units); //q0.15 * q0.15 >> (shift) = (q_c + 8) // i am not very sure
|
||||
print_variable_q15(buf1, "c2: i * tanh(z2) ", c->q_dec_c+8, cell->units);
|
||||
|
||||
// 2. f * c_tm1 -> o_state[0]
|
||||
local_mult_q15(z[1], c_tm1, o_state[0], 15, cell->units);
|
||||
print_variable_q15(o_state[0], "c1: f * c_tm1", c->q_dec_c+8, cell->units);
|
||||
|
||||
// 3. c = i*tanh + f*c_tm1 -> o_state[1] ** fill the upper state (carry)
|
||||
local_add_q15(buf1, o_state[0], o_state[1], 0, cell->units);
|
||||
print_variable_q15(o_state[1], "c = c1+c2", c->q_dec_c+8, cell->units);
|
||||
|
||||
/* h = o * nn.tanh(c) -> o_state[0] for the step 1-2 */
|
||||
// 1. tanh(c) -> buf2 --- first copy then activate.
|
||||
nnom_memcpy(buf2, o_state[1], cell->units*2);
|
||||
local_tanh_q15(buf2, cell->units, 7 - c->q_dec_c); // this int bit is under 8bit
|
||||
print_variable_q15(buf2, "tanh(c)", 15, cell->units);
|
||||
|
||||
// 2. h = o*tanh(c) -> o_state[0] ** fill the lower state (memory, hidden)
|
||||
local_mult_q15(z[3], buf2, o_state[0], 15, cell->units);
|
||||
print_variable_q15(o_state[0], "h = o*tanh(c)", 15, cell->units);
|
||||
|
||||
// copy and shift q15 to q7 ** (copy hidden to output)
|
||||
local_q15_to_q7(o_state[0], cell->out_data, 8, cell->units);
|
||||
|
||||
print_variable(cell->out_data, "q7 output)", 7, cell->units);
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
#endif
|
239
components/ai/nnom/src/layers/nnom_matrix.c
Normal file
@@ -0,0 +1,239 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_matrix.h"
|
||||
|
||||
// TODO, completely change this file to local version
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
#include "arm_math.h"
|
||||
#include "arm_nnfunctions.h"
|
||||
#endif
|
||||
|
||||
nnom_status_t matrix_build(nnom_layer_t *layer);
|
||||
|
||||
nnom_layer_t *add_s(const nnom_matrix_config_t * config)
|
||||
{
|
||||
nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *) Add(config->output_shift);
|
||||
if(cl)
|
||||
cl->super.config = (void*) config;
|
||||
return (nnom_layer_t *)cl;
|
||||
}
|
||||
|
||||
nnom_layer_t *sub_s(const nnom_matrix_config_t * config)
|
||||
{
|
||||
nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *) Sub(config->output_shift);
|
||||
if(cl)
|
||||
cl->super.config = (void*) config;
|
||||
return (nnom_layer_t *)cl;
|
||||
}
|
||||
|
||||
nnom_layer_t *mult_s(const nnom_matrix_config_t * config)
|
||||
{
|
||||
nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *) Mult(config->output_shift);
|
||||
if(cl)
|
||||
cl->super.config = (void*) config;
|
||||
return (nnom_layer_t *)cl;
|
||||
}
|
||||
|
||||
nnom_layer_t *Add(int16_t oshift)
|
||||
{
|
||||
nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *)_same_shape_matrix_layer();
|
||||
if (cl == NULL)
|
||||
return NULL;
|
||||
// set type in layer parent
|
||||
cl->super.type = NNOM_ADD;
|
||||
cl->super.run = add_run;
|
||||
cl->oshift = oshift;
|
||||
return (nnom_layer_t *)cl;
|
||||
}
|
||||
|
||||
nnom_layer_t *Sub(int16_t oshift)
|
||||
{
|
||||
nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *)_same_shape_matrix_layer();
|
||||
if (cl == NULL)
|
||||
return NULL;
|
||||
// set type in layer parent
|
||||
cl->super.type = NNOM_SUB;
|
||||
cl->super.run = sub_run;
|
||||
cl->oshift = oshift;
|
||||
return (nnom_layer_t *)cl;
|
||||
}
|
||||
|
||||
nnom_layer_t *Mult(int16_t oshift)
|
||||
{
|
||||
nnom_matrix_layer_t *cl = (nnom_matrix_layer_t *)_same_shape_matrix_layer();
|
||||
if (cl == NULL)
|
||||
return NULL;
|
||||
// set type in layer parent
|
||||
cl->super.type = NNOM_MULT;
|
||||
cl->super.run = mult_run;
|
||||
cl->oshift = oshift;
|
||||
return (nnom_layer_t *)cl;
|
||||
}
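// Note on oshift (assumed semantics of the local_*_q7 kernels used by the *_run() methods below):
// Add(0) keeps the raw q7 result, while Add(1) shifts the sum right by one bit, a common way to
// keep the sum of two Q0.7 tensors inside the q7 range.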
|
||||
|
||||
// init a base layer instance with same shape 1 in 1 out. More IO can be added later
|
||||
// mainly used by matrix calculation (add, mult, sub)
|
||||
nnom_layer_t *_same_shape_matrix_layer()
|
||||
{
|
||||
nnom_matrix_layer_t *layer;
|
||||
nnom_layer_io_t *in, *out;
|
||||
//nnom_buf_t *comp;
|
||||
size_t mem_size;
|
||||
|
||||
// allocate a block of memory for all the sub handles.
|
||||
mem_size = sizeof(nnom_matrix_layer_t) + sizeof(nnom_layer_io_t) * 2;
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_matrix_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
//comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.build = matrix_build;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
//comp->type = NNOM_TENSOR_BUF_TEMP;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
//layer->super.comp = comp;
|
||||
return (nnom_layer_t*)layer;
|
||||
}
|
||||
|
||||
nnom_status_t matrix_build(nnom_layer_t *layer)
|
||||
{
|
||||
// get the last layer's output as input shape (if more than one)
|
||||
nnom_layer_io_t *in = layer->in;
|
||||
while(in)
|
||||
{
|
||||
in->tensor = in->hook.io->tensor;
|
||||
in = in->aux;
|
||||
}
|
||||
// output tensor
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR,layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
|
||||
// see if the activation will change the q format
|
||||
if(layer->actail)
|
||||
layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
|
||||
|
||||
// now this build has passed the input tensors (shapes, formats) to the new tensors.
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
nnom_status_t add_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_matrix_layer_t* cl = (nnom_matrix_layer_t*)layer;
|
||||
nnom_layer_io_t *in = layer->in;
|
||||
size_t t_size = tensor_size(layer->out->tensor);
|
||||
int32_t oshift = cl->oshift;
|
||||
size_t num_input = nnom_io_length(layer->in);
|
||||
q7_t *input_mem_blk[MAX_INPUT_LAYER];
|
||||
|
||||
// if there are only 2 matrices
|
||||
if(num_input == 2)
|
||||
{
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
if(oshift == 0)
|
||||
arm_add_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, t_size);
|
||||
else
|
||||
#endif
|
||||
local_add_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, oshift, t_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int i = 0; i < num_input; i++)
|
||||
{
|
||||
input_mem_blk[i] = in->tensor->p_data;
|
||||
in = in->aux;
|
||||
}
|
||||
local_multiple_add_q7(layer->out->tensor->p_data, oshift, t_size, num_input, input_mem_blk);
|
||||
}
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
nnom_status_t sub_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_matrix_layer_t* cl = (nnom_matrix_layer_t*)layer;
|
||||
nnom_layer_io_t *in = layer->in;
|
||||
size_t t_size = tensor_size(layer->out->tensor);
|
||||
int32_t oshift = cl->oshift;
|
||||
size_t num_input = nnom_io_length(layer->in);
|
||||
q7_t *input_mem_blk[MAX_INPUT_LAYER];
|
||||
|
||||
// if there are only 2 matrices
|
||||
if(num_input == 2)
|
||||
{
|
||||
// the first 2 matrix
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
if(oshift == 0)
|
||||
arm_sub_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, t_size);
|
||||
else
|
||||
#endif
|
||||
local_sub_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, oshift, t_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int i = 0; i < num_input; i++)
|
||||
{
|
||||
input_mem_blk[i] = in->tensor->p_data;
|
||||
in = in->aux;
|
||||
}
|
||||
local_multiple_sub_q7(layer->out->tensor->p_data, oshift, t_size, num_input, input_mem_blk);
|
||||
}
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
nnom_status_t mult_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_matrix_layer_t* cl = (nnom_matrix_layer_t*)layer;
|
||||
nnom_layer_io_t *in = layer->in;
|
||||
size_t t_size = tensor_size(layer->out->tensor);
|
||||
int32_t oshift = cl->oshift;
|
||||
size_t num_input = nnom_io_length(layer->in);
|
||||
q7_t *input_mem_blk[MAX_INPUT_LAYER];
|
||||
|
||||
// if there are only 2 matrices
|
||||
if(num_input == 2)
|
||||
{
|
||||
// the first 2 matrix
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
if(oshift == 0)
|
||||
arm_mult_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, t_size);
|
||||
else
|
||||
#endif
|
||||
local_mult_q7(layer->in->tensor->p_data, layer->in->aux->tensor->p_data, layer->out->tensor->p_data, oshift, t_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int i = 0; i < num_input; i++)
|
||||
{
|
||||
input_mem_blk[i] = in->tensor->p_data;
|
||||
in = in->aux;
|
||||
}
|
||||
local_multiple_mult_q7(layer->out->tensor->p_data, oshift, t_size, num_input, input_mem_blk);
|
||||
}
|
||||
return NN_SUCCESS;
|
||||
}
|
191
components/ai/nnom/src/layers/nnom_maxpool.c
Normal file
@@ -0,0 +1,191 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_maxpool.h"
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
#include "arm_math.h"
|
||||
#include "arm_nnfunctions.h"
|
||||
#endif
|
||||
|
||||
nnom_layer_t *maxpool_s(const nnom_pool_config_t * config)
|
||||
{
|
||||
nnom_layer_t *layer;
|
||||
|
||||
// test, to accommodate 1d and 2d input
|
||||
if(config->num_dim == 1)
|
||||
{
|
||||
layer = MaxPool(kernel(1, config->kernel_size[0]),
|
||||
stride(1, config->stride_size[0]),
|
||||
config->padding_type);
|
||||
}
|
||||
else
|
||||
{
|
||||
layer = MaxPool(kernel(config->kernel_size[0], config->kernel_size[1]),
|
||||
stride(config->stride_size[0], config->stride_size[1]),
|
||||
config->padding_type);
|
||||
}
|
||||
|
||||
if(layer)
|
||||
layer->config = (void*) config;
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *MaxPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type)
|
||||
{
|
||||
nnom_maxpool_layer_t *layer;
|
||||
nnom_buf_t *comp;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate a block of memory for all the sub handles.
|
||||
size_t mem_size = sizeof(nnom_maxpool_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t);
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_maxpool_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_MAXPOOL;
|
||||
layer->super.run = maxpool_run;
|
||||
layer->super.build = maxpool_build;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
comp->type = NNOM_TENSOR_BUF_TEMP;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
layer->super.comp = comp;
|
||||
|
||||
// set parameters
|
||||
layer->kernel = k;
|
||||
layer->stride = s;
|
||||
layer->padding_type = pad_type;
|
||||
|
||||
// padding
|
||||
if (layer->padding_type == PADDING_SAME)
|
||||
{
|
||||
layer->pad.h = (k.h - 1) / 2;
|
||||
layer->pad.w = (k.w - 1) / 2;
|
||||
layer->pad.c = 1; // no meaning
|
||||
}
|
||||
else
|
||||
{
|
||||
layer->pad.h = 0;
|
||||
layer->pad.w = 0;
|
||||
layer->pad.c = 0;
|
||||
}
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
nnom_status_t maxpool_build(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_maxpool_layer_t *cl = (nnom_maxpool_layer_t *)layer;
|
||||
|
||||
// get the tensor from last layer's output
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
|
||||
// create new tensor for output
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
|
||||
// copy then change later.
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
|
||||
// see if the activation will change the q format
|
||||
if(layer->actail)
|
||||
layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
|
||||
|
||||
// now we set up the tensor shape, always HWC format
|
||||
if (cl->padding_type == PADDING_SAME)
|
||||
{
|
||||
layer->out->tensor->dim[0] = NN_CEILIF(layer->in->tensor->dim[0], cl->stride.h);
|
||||
layer->out->tensor->dim[1] = NN_CEILIF(layer->in->tensor->dim[1], cl->stride.w);
|
||||
layer->out->tensor->dim[2] = layer->in->tensor->dim[2]; // channel stays the same
|
||||
}
|
||||
else
|
||||
{
|
||||
layer->out->tensor->dim[0] = NN_CEILIF(layer->in->tensor->dim[0] - cl->kernel.h + 1, cl->stride.h);
|
||||
layer->out->tensor->dim[1] = NN_CEILIF(layer->in->tensor->dim[1] - cl->kernel.w + 1, cl->stride.w);
|
||||
layer->out->tensor->dim[2] = layer->in->tensor->dim[2];
|
||||
}
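// Dimension example (hypothetical input, assuming NN_CEILIF is a ceiling division): a 28x28 input
// with a 3x3 kernel and stride 2 gives ceil(28/2) = 14 per side for PADDING_SAME and
// ceil((28-3+1)/2) = 13 per side for PADDING_VALID.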
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
nnom_status_t maxpool_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_maxpool_layer_t *cl = (nnom_maxpool_layer_t *)(layer);
|
||||
|
||||
uint16_t out_x, out_y;
|
||||
|
||||
// if global pooling
|
||||
if(layer->out->tensor->num_dim == 1)
|
||||
{
|
||||
out_x = 1; out_y = 1;
|
||||
}
|
||||
else // normal pooling.
|
||||
{
|
||||
out_x = layer->out->tensor->dim[1]; //W
|
||||
out_y = layer->out->tensor->dim[0]; //h
|
||||
}
|
||||
|
||||
#ifdef NNOM_USING_CHW
|
||||
local_maxpool_q7_CHW(layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h,
|
||||
cl->pad.w, cl->pad.h,
|
||||
cl->stride.w, cl->stride.h,
|
||||
out_x, out_y,
|
||||
NULL,
|
||||
layer->out->tensor->p_data);
|
||||
#else //end of CHW
|
||||
// HWC
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
// 2D, square
|
||||
if (layer->in->tensor->dim[1] == layer->in->tensor->dim[0] &&
|
||||
layer->out->tensor->dim[1] == layer->out->tensor->dim[0])
|
||||
{
|
||||
arm_maxpool_q7_HWC(
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[2],
|
||||
cl->kernel.w, cl->pad.w, cl->stride.w,
|
||||
layer->out->tensor->dim[1],
|
||||
NULL,
|
||||
layer->out->tensor->p_data);
|
||||
}
|
||||
// non-square 2D, or 1D
|
||||
else
|
||||
#endif
|
||||
{
|
||||
// CMSIS-NN does not support non-square pooling, so we have to use the local implementation
|
||||
local_maxpool_q7_HWC(layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h,
|
||||
cl->pad.w, cl->pad.h,
|
||||
cl->stride.w, cl->stride.h,
|
||||
out_x, out_y,
|
||||
NULL,
|
||||
layer->out->tensor->p_data);
|
||||
}
|
||||
#endif // CHW/HWC
|
||||
return NN_SUCCESS;
|
||||
}
|
54
components/ai/nnom/src/layers/nnom_output.c
Normal file
@@ -0,0 +1,54 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_output.h"
|
||||
|
||||
nnom_layer_t *output_s(const nnom_io_config_t* config)
|
||||
{
|
||||
nnom_layer_t *layer = input_s(config);
|
||||
if(layer)
|
||||
{
|
||||
layer->config = (void*) config;
|
||||
layer->type = NNOM_OUTPUT;
|
||||
layer->run = output_run;
|
||||
layer->build = default_build;
|
||||
}
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *Output(nnom_3d_shape_t output_shape, void *p_buf)
|
||||
{
|
||||
// they are actually the same, except for the type defined
|
||||
nnom_layer_t *layer = Input(output_shape, p_buf);
|
||||
if (layer != NULL)
|
||||
{
|
||||
layer->type = NNOM_OUTPUT;
|
||||
layer->run = output_run;
|
||||
layer->build = default_build;
|
||||
}
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_status_t output_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_io_layer_t *cl = (nnom_io_layer_t *)layer;
|
||||
nnom_memcpy(cl->buf, layer->in->tensor->p_data, tensor_size(layer->out->tensor)); // in->memory -> user memory
|
||||
return NN_SUCCESS;
|
||||
}
|
191
components/ai/nnom/src/layers/nnom_rnn.c
Normal file
@@ -0,0 +1,191 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_rnn.h"
|
||||
|
||||
nnom_status_t rnn_build(nnom_layer_t *layer);
|
||||
nnom_status_t rnn_run(nnom_layer_t *layer);
|
||||
nnom_status_t rnn_free(nnom_layer_t* layer);
|
||||
|
||||
// RNN
|
||||
nnom_layer_t *rnn_s(nnom_rnn_cell_t *cell, const nnom_rnn_config_t* config)
|
||||
{
|
||||
nnom_rnn_layer_t *layer;
|
||||
nnom_buf_t *comp;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate a block of memory for all the sub handles.
|
||||
size_t mem_size = sizeof(nnom_rnn_layer_t) + sizeof(nnom_layer_io_t) * 2 + sizeof(nnom_buf_t);
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_rnn_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
comp = (void *)((uint8_t*)out + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_RNN;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
comp->type = NNOM_TENSOR_BUF_TEMP;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
layer->super.comp = comp;
|
||||
// set run and outshape methods
|
||||
layer->super.run = rnn_run;
|
||||
layer->super.build = rnn_build;
|
||||
layer->super.free = rnn_free;
|
||||
|
||||
// rnn parameters.
|
||||
layer->return_sequence = config->return_sequence;
|
||||
layer->stateful = config->stateful;
|
||||
layer->go_backwards = config->go_backwards;
|
||||
layer->super.config = (void*)config;
|
||||
layer->cell = cell;
|
||||
|
||||
// set this layer to the cell
|
||||
layer->cell->layer = (nnom_layer_t *)layer;
|
||||
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
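A hedged sketch of how this constructor is meant to be used: the cell is created first and handed to rnn_s(). In practice both config structs are emitted by the NNoM model script; the values below are illustrative only.

```
void build_rnn(void)
{
    // Illustrative only: in real projects both configs come from the
    // generated model code rather than hand-written values.
    static nnom_simple_cell_config_t cell_cfg;  // units, weights, bias, act_type, q_dec_* ...
    static nnom_rnn_config_t rnn_cfg = {
        .return_sequence = false,   // keep only the last timestep's output
        .stateful = false,          // clear the state buffer before every run
        .go_backwards = false       // process timesteps in forward order
    };
    nnom_layer_t *rnn_layer = rnn_s(simple_cell_s(&cell_cfg), &rnn_cfg);
    (void)rnn_layer;                // hooked into a model in real code
}
```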
||||
|
||||
nnom_status_t rnn_free(nnom_layer_t* layer)
|
||||
{
|
||||
nnom_rnn_layer_t* cl = (nnom_rnn_layer_t*)layer;
|
||||
// free cell
|
||||
if(cl->cell->free)
|
||||
cl->cell->free(cl->cell);
|
||||
|
||||
// free state buffer
|
||||
nnom_free(cl->state_buf);
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
nnom_status_t rnn_build(nnom_layer_t* layer)
|
||||
{
|
||||
nnom_rnn_layer_t *cl = (nnom_rnn_layer_t *)layer;
|
||||
|
||||
// get the tensor from last layer's output
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
|
||||
// timestamp size
|
||||
cl->timestamp_size = layer->in->tensor->num_dim > 2 ? layer->in->tensor->dim[1] : layer->in->tensor->dim[0];
|
||||
|
||||
if(cl->return_sequence)
|
||||
{
|
||||
// create new tensor for the output
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 2, 0);
|
||||
// shape: timestamp, units
|
||||
layer->out->tensor->dim[0] = cl->timestamp_size;
|
||||
layer->out->tensor->dim[1] = cl->cell->units;
|
||||
}
|
||||
else
|
||||
{
|
||||
// create new tensor for the output
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, 1, 0);
|
||||
// shape: units
|
||||
layer->out->tensor->dim[0] = cl->cell->units;
|
||||
}
|
||||
|
||||
// output q format - the outputs of the available activations are both q0.7 (q0.15 for 16-bit).
|
||||
layer->out->tensor->q_dec[0] = layer->in->tensor->bitwidth==16? 15: 7;
|
||||
layer->out->tensor->bitwidth = layer->in->tensor->bitwidth;
|
||||
// see if the activation will change the q format
|
||||
if(layer->actail)
|
||||
layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
|
||||
|
||||
// get feature size from input tensor
|
||||
cl->cell->feature_size = tensor_get_num_channel(layer->in->tensor); // vector (feature) size
|
||||
|
||||
// call cell builder to build the cell
|
||||
cl->cell->build(cl->cell);
|
||||
|
||||
// get the size of the computation buffer
|
||||
cl->super.comp->size = cl->cell->comp_buf_size; // size of intermediate buffer required by the cell.
|
||||
cl->state_buf = nnom_mem(cl->cell->state_size * 2); // allocate state buf for upper/lower state buffer.
|
||||
if(!cl->state_buf)
|
||||
return NN_NO_MEMORY;
|
||||
|
||||
// get the computational cost provided by Cell
|
||||
layer->stat.macc = cl->cell->macc * cl->timestamp_size;
|
||||
return NN_SUCCESS;
|
||||
}
|
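As a worked example of the bookkeeping above (the numbers are illustrative, not from the source):

```
/* Illustrative numbers only:
 * a SimpleCell with feature_size = 16 and units = 32 reports
 *     macc = 16*32 + 32*32 = 1536 MACs per timestep,
 * so with timestamp_size = 28 the layer cost is 1536 * 28 = 43008 MACs,
 * and the state buffer allocated above is 2 * state_size = 2 * 32 = 64 bytes
 * (two q7 state vectors for the ping-pong scheme used in rnn_run). */
```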
||||
|
||||
nnom_status_t rnn_run(nnom_layer_t* layer)
|
||||
{
|
||||
nnom_status_t result;
|
||||
nnom_rnn_layer_t* cl = (nnom_rnn_layer_t*)(layer);
|
||||
size_t timestamps_size = layer->in->tensor->dim[layer->in->tensor->num_dim-2];
|
||||
size_t feature_size = tensor_get_num_channel(layer->in->tensor); // feature size = last dimension.
|
||||
size_t state_size = cl->cell->state_size;
|
||||
size_t output_growth;
|
||||
void* upper_state = (q7_t*)cl->state_buf + state_size;
|
||||
void* lower_state = (q7_t*)cl->state_buf;
|
||||
|
||||
// reset state buffer if not in stateful
|
||||
if (!cl->stateful)
|
||||
nnom_memset(cl->state_buf, 0, state_size * 2);
|
||||
|
||||
// set output data
|
||||
output_growth = cl->return_sequence ? cl->cell->units : 0;
|
||||
|
||||
// run timestamp by timestamp
|
||||
for (uint32_t round = 0; round < timestamps_size; round++)
|
||||
{
|
||||
if(cl->go_backwards)
|
||||
{
|
||||
// set input data
|
||||
cl->cell->in_data = (q7_t*)layer->in->tensor->p_data + feature_size*(timestamps_size - 1 - round);
|
||||
// set output data
|
||||
cl->cell->out_data = (q7_t*)layer->out->tensor->p_data + output_growth*(timestamps_size - 1 - round);
|
||||
}
|
||||
else
|
||||
{
|
||||
// set input data
|
||||
cl->cell->in_data = (q7_t*)layer->in->tensor->p_data + feature_size*round;
|
||||
// set output data
|
||||
cl->cell->out_data = (q7_t*)layer->out->tensor->p_data + output_growth*round;
|
||||
}
|
||||
|
||||
// switch upper/lower state buffer
|
||||
if(cl->cell->in_state != lower_state)
|
||||
{
|
||||
cl->cell->in_state = lower_state;
|
||||
cl->cell->out_state = upper_state;
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->cell->in_state = upper_state;
|
||||
cl->cell->out_state = lower_state;
|
||||
}
|
||||
|
||||
// run it
|
||||
result = cl->cell->run(cl->cell);
|
||||
if(result != NN_SUCCESS)
|
||||
return result;
|
||||
}
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
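The upper/lower buffer switching above can be hard to follow inside the full run loop, so here is a minimal stand-alone sketch of the same ping-pong pattern (all names are illustrative, not NNoM APIs):

```
#define STATE_SIZE 32                        /* illustrative */
#define TIMESTEPS  28                        /* illustrative */

static void cell_step(const q7_t *in_state, q7_t *out_state)
{
    (void)in_state; (void)out_state;         /* hypothetical single-step cell */
}

static void rnn_pingpong_demo(void)
{
    static q7_t state_buf[2 * STATE_SIZE];
    q7_t *in_state  = state_buf;              // lower half
    q7_t *out_state = state_buf + STATE_SIZE; // upper half
    for (int t = 0; t < TIMESTEPS; t++)
    {
        cell_step(in_state, out_state);       // new state written to out_state
        q7_t *tmp = in_state;                 // swap the halves so the new state
        in_state  = out_state;                // becomes the input state of the
        out_state = tmp;                      // next timestep
    }
}
```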
||||
|
142
components/ai/nnom/src/layers/nnom_simple_cell.c
Normal file
142
components/ai/nnom/src/layers/nnom_simple_cell.c
Normal file
@@ -0,0 +1,142 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2020-08-21 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_simple_cell.h"
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
#include "arm_math.h"
|
||||
#include "arm_nnfunctions.h"
|
||||
#endif
|
||||
|
||||
// Simple RNN
|
||||
// unit = output shape
|
||||
// type of activation
|
||||
nnom_rnn_cell_t *simple_cell_s(const nnom_simple_cell_config_t* config)
|
||||
{
|
||||
nnom_simple_cell_t *cell;
|
||||
cell = nnom_mem(sizeof(nnom_simple_cell_t));
|
||||
if (cell == NULL)
|
||||
return NULL;
|
||||
// set methods
|
||||
cell->super.run = simple_cell_run;
|
||||
cell->super.build = simple_cell_build;
|
||||
cell->super.free = simple_cell_free;
|
||||
cell->super.config = (void*) config;
|
||||
cell->super.units = config->units;
|
||||
cell->super.type = NNOM_SIMPLE_CELL;
|
||||
|
||||
// set parameters
|
||||
cell->bias = config->bias;
|
||||
cell->weights = config->weights;
|
||||
cell->recurrent_weights = config->recurrent_weights;
|
||||
cell->act_type = config->act_type;
|
||||
// q format for intermediate products
|
||||
cell->q_dec_iw = config->q_dec_iw;
|
||||
cell->q_dec_hw = config->q_dec_hw;
|
||||
cell->q_dec_h = config->q_dec_h;
|
||||
|
||||
return (nnom_rnn_cell_t *)cell;
|
||||
}
|
||||
|
||||
nnom_status_t simple_cell_free(nnom_rnn_cell_t* cell)
|
||||
{
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
// set the state buffer and computation buffer sizes of the cell
|
||||
nnom_status_t simple_cell_build(nnom_rnn_cell_t* cell)
|
||||
{
|
||||
nnom_layer_t *layer = cell->layer;
|
||||
nnom_simple_cell_t *c = (nnom_simple_cell_t *)cell;
|
||||
nnom_simple_cell_config_t *config = (nnom_simple_cell_config_t *)cell->config;
|
||||
int q_hw_iw;
|
||||
|
||||
// activation, check if activation is supported
|
||||
if(config->act_type != ACT_SIGMOID && config->act_type != ACT_TANH)
|
||||
return NN_ARGUMENT_ERROR;
|
||||
|
||||
// calculate the output shifts for the two dot products.
|
||||
// hw = the product of hidden x weight, iw = the product of input x weight
|
||||
// because they are added together, they must share the same q format.
|
||||
q_hw_iw = MIN(c->q_dec_hw, c->q_dec_iw);
|
||||
|
||||
// for the two dot products in the cell: output shift = input_dec + weight_dec - output_dec
|
||||
c->oshift_hw = c->q_dec_h + c->recurrent_weights->q_dec[0] - q_hw_iw;
|
||||
c->oshift_iw = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - q_hw_iw;
|
||||
|
||||
// bias shift = input_dec + weight_dec - bias_dec
|
||||
c->bias_shift = layer->in->tensor->q_dec[0] + c->weights->q_dec[0] - c->bias->q_dec[0];
|
||||
|
||||
// state size = one timestamp output size.
|
||||
cell->state_size = cell->units;
|
||||
|
||||
// comp buffer size: not required
|
||||
cell->comp_buf_size = 0;
|
||||
|
||||
// finally, calculate the MAC for info
|
||||
cell->macc = cell->feature_size * cell->units // input: feature * state
|
||||
+ cell->units * cell->units; // recurrent, state * output_unit
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
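A worked example of the shift arithmetic above, with illustrative Q formats (not taken from any real model):

```
/* Illustrative Q formats:
 *   input q0.7 (dec 7), weights q0.7 (dec 7), recurrent weights q1.6 (dec 6),
 *   hidden state q_dec_h = 7, bias q2.5 (dec 5),
 *   configured products q_dec_iw = 7, q_dec_hw = 6.
 *
 *   q_hw_iw    = MIN(6, 7)  = 6
 *   oshift_iw  = 7 + 7 - 6  = 8   (input x weight result shifted right by 8)
 *   oshift_hw  = 7 + 6 - 6  = 7   (state x recurrent weight result shifted right by 7)
 *   bias_shift = 7 + 7 - 5  = 9   (bias shifted left to the accumulator's q format)
 *
 * Both dot products then land in the same q format (dec = 6) and can be
 * added together directly. */
```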
||||
|
||||
// This SimpleCell replicates the behaviour of Keras's SimpleRNNCell, as shown below
|
||||
/*
|
||||
def call(self, inputs, states, training=None):
|
||||
prev_output = states[0] if nest.is_sequence(states) else states
|
||||
|
||||
h = K.dot(inputs, self.kernel)
|
||||
h = K.bias_add(h, self.bias)
|
||||
|
||||
h2 = K.dot(prev_output, self.recurrent_kernel)
|
||||
output = h + h2
|
||||
output = self.activation(output)
|
||||
|
||||
new_state = [output] if nest.is_sequence(states) else output
|
||||
return output, new_state
|
||||
*/
|
||||
|
||||
nnom_status_t simple_cell_run(nnom_rnn_cell_t* cell)
|
||||
{
|
||||
nnom_simple_cell_t* c = (nnom_simple_cell_t*) cell;
|
||||
int act_int_bit = 7 - MIN(c->q_dec_hw, c->q_dec_iw);
|
||||
|
||||
// in_state x recurrent_weight -> h2 (output buf)
|
||||
local_dot_q7_opt(cell->in_state, c->recurrent_weights->p_data, cell->units, cell->units, c->oshift_hw, cell->out_data);
|
||||
// (input x weight) + bias -> h (in_state buf)
|
||||
local_fully_connected_q7_opt(cell->in_data, c->weights->p_data,
|
||||
cell->feature_size, cell->units, c->bias_shift, c->oshift_iw, c->bias->p_data, cell->in_state, NULL);
|
||||
// h + h2 -> (out_state buf)
|
||||
local_add_q7(cell->in_state, cell->out_data, cell->out_state, 0, cell->units);
|
||||
|
||||
// active(out_state buf)
|
||||
if(c->act_type == ACT_TANH)
|
||||
local_tanh_q7(cell->out_state, cell->units, act_int_bit);
|
||||
//local_hard_tanh_q7(cell->out_state, cell->units, act_int_bit);
|
||||
else
|
||||
local_sigmoid_q7(cell->out_state, cell->units, act_int_bit);
|
||||
//local_hard_sigmoid_q7(cell->out_state, cell->units, act_int_bit);
|
||||
|
||||
// (out_state buf) --copy--> (output buf)
|
||||
nnom_memcpy(cell->out_data, cell->out_state, cell->units);
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
|
86
components/ai/nnom/src/layers/nnom_softmax.c
Normal file
86
components/ai/nnom/src/layers/nnom_softmax.c
Normal file
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_softmax.h"
|
||||
|
||||
#ifdef NNOM_USING_CMSIS_NN
|
||||
#include "arm_math.h"
|
||||
#include "arm_nnfunctions.h"
|
||||
#endif
|
||||
|
||||
nnom_layer_t *softmax_s(const nnom_softmax_config_t * config)
|
||||
{
|
||||
nnom_layer_t * layer = Softmax();
|
||||
if(layer)
|
||||
layer->config = (void*) config;
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_layer_t *Softmax(void)
|
||||
{
|
||||
nnom_layer_t *layer;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate one block of memory for all the sub handles.
|
||||
size_t mem_size = sizeof(nnom_layer_t) + sizeof(nnom_layer_io_t) * 2;
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->type = NNOM_SOFTMAX;
|
||||
layer->run = softmax_run;
|
||||
layer->build = softmax_build;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
// put in & out on the layer.
|
||||
layer->in = io_init(layer, in);
|
||||
layer->out = io_init(layer, out);
|
||||
|
||||
return layer;
|
||||
}
|
||||
|
||||
nnom_status_t softmax_build(nnom_layer_t *layer)
|
||||
{
|
||||
// get the last layer's output as input shape
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
// output tensor
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
// softmax has fixed output dec bit
|
||||
layer->out->tensor->q_dec[0] = 7;
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
nnom_status_t softmax_run(nnom_layer_t *layer)
|
||||
{
|
||||
// the new version seems to cause quite a large accuracy drop.
|
||||
// #ifdef NNOM_USING_CMSIS_NN
|
||||
// // temporary fix for multi-dimensional input.
|
||||
// arm_softmax_q7(layer->in->tensor->p_data, tensor_size(layer->out->tensor), layer->out->tensor->p_data);
|
||||
// #else
|
||||
local_softmax_q7(layer->in->tensor->p_data, tensor_size(layer->out->tensor), layer->out->tensor->p_data);
|
||||
//#endif
|
||||
return NN_SUCCESS;
|
||||
}
|
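Since the softmax output is pinned to q0.7 above, downstream code can recover approximate probabilities by dividing by 128. A small illustrative helper (not part of NNoM):

```
static float q7_to_prob(int8_t q)
{
    // q0.7: 7 fractional bits, so 1.0 maps to 128; e.g. 96 -> 0.75
    return (float)q / 128.0f;
}
```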
104
components/ai/nnom/src/layers/nnom_sumpool.c
Normal file
104
components/ai/nnom/src/layers/nnom_sumpool.c
Normal file
@@ -0,0 +1,104 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_sumpool.h"
|
||||
|
||||
nnom_layer_t *sumpool_s(const nnom_pool_config_t * config)
|
||||
{
|
||||
nnom_sumpool_layer_t *cl;
|
||||
if(config->num_dim == 1)
|
||||
{
|
||||
cl = (nnom_sumpool_layer_t *)SumPool(kernel(1, config->kernel_size[0]),
|
||||
stride(1, config->stride_size[0]),
|
||||
config->padding_type);
|
||||
}
|
||||
else
|
||||
{
|
||||
cl = (nnom_sumpool_layer_t *)SumPool(kernel(config->kernel_size[0], config->kernel_size[1]),
|
||||
stride(config->stride_size[0], config->stride_size[1]),
|
||||
config->padding_type);
|
||||
}
|
||||
if(cl)
|
||||
{
|
||||
cl->super.config = (void*) config;
|
||||
cl->output_shift = config->output_shift; // no idea if we need it
|
||||
}
|
||||
return (nnom_layer_t *)cl;
|
||||
}
|
||||
|
||||
|
||||
nnom_layer_t *SumPool(nnom_3d_shape_t k, nnom_3d_shape_t s, nnom_padding_t pad_type)
|
||||
{
|
||||
nnom_layer_t *layer = MaxPool(k, s, pad_type);
|
||||
|
||||
if (layer != NULL)
|
||||
{
|
||||
layer->type = NNOM_SUMPOOL;
|
||||
layer->run = sumpool_run;
|
||||
layer->build = sumpool_build;
|
||||
}
|
||||
return (nnom_layer_t *)layer;
|
||||
}
|
||||
|
||||
|
||||
nnom_status_t sumpool_build(nnom_layer_t *layer)
|
||||
{
|
||||
// sum pooling shares the same output shape, stride and padding settings as max pooling.
|
||||
maxpool_build(layer);
|
||||
|
||||
// however, sum pooling requires a computation buffer (4 bytes per output element for the 32-bit accumulators).
|
||||
layer->comp->size = 4 * tensor_size(layer->out->tensor);
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
// sum pooling dynamically changes the Q format; in the current version it must be used as the last layer before softmax
|
||||
nnom_status_t sumpool_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_sumpool_layer_t *cl = (nnom_sumpool_layer_t *)(layer);
|
||||
uint16_t out_x, out_y;
|
||||
|
||||
// if global pooling
|
||||
if(layer->out->tensor->num_dim == 1)
|
||||
{
|
||||
out_x = 1; out_y = 1;
|
||||
}
|
||||
else // normal pooling.
|
||||
{
|
||||
out_x = layer->out->tensor->dim[1]; //W
|
||||
out_y = layer->out->tensor->dim[0]; //h
|
||||
}
|
||||
|
||||
#ifdef NNOM_USING_CHW
|
||||
local_sumpool_q7_CHW(
|
||||
#else
|
||||
local_sumpool_q7_HWC(
|
||||
#endif
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h,
|
||||
cl->pad.w, cl->pad.h,
|
||||
cl->stride.w, cl->stride.h,
|
||||
out_x, out_y,
|
||||
layer->comp->mem->blk,
|
||||
layer->out->tensor->p_data);
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
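A hedged usage sketch of the constructor above; `PADDING_VALID` is assumed to be the NNoM padding enum, and the 2x2 window with stride 2 is illustrative:

```
void build_sumpool(void)
{
    // Sum pooling over 2x2 windows with stride 2; the layer reuses MaxPool's
    // shape/stride/padding handling but accumulates into a 32-bit comp buffer.
    nnom_layer_t *pool = SumPool(kernel(2, 2), stride(2, 2), PADDING_VALID);
    (void)pool;   // hooked into a model in real code
}
```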
104
components/ai/nnom/src/layers/nnom_upsample.c
Normal file
104
components/ai/nnom/src/layers/nnom_upsample.c
Normal file
@@ -0,0 +1,104 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_upsample.h"
|
||||
|
||||
nnom_layer_t *upsample_s(const nnom_upsample_config_t *config)
|
||||
{
|
||||
nnom_layer_t *layer = UpSample(kernel(config->kernel[0], config->kernel[1]));
|
||||
if(layer)
|
||||
layer->config = (void*) config;
|
||||
return layer;
|
||||
}
|
||||
|
||||
// up sampling layer
|
||||
nnom_layer_t *UpSample(nnom_3d_shape_t kernel)
|
||||
{
|
||||
nnom_upsample_layer_t *layer;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate one block of memory for all the sub handles.
|
||||
size_t mem_size = sizeof(nnom_upsample_layer_t) + sizeof(nnom_layer_io_t) * 2;
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_upsample_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_UPSAMPLE;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
// set run and outshape methods
|
||||
layer->super.run = upsample_run;
|
||||
layer->super.build = upsample_build;
|
||||
|
||||
// set parameters
|
||||
layer->kernel = kernel;
|
||||
|
||||
return (nnom_layer_t*)layer;
|
||||
}
|
||||
|
||||
nnom_status_t upsample_build(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_upsample_layer_t* cl = (nnom_upsample_layer_t*)layer;
|
||||
|
||||
// get the last layer's output as input shape
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
// output tensor
|
||||
// 1. allocate a new tensor for output
|
||||
// 2. set the same dim, qfmt to the new tensor.
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
|
||||
// see if the activation will change the q format
|
||||
if(layer->actail)
|
||||
layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
|
||||
|
||||
// enlarge w and h; c stays the same.
|
||||
layer->out->tensor->dim[0] = layer->in->tensor->dim[0] * cl->kernel.h;
|
||||
layer->out->tensor->dim[1] = layer->in->tensor->dim[1] * cl->kernel.w;
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
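What the enlargement above means at the data level: each input pixel is repeated kernel.w times horizontally and kernel.h times vertically. A minimal HWC nearest-neighbour sketch, illustrative only (NNoM's run function uses local_up_sampling_q7_HWC/CHW instead):

```
static void upsample_nn_hwc(const int8_t *in, int8_t *out,
                            int in_w, int in_h, int ch, int kw, int kh)
{
    for (int y = 0; y < in_h * kh; y++)
        for (int x = 0; x < in_w * kw; x++)
            for (int c = 0; c < ch; c++)
                // each output pixel copies its source pixel at (y/kh, x/kw)
                out[(y * in_w * kw + x) * ch + c] =
                    in[((y / kh) * in_w + (x / kw)) * ch + c];
}
```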
||||
|
||||
// up sampling, also known as unpooling
|
||||
nnom_status_t upsample_run(nnom_layer_t *layer)
|
||||
{
|
||||
nnom_upsample_layer_t *cl = (nnom_upsample_layer_t *)(layer);
|
||||
|
||||
#ifdef NNOM_USING_CHW
|
||||
local_up_sampling_q7_CHW(
|
||||
#else
|
||||
local_up_sampling_q7_HWC(
|
||||
#endif
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->kernel.w, cl->kernel.h,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0],
|
||||
NULL,
|
||||
layer->out->tensor->p_data);
|
||||
return NN_SUCCESS;
|
||||
}
|
107
components/ai/nnom/src/layers/nnom_zero_padding.c
Normal file
107
components/ai/nnom/src/layers/nnom_zero_padding.c
Normal file
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Copyright (c) 2018-2020
|
||||
* Jianjia Ma
|
||||
* majianjia@live.com
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Change Logs:
|
||||
* Date Author Notes
|
||||
* 2019-07-23 Jianjia Ma The first version
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "nnom.h"
|
||||
#include "nnom_local.h"
|
||||
#include "nnom_layers.h"
|
||||
#include "layers/nnom_zero_padding.h"
|
||||
|
||||
nnom_layer_t * zeropadding_s(const nnom_zero_padding_config_t* config)
|
||||
{
|
||||
nnom_layer_t *layer = ZeroPadding(config->pad);
|
||||
if(layer)
|
||||
layer->config = (void*) config;
|
||||
return (nnom_layer_t*)layer;
|
||||
}
|
||||
|
||||
// Zero padding layer
|
||||
nnom_layer_t *ZeroPadding(nnom_border_t pad)
|
||||
{
|
||||
nnom_zero_padding_layer_t *layer;
|
||||
nnom_layer_io_t *in, *out;
|
||||
|
||||
// allocate one block of memory for all the sub handles.
|
||||
size_t mem_size = sizeof(nnom_zero_padding_layer_t) + sizeof(nnom_layer_io_t) * 2;
|
||||
layer = nnom_mem(mem_size);
|
||||
if (layer == NULL)
|
||||
return NULL;
|
||||
|
||||
// distribute the memory to the sub handles.
|
||||
in = (void *)((uint8_t*)layer + sizeof(nnom_zero_padding_layer_t));
|
||||
out = (void *)((uint8_t*)in + sizeof(nnom_layer_io_t));
|
||||
|
||||
// set type in layer parent
|
||||
layer->super.type = NNOM_ZERO_PADDING;
|
||||
// set buf state
|
||||
in->type = NNOM_TENSOR_BUF_TEMP;
|
||||
out->type = NNOM_TENSOR_BUF_TEMP;
|
||||
// put in & out on the layer.
|
||||
layer->super.in = io_init(layer, in);
|
||||
layer->super.out = io_init(layer, out);
|
||||
// set run and outshape methods
|
||||
layer->super.run = zero_padding_run;
|
||||
layer->super.build = zero_padding_build;
|
||||
|
||||
// set parameters
|
||||
layer->pad = pad;
|
||||
|
||||
return (nnom_layer_t*)layer;
|
||||
}
|
||||
|
||||
nnom_status_t zero_padding_build(nnom_layer_t* layer)
|
||||
{
|
||||
nnom_zero_padding_layer_t *cl = (nnom_zero_padding_layer_t *)layer;
|
||||
|
||||
// get the tensor from last layer's output
|
||||
layer->in->tensor = layer->in->hook.io->tensor;
|
||||
|
||||
// create new tensor for output
|
||||
layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
|
||||
// copy then change later.
|
||||
tensor_cpy_attr(layer->out->tensor, layer->in->tensor);
|
||||
|
||||
// see if the activation will change the q format
|
||||
if(layer->actail)
|
||||
layer->out->tensor->q_dec[0] = act_get_dec_bit(layer->actail->type, layer->out->tensor->q_dec[0]);
|
||||
|
||||
// output shape
|
||||
layer->out->tensor->dim[1] = layer->in->tensor->dim[1] + cl->pad.left + cl->pad.right;
|
||||
layer->out->tensor->dim[0] = layer->in->tensor->dim[0] + cl->pad.top + cl->pad.bottom;
|
||||
layer->out->tensor->dim[2] = layer->in->tensor->dim[2];
|
||||
return NN_SUCCESS;
|
||||
}
|
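A worked example of the output shape above (the numbers are illustrative):

```
/* A 28x28x1 HWC input padded with {top = 1, bottom = 1, left = 2, right = 2}
 * gives:
 *   dim[0] (height)   = 28 + 1 + 1 = 30
 *   dim[1] (width)    = 28 + 2 + 2 = 32
 *   dim[2] (channels) = 1  (unchanged)                                    */
```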
||||
|
||||
nnom_status_t zero_padding_run(nnom_layer_t * layer)
|
||||
{
|
||||
nnom_zero_padding_layer_t *cl = (nnom_zero_padding_layer_t*)layer;
|
||||
|
||||
#ifdef NNOM_USING_CHW
|
||||
local_zero_padding_CHW_q7(
|
||||
#else
|
||||
local_zero_padding_HWC_q7(
|
||||
#endif
|
||||
layer->in->tensor->p_data,
|
||||
layer->in->tensor->dim[1], layer->in->tensor->dim[0], layer->in->tensor->dim[2],
|
||||
cl->pad.top,
|
||||
cl->pad.bottom,
|
||||
cl->pad.left,
|
||||
cl->pad.right,
|
||||
layer->out->tensor->p_data,
|
||||
layer->out->tensor->dim[1], layer->out->tensor->dim[0]);
|
||||
|
||||
return NN_SUCCESS;
|
||||
}
|
||||
|
@@ -1,95 +0,0 @@
|
||||

|
||||
|
||||
# ONNX
|
||||
|
||||
**A port of the general neural network model inference framework onnx to TencentOS-tiny**
|
||||
|
||||
[ONNX](https://onnx.ai/) (Open Neural Network Exchange) is a common format for machine learning models that can bring together models from different machine learning frameworks.
|
||||
|
||||
ONNX is a standard for representing deep learning models that lets models be transferred between different frameworks. Models trained with well-known deep learning frameworks such as Tensorflow, Keras, Pytorch, Caffe2 and mxnet can be converted to the onnx format and then run on an RTOS.
|
||||
|
||||
|
||||
## Supported operators
|
||||
|
||||
- Conv2D
|
||||
- Relu
|
||||
- Maxpool
|
||||
- Softmax
|
||||
- Matmul
|
||||
- Add
|
||||
- Flatten
|
||||
- Transpose
|
||||
|
||||
## mnist examples
|
||||
|
||||
There are currently two handwritten-digit recognition examples:
|
||||
|
||||
mnist_int and mnist_float
|
||||
|
||||
They cover integer inference and floating-point inference respectively, chosen according to the platform: mnist_int has been verified on the imx6ull and mnist_float on the stm32L4. The smallest demo needs only 16 KB of RAM, so it also runs on an STM32F103C8T6. mnist_int effectively performs an int32 quantization: the normalized double-precision parameters are multiplied by 1000 and the softmax operator is modified accordingly, which gives faster integer inference (a small sketch of this scaling follows the table below):
|
||||
|
||||
| Example file | Description |
|
||||
| ------------- | ---------------------------------------- |
|
||||
| mnist_int.c   | Integer inference; model parameters are stored in mnist_int.h |
|
||||
| mnist_float.c | Floating-point inference; model parameters are stored in mnist_float.h |
|
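The integer example's scaling can be sketched as follows; the helper name and the rounding choice are ours, purely for illustration:

```
/* Each normalized floating-point parameter is multiplied by 1000,
 * e.g. 0.123456 -> 123 and -0.5 -> -500. */
static int quantize_x1000(double w)
{
    return (int)(w * 1000.0 + (w >= 0 ? 0.5 : -0.5));
}
```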
||||
|
||||
#### Model structure
|
||||
|
||||
```
|
||||
_________________________________________________________________
|
||||
Layer (type) Output Shape Param #
|
||||
=================================================================
|
||||
conv2d_5 (Conv2D) (None, 28, 28, 2) 20
|
||||
_________________________________________________________________
|
||||
max_pooling2d_5 (MaxPooling2 (None, 14, 14, 2) 0
|
||||
_________________________________________________________________
|
||||
dropout_5 (Dropout) (None, 14, 14, 2) 0
|
||||
_________________________________________________________________
|
||||
conv2d_6 (Conv2D) (None, 14, 14, 2) 38
|
||||
_________________________________________________________________
|
||||
max_pooling2d_6 (MaxPooling2 (None, 7, 7, 2) 0
|
||||
_________________________________________________________________
|
||||
dropout_6 (Dropout) (None, 7, 7, 2) 0
|
||||
_________________________________________________________________
|
||||
flatten_3 (Flatten) (None, 98) 0
|
||||
_________________________________________________________________
|
||||
dense_5 (Dense) (None, 4) 396
|
||||
_________________________________________________________________
|
||||
dense_6 (Dense) (None, 10) 50
|
||||
=================================================================
|
||||
Total params: 504
|
||||
Trainable params: 504
|
||||
Non-trainable params: 0
|
||||
_________________________________________________________________
|
||||
|
||||
```
|
||||
Inference test
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||
## Notes
|
||||
|
||||
Because onnx models are stored in the Google Protobuf v3 format, the protobuf folder also contains the model parsing code, which can be used together with a file system to load models.
|
||||
|
||||
- protobuf-c
|
||||
- onnx-pb-c
|
||||
|
||||
The platform folder holds platform-specific adaptations, such as the implementations of malloc and free.
|
||||
|
||||
- tencentos_libc_malloc
|
||||
|
||||
|
||||
## Todo List
|
||||
|
||||
- Parse more complex models
|
||||
- Adapt accelerated operators to different hardware
|
||||
|
||||
## References
|
||||
https://github.com/wuhanstudio/onnx-backend
|
||||
## Contact
|
||||
|
||||
- Maintainer: derek
|
||||
- Email: dkeji627@gmail.com
|
||||
|
@@ -1,191 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "mnist_int.h"
|
||||
#include "onnx.h"
|
||||
|
||||
static const char codeLib[] = "@B%8&WM#*oahkbdpqwmZO0QLCJUYXzcvunxrjft/\\|()1{}[]?-_+~<>i!lI;:,\"^`'. ";
|
||||
int data[5]={1.5 , 2.5 , 3.5 , 4.5 , 5.5};
|
||||
static const int img[2][784] = {IMG0, IMG1};
|
||||
|
||||
static const int img1[784] = {1,2,3,4,5};
|
||||
|
||||
int hello()
|
||||
{
|
||||
printf("hello pnnx\r\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
void print_img(void * buf)
|
||||
{
|
||||
int index = 0;
|
||||
//char ch = '@';
|
||||
int x = 0;
|
||||
int y = 0;
|
||||
|
||||
printf("test2\r\n");
|
||||
|
||||
for(y = 0; y < 28; y++)
|
||||
{
|
||||
for (x = 0; x < 28; x++)
|
||||
{
|
||||
|
||||
index = 0;
|
||||
if(((int*)buf)[y*28+x] > 600)
|
||||
{
|
||||
index =69;
|
||||
}
|
||||
if(index < 0)
|
||||
{
|
||||
index = 0;
|
||||
}
|
||||
|
||||
|
||||
printf("%c",codeLib[index]);
|
||||
printf("%c",codeLib[index]);
|
||||
}
|
||||
printf("\r\n");
|
||||
}
|
||||
}
|
||||
|
||||
int mnist()
|
||||
{
|
||||
printf("test1\r\n");
|
||||
int img_index = 1;
|
||||
print_img(img[img_index]);
|
||||
|
||||
printf("img ok\r\n");
|
||||
|
||||
// 1. Conv2D
|
||||
int64_t shapeW3[] = {2, 1, 3, 3};
|
||||
int64_t dimW3 = 4;
|
||||
int64_t permW3_t[] = { 0, 2, 3, 1};
|
||||
int* W3_t = transpose(W3, shapeW3, dimW3, permW3_t);
|
||||
|
||||
printf("transpose ok\r\n");
|
||||
|
||||
int* conv1 = (int*) malloc(sizeof(int)*28*28*2);
|
||||
memset(conv1, 0, sizeof(sizeof(int)*28*28*2));
|
||||
|
||||
conv2D(img[img_index], 28, 28, 1, W3, 2, 3, 3, 1, 1, 1, 1, B3, conv1, 28, 28);
|
||||
|
||||
free(W3_t);
|
||||
|
||||
printf("Conv2D ok \r\n");
|
||||
|
||||
// 2. Relu
|
||||
int* relu1 = (int*) malloc(sizeof(int)*28*28*2);
|
||||
relu(conv1, 28*28*2, relu1);
|
||||
|
||||
free(conv1);
|
||||
|
||||
printf("Relu ok\r\n");
|
||||
|
||||
// 3. Maxpool
|
||||
int* maxpool1 = (int*) malloc(sizeof(int)*14*14*2);
|
||||
memset(maxpool1, 0, sizeof(sizeof(int)*14*14*2));
|
||||
maxpool(relu1, 28, 28, 2, 2, 2, 0, 0, 2, 2, 14, 14, maxpool1);
|
||||
|
||||
free(relu1);
|
||||
|
||||
printf("Maxpool ok\r\n");
|
||||
|
||||
// 4. Conv2D
|
||||
int64_t shapeW2[] = {2, 2, 3, 3};
|
||||
int64_t dimW2 = 4;
|
||||
int64_t perm_t[] = { 0, 2, 3, 1};
|
||||
int* W2_t = transpose(W2, shapeW2, dimW2, perm_t);
|
||||
|
||||
int* conv2 = (int*) malloc(sizeof(int)*14*14*2);
|
||||
memset(conv2, 0, sizeof(sizeof(int)*14*14*2));
|
||||
conv2D(maxpool1, 14, 14, 2, W2_t, 2, 3, 3, 1, 1, 1, 1, B2, conv2, 14, 14);
|
||||
|
||||
free(W2_t);
|
||||
free(maxpool1);
|
||||
|
||||
printf("Conv2D ok\r\n");
|
||||
|
||||
// 5. Relu
|
||||
int* relu2 = (int*) malloc(sizeof(int)*14*14*2);
|
||||
relu(conv2, 14*14*2, relu2);
|
||||
|
||||
free(conv2);
|
||||
|
||||
printf("Relu ok\r\n");
|
||||
|
||||
// 6. Maxpool
|
||||
int* maxpool2 = (int*) malloc(sizeof(int)*7*7*2);
|
||||
memset(maxpool2, 0, sizeof(sizeof(int)*7*7*2));
|
||||
maxpool(relu2, 14, 14, 2, 2, 2, 0, 0, 2, 2, 7, 7, maxpool2);
|
||||
|
||||
free(relu2);
|
||||
|
||||
printf("Maxpool ok\r\n");
|
||||
|
||||
// Flatten NOT REQUIRED
|
||||
|
||||
// 7. Dense
|
||||
int64_t shapeW1[] = {98, 4};
|
||||
int64_t dimW1 = 2;
|
||||
int64_t permW1_t[] = { 1, 0};
|
||||
int* W1_t = transpose(W1, shapeW1, dimW1, permW1_t);
|
||||
|
||||
int* dense1 = (int*) malloc(sizeof(int)*4);
|
||||
memset(dense1, 0, sizeof(sizeof(int)*4));
|
||||
dense(maxpool2, W1_t, 98, 4, B1, dense1);
|
||||
|
||||
free(W1_t);
|
||||
free(maxpool2);
|
||||
|
||||
printf("Dense ok\r\n");
|
||||
|
||||
// 8. Dense
|
||||
int64_t shapeW[] = {4, 10};
|
||||
int64_t dimW = 2;
|
||||
int64_t permW_t[] = { 1, 0};
|
||||
int* W_t = transpose(W, shapeW, dimW, permW_t);
|
||||
|
||||
int* dense2 = (int*) malloc(sizeof(int)*10);
|
||||
memset(dense2, 0, sizeof(sizeof(int)*10));
|
||||
dense(dense1, W_t, 4, 10, B, dense2);
|
||||
|
||||
free(W_t);
|
||||
free(dense1);
|
||||
|
||||
printf("Dense ok\r\n");
|
||||
|
||||
// 9. Softmax
|
||||
int* output = (int*) malloc(sizeof(int)*10);
|
||||
memset(output, 0, sizeof(sizeof(int)*10));
|
||||
softmax(dense2, 10, output);
|
||||
|
||||
printf("Softmax ok\r\n");
|
||||
|
||||
int max = 0;
|
||||
int min = output[0];
|
||||
int max_index = 0;
|
||||
int min_index = 0;
|
||||
printf("\n\rPredictions: \n\r");
|
||||
for(int i = 0; i < 10; i++)
|
||||
{
|
||||
printf("%d ", output[i]);
|
||||
if(output[i] > max)
|
||||
{
|
||||
max = output[i];
|
||||
max_index = i;
|
||||
}
|
||||
if(output[i] < min)
|
||||
{
|
||||
min = output[i];
|
||||
min_index = i;
|
||||
}
|
||||
}
|
||||
printf("\n\r");
|
||||
printf("\n\rThe number is %d\n\r", min_index);
|
||||
|
||||
free(dense2);
|
||||
free(output);
|
||||
|
||||
printf("Result ok\r\n");
|
||||
|
||||
return 0;
|
||||
}
|
@@ -1,73 +0,0 @@
|
||||
#ifndef __MNIST_INT_H__
|
||||
#define __MNIST_INT_H__
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
#define IMG0 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,380,376,301,462,239,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,352,541,921,921,921,921,921,921,984,984,972,996,960,921,745,82,0,0,0,0,0,0,0,0,0,0,0,549,984,996,996,996,996,996,996,996,996,996,996,996,996,996,996,741,90,0,0,0,0,0,0,0,0,0,0,886,996,815,780,780,780,780,545,239,239,239,239,239,501,870,996,996,741,82,0,0,0,0,0,0,0,0,0,149,321,50,0,0,0,0,0,0,0,0,0,0,0,133,835,996,996,450,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,329,996,996,917,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,329,996,996,917,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,415,615,996,996,952,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,98,458,894,894,894,992,996,996,996,996,941,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,266,466,862,996,996,996,996,996,996,996,996,996,556,0,0,0,0,0,0,0,0,0,0,0,0,0,145,733,992,996,996,996,874,807,807,294,266,843,996,996,458,0,0,0,0,0,0,0,0,0,0,0,0,443,858,996,949,890,450,349,121,0,0,0,0,784,996,945,160,0,0,0,0,0,0,0,0,0,0,0,0,662,996,690,243,0,0,0,0,0,0,0,188,905,996,917,0,0,0,0,0,0,0,0,0,0,0,0,0,70,486,0,0,0,0,0,0,0,0,0,329,996,996,650,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,545,996,933,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,823,980,996,658,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,949,996,937,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,349,984,945,337,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,807,964,615,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,458,270,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
|
||||
#define IMG0_LABEL 7
|
||||
|
||||
|
||||
|
||||
#define IMG1 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,121,517,996,992,996,835,321,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,82,556,913,988,992,988,992,988,874,78,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,482,996,992,996,992,878,796,796,874,1000,835,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,796,992,988,992,831,78,0,0,239,992,988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,160,952,878,796,717,160,596,117,0,0,1000,992,400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,156,78,0,0,400,992,196,0,321,992,988,78,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,321,839,121,443,913,996,913,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,243,400,321,160,992,909,992,988,913,196,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,596,992,996,992,996,992,996,913,482,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,596,988,992,988,992,988,752,196,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,243,717,796,952,996,992,243,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,156,674,988,796,78,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,82,0,0,0,0,0,0,0,0,0,717,996,439,0,0,0,0,0,0,0,0,0,0,0,0,0,0,243,796,639,0,0,0,0,0,0,0,0,239,992,592,0,0,0,0,0,0,0,0,0,0,0,0,0,82,839,752,0,0,0,0,0,0,0,0,43,835,996,592,0,0,0,0,0,0,0,0,0,0,0,0,0,400,992,592,0,0,0,0,0,0,0,160,835,988,992,435,0,0,0,0,0,0,0,0,0,0,0,0,0,160,1000,835,360,200,0,0,121,360,678,992,996,992,556,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,674,988,992,988,796,796,913,988,992,988,992,509,78,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,82,796,1000,992,996,992,996,992,956,796,321,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,78,592,592,992,670,592,592,156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
|
||||
#define IMG1_LABEL 3
|
||||
|
||||
#define TOTAL_IMAGE 2
|
||||
|
||||
|
||||
static const signed char label[] = {IMG0_LABEL, IMG1_LABEL};
|
||||
|
||||
|
||||
|
||||
static const int W3[] = {-323,-426,-651,790,-221,37,398,221,797,254,307,625,-589,203,-64,-1566,-376,-644};
|
||||
static const int B3[] = {-829,-140};
|
||||
|
||||
static const int W2[] = {7,231,36,-146,-155,4,273,-27,234,-635,-556,-770,156,710,239,1820,-18,1574,1723,-596,1399,335,568,379,35,-182,-32,6,-2,-5,293,137,355,2,2,-22};
|
||||
static const int B2[] = {-116,-3};
|
||||
|
||||
static const int W1[] = {157,-226,21,25,8,-775,-415,-125,-396,335,-631,-28,-506,-357,-3780,-826,102,571,-625,66,559,253,-3075,-695,253,317,-866,127,831,266,-2586,-572,297,162,-991,77,891,168,-2524,-563,416,-108,-1022,206,398,-160,-1918,-483,57,-1257,-231,1051,-798,-1626,-260,-76,-464,755,131,247,-1527,163,-75,-58,-338,1305,144,440,-310,154,5,-31,-159,661,83,265,-38,180,-7,54,-14,306,6,223,30,126,-28,111,35,46,-26,264,69,107,-30,95,248,-364,-102,496,40,20,-54,54,-71,-1538,-235,1589,-23,-249,18,80,51,614,157,128,-869,1376,430,134,-149,454,130,231,3,427,233,92,-60,464,103,250,-53,214,116,224,126,234,127,332,14,106,108,305,314,-71,134,454,54,74,97,274,486,-436,-135,572,135,-7,118,244,-375,-468,-564,865,340,-172,40,363,89,-498,476,21,285,617,705,-306,-570,-206,41,230,-179,-23,141,23,-641,-69,-85,164,-534,-101,-131,149,-635,-98,-232,154,-485,-190,-204,106,-529,-173,-362,122,-386,-247,-252,102,-145,-101,43,-171,-31,-301,-94,69,-549,668,145,-737,770,-412,101,52,254,227,-30,-83,-663,512,-121,-334,-75,-98,-16,-31,-435,94,-49,-77,-128,-89,-70,-10,-290,-13,-39,-23,-155,-52,-147,-75,-268,-35,-95,-15,-39,24,-196,-199,-203,-42,-187,-45,-10,148,-117,-418,-206,-24,-157,-55,-90,402,-357,-786,-79,162,-144,-274,268,688,-64,113,599,1294,-1250,608,123,158,-175,34,391,231,-756,200,79,14,-121,8,268,57,-526,124,80,-38,-88,0,286,-10,-393,111,65,-33,-74,-27,300,2,-479,-45,-10,39,-92,-192,154,212,-389,-373,-206,292,-129,-360,-554,457,-352,-947,-1248,887,336,3,-227,1456,549,383,-411,375,176,38,163,705,55,644,-207,146,103,197,174,365,-97,522,-184,-1,88,241,155,172,-105,382,-306,-162,115,307,158,-17,-50,262,-1299,-227,108,744,-398,16,100,-163,-649,-567,17,989,-1395,441,166,-191};
|
||||
static const int B1[] = {1201,-1177,2169,-1961};
|
||||
|
||||
static const int W[] = {558,787,-40,-122,-412,-36,169,-147,-16,-280,18,62,495,339,-475,-140,-882,183,20,-137,-52,679,-280,-312,444,-261,-322,1032,-144,522,57,-965,-305,168,-532,426,-543,14,267,159};
|
||||
static const int B[] = {41,1461,71,-1277,809,-1693,-297,-117,329,659};
|
||||
|
||||
|
||||
|
||||
/*
|
||||
|
||||
#define IMG0 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3803,3764,3019,4627,2392,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3529,5411,9215,9215,9215,9215,9215,9215,9843,9843,9725,9960,9607,9215,7450,823,0,0,0,0,0,0,0,0,0,0,0,5490,9843,9960,9960,9960,9960,9960,9960,9960,9960,9960,9960,9960,9960,9960,9960,7411,901,0,0,0,0,0,0,0,0,0,0,8862,9960,8156,7803,7803,7803,7803,5450,2392,2392,2392,2392,2392,5019,8705,9960,9960,7411,823,0,0,0,0,0,0,0,0,0,1490,3215,509,0,0,0,0,0,0,0,0,0,0,0,1333,8352,9960,9960,4509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3294,9960,9960,9176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3294,9960,9960,9176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4156,6156,9960,9960,9529,2000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,980,4588,8941,8941,8941,9921,9960,9960,9960,9960,9411,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2666,4666,8627,9960,9960,9960,9960,9960,9960,9960,9960,9960,5568,0,0,0,0,0,0,0,0,0,0,0,0,0,1450,7333,9921,9960,9960,9960,8745,8078,8078,2941,2666,8431,9960,9960,4588,0,0,0,0,0,0,0,0,0,0,0,0,4431,8588,9960,9490,8901,4509,3490,1215,0,0,0,0,7843,9960,9450,1607,0,0,0,0,0,0,0,0,0,0,0,0,6627,9960,6901,2431,0,0,0,0,0,0,0,1882,9058,9960,9176,0,0,0,0,0,0,0,0,0,0,0,0,0,705,4862,0,0,0,0,0,0,0,0,0,3294,9960,9960,6509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5450,9960,9333,2235,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8235,9803,9960,6588,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9490,9960,9372,2235,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3490,9843,9450,3372,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,196,8078,9647,6156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,156,4588,2705,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
|
||||
#define IMG0_LABEL 7
|
||||
|
||||
|
||||
|
||||
#define IMG1 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1215,5176,9960,9921,9960,8352,3215,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,823,5568,9137,9882,9921,9882,9921,9882,8745,784,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4823,9960,9921,9960,9921,8784,7960,7960,8745,10000,8352,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7960,9921,9882,9921,8313,784,0,0,2392,9921,9882,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1607,9529,8784,7960,7176,1607,5960,1176,0,0,10000,9921,4000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1568,784,0,0,4000,9921,1960,0,3215,9921,9882,784,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3215,8392,1215,4431,9137,9960,9137,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2431,4000,3215,1607,9921,9098,9921,9882,9137,1960,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5960,9921,9960,9921,9960,9921,9960,9137,4823,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5960,9882,9921,9882,9921,9882,7529,1960,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2431,7176,7960,9529,9960,9921,2431,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1568,6745,9882,7960,784,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,823,0,0,0,0,0,0,0,0,0,7176,9960,4392,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2431,7960,6392,0,0,0,0,0,0,0,0,2392,9921,5921,0,0,0,0,0,0,0,0,0,0,0,0,0,823,8392,7529,0,0,0,0,0,0,0,0,431,8352,9960,5921,0,0,0,0,0,0,0,0,0,0,0,0,0,4000,9921,5921,0,0,0,0,0,0,0,1607,8352,9882,9921,4352,0,0,0,0,0,0,0,0,0,0,0,0,0,1607,10000,8352,3607,2000,0,0,1215,3607,6784,9921,9960,9921,5568,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6745,9882,9921,9882,7960,7960,9137,9882,9921,9882,9921,5098,784,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,823,7960,10000,9921,9960,9921,9960,9921,9568,7960,3215,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,784,5921,5921,9921,6705,5921,5921,1568,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
|
||||
#define IMG1_LABEL 3
|
||||
|
||||
#define TOTAL_IMAGE 2
|
||||
|
||||
|
||||
static const signed char label[] = {IMG0_LABEL, IMG1_LABEL};
|
||||
|
||||
|
||||
|
||||
static const int W3[] = {-3233,-4261,-6519,7906,-2210,371,3984,2212,7975,2549,3076,6250,-5895,2037,-647,-15660,-3767,-6443};
|
||||
static const int B3[] = {-8293,-1409};
|
||||
|
||||
static const int W2[] = {70,2319,368,-1468,-1559,44,2732,-275,2340,-6354,-5564,-7705,1560,7101,2395,18201,-183,15745,17230,-5966,13997,3351,5684,3797,350,-1828,-322,69,-26,-57,2935,1379,3558,22,25,-226};
|
||||
static const int B2[] = {-1165,-36};
|
||||
|
||||
static const int W1[] = {1579,-2264,212,255,87,-7751,-4159,-1258,-3963,3354,-6319,-287,-5066,-3574,-37807,-8261,1022,5711,-6256,669,5596,2537,-30759,-6959,2531,3173,-8664,1275,8313,2666,-25865,-5720,2974,1623,-9915,779,8913,1685,-25247,-5639,4167,-1080,-10229,2062,3988,-1602,-19185,-4837,573,-12573,-2311,10518,-7981,-16263,-2600,-764,-4646,7558,1318,2474,-15276,1636,-754,-585,-3385,13052,1444,4408,-3103,1541,53,-317,-1599,6612,832,2651,-384,1805,-73,541,-142,3065,61,2231,303,1269,-281,1118,353,468,-265,2645,699,1071,-303,952,2480,-3649,-1027,4960,400,209,-547,541,-718,-15381,-2356,15890,-230,-2493,187,804,519,6141,1578,1288,-8691,13761,4305,1347,-1497,4542,1307,2311,36,4274,2339,920,-602,4642,1039,2504,-532,2146,1169,2240,1263,2349,1277,3324,140,1063,1087,3052,3141,-716,1348,4541,546,745,973,2748,4866,-4363,-1358,5724,1359,-74,1185,2448,-3753,-4687,-5648,8657,3407,-1721,406,3630,895,-4989,4768,217,2856,6174,7059,-3063,-5705,-2069,419,2304,-1790,-237,1411,234,-6417,-699,-858,1646,-5346,-1016,-1311,1490,-6350,-989,-2324,1540,-4858,-1904,-2046,1062,-5291,-1735,-3627,1222,-3865,-2478,-2522,1026,-1450,-1011,437,-1715,-313,-3013,-940,698,-5491,6684,1457,-7375,7700,-4125,1011,528,2546,2275,-302,-832,-6638,5122,-1210,-3340,-750,-982,-160,-318,-4358,943,-498,-777,-1282,-896,-701,-107,-2909,-131,-397,-234,-1553,-520,-1477,-755,-2686,-352,-956,-154,-390,242,-1960,-1999,-2030,-426,-1877,-451,-101,1482,-1170,-4180,-2068,-240,-1578,-556,-903,4025,-3574,-7861,-799,1620,-1446,-2749,2683,6881,-641,1136,5998,12947,-12500,6082,1234,1580,-1750,342,3910,2319,-7568,2004,791,142,-1213,85,2689,570,-5261,1248,806,-385,-889,7,2863,-108,-3930,1114,656,-337,-745,-273,3002,29,-4795,-452,-102,393,-923,-1924,1540,2123,-3898,-3738,-2064,2920,-1299,-3604,-5544,4572,-3526,-9479,-12481,8870,3362,35,-2276,14563,5495,3839,-4119,3758,1768,381,1635,7051,550,6445,-2072,1461,1031,1971,1742,3657,-978,5229,-1845,-13,886,2418,1554,1722,-1053,3821,-3065,-1629,1154,3075,1586,-177,-502,2623,-12994,-2270,1085,7447,-3980,168,1006,-1635,-6495,-5674,179,9896,-13958,4412,1664,-1919};
|
||||
static const int B1[] = {12019,-11770,21698,-19615};
|
||||
|
||||
static const int W[] = {5580,7870,-409,-1225,-4126,-360,1691,-1471,-164,-2805,187,629,4956,3393,-4754,-1405,-8827,1835,208,-1378,-522,6792,-2802,-3127,4441,-2610,-3221,10321,-1444,5221,575,-9654,-3051,1685,-5320,4268,-5434,146,2679,1592};
|
||||
static const int B[] = {414,14614,715,-12774,8092,-16933,-2974,-1177,3292,6596};
|
||||
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
int mnist(void);
|
||||
|
||||
#endif //__MNIST_INT_H__
|
Binary file not shown.
Binary file not shown.
@@ -1,35 +0,0 @@
|
||||
#include "onnx.h"
|
||||
|
||||
void add(const int *input, // pointer to vector
|
||||
const int *bias, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
int *output)
|
||||
{
|
||||
for (int i = 0; i < dim_vec; i++)
|
||||
{
|
||||
output[i] = input[i] + bias[i];
|
||||
}
|
||||
}
|
||||
|
||||
int* add_layer(Onnx__GraphProto* graph, const int *input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name)
|
||||
{
|
||||
//assert(graph != NULL && input != NULL && layer_name != "" );
|
||||
|
||||
Onnx__NodeProto* node = onnx_graph_get_node_by_name(graph, layer_name);
|
||||
const char* bias = node->input[1];
|
||||
|
||||
int* B = onnx_graph_get_weights_by_name(graph, bias);
|
||||
int64_t* shapeB = onnx_graph_get_dims_by_name(graph, bias);
|
||||
if(shapeB == NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int* output = (int*) malloc(sizeof(int)*shapeB[0]);
|
||||
memset(output, 0, sizeof(sizeof(int)*shapeB[0]));
|
||||
add(input, B, shapeB[0], output);
|
||||
|
||||
memcpy(shapeInput, shapeOutput, sizeof(int64_t)*3);
|
||||
|
||||
return output;
|
||||
}
|
@@ -1,113 +0,0 @@
|
||||
#include "onnx.h"
|
||||
|
||||
void conv2D(const int *input, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimention x
|
||||
const uint16_t dim_im_in_y, // input image dimention y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const int *weight, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const int *bias, // bias
|
||||
int *output, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y // output image dimension y
|
||||
)
|
||||
{
|
||||
int i, j, k, l, m, n;
|
||||
int conv_out = 0.0f;
|
||||
int in_row, in_col;
|
||||
|
||||
// For each filter
|
||||
for (i = 0; i < ch_im_out; i++)
|
||||
{
|
||||
// For each image dimension
|
||||
for (j = 0; j < dim_im_out_y; j++)
|
||||
{
|
||||
for (k = 0; k < dim_im_out_x; k++)
|
||||
{
|
||||
conv_out = bias[i];
|
||||
// For each kernel dimension
|
||||
for (m = 0; m < dim_kernel_y; m++)
|
||||
{
|
||||
for (n = 0; n < dim_kernel_x; n++)
|
||||
{
|
||||
// if-for implementation
|
||||
in_row = stride_y * j + m - padding_y;
|
||||
in_col = stride_x * k + n - padding_x;
|
||||
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
|
||||
{
|
||||
// For each input channel
|
||||
for (l = 0; l < ch_im_in; l++)
|
||||
{
|
||||
conv_out += input[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
|
||||
weight[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in +
|
||||
l];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
output[i + (j * dim_im_out_x + k) * ch_im_out] = conv_out;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int* conv2D_layer(Onnx__GraphProto* graph, const int *input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name)
|
||||
{
|
||||
//assert(graph != NULL && input != NULL && layer_name != "" );
|
||||
|
||||
Onnx__NodeProto* node = onnx_graph_get_node_by_name(graph, layer_name);
|
||||
if(node == NULL)
|
||||
{
|
||||
// layer not found
|
||||
return NULL;
|
||||
}
|
||||
const char* weight = node->input[1];
|
||||
const char* bias = node->input[2];
|
||||
|
||||
// Get weight shape
|
||||
int64_t* shapeW = onnx_graph_get_dims_by_name(graph, weight);
|
||||
if(shapeW == NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
int64_t dimW = onnx_graph_get_dim_by_name(graph, weight);
|
||||
if(dimW < 0)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Get weights
|
||||
// NCWH --> NWHC
|
||||
int64_t permW_t[] = { 0, 2, 3, 1};
|
||||
int* W = onnx_graph_get_weights_by_name(graph, weight);
|
||||
if(W == NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
int* W_t = transpose(W, shapeW, dimW, permW_t);
|
||||
|
||||
// Get bias
|
||||
int* B = onnx_graph_get_weights_by_name(graph, bias);
|
||||
if(B == NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int* output = (int*) malloc(sizeof(int)*shapeW[0]*shapeInput[W_INDEX]*shapeInput[H_INDEX]);
|
||||
memset(output, 0, sizeof(sizeof(int)*shapeW[0]*shapeInput[W_INDEX]*shapeInput[H_INDEX]));
|
||||
conv2D(input, shapeInput[W_INDEX], shapeInput[H_INDEX], shapeW[1], W_t, shapeW[0], shapeW[2], shapeW[3], 1, 1, 1, 1, B, output, shapeInput[W_INDEX], shapeInput[H_INDEX]);
|
||||
|
||||
shapeOutput[W_INDEX] = shapeInput[W_INDEX];
|
||||
shapeOutput[H_INDEX] = shapeInput[H_INDEX];
|
||||
shapeOutput[C_INDEX] = shapeW[0];
|
||||
|
||||
free(W_t);
|
||||
|
||||
return output;
|
||||
}
|
@@ -1,19 +0,0 @@
|
||||
#include "onnx.h"
|
||||
|
||||
void dense(const int *input, // pointer to vector
|
||||
const int *weight, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
const uint16_t num_of_rows, // numCol of A
|
||||
const int *bias,
|
||||
int *output) // output operand
|
||||
{
|
||||
for (int i = 0; i < num_of_rows; i++)
|
||||
{
|
||||
int ip_out = bias[i];
|
||||
for (int j = 0; j < dim_vec; j++)
|
||||
{
|
||||
ip_out += input[j] * weight[i * dim_vec + j];
|
||||
}
|
||||
output[i] = ip_out;
|
||||
}
|
||||
}
|
@@ -1,25 +0,0 @@
|
||||
#include "onnx.h"
|
||||
|
||||
void onnx_tensor_info(const int* A, int64_t* shape, int64_t dim)
|
||||
{
|
||||
int elem = 1;
|
||||
for(int i = 0; i < dim; i++)
|
||||
{
|
||||
elem = elem * shape[i];
|
||||
}
|
||||
|
||||
printf("Array size: %d\n", elem);
|
||||
for(int i = 0; i < elem; i++)
|
||||
{
|
||||
printf( "%f ", A[i] );
|
||||
int split = 1;
|
||||
for(int j = dim-1; j > 0; j--)
|
||||
{
|
||||
split = split * shape[j];
|
||||
if( (i+1) % split == 0)
|
||||
{
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,63 +0,0 @@
|
||||
#include "onnx.h"
|
||||
|
||||
void matmul(const int *input, // pointer to vector
|
||||
const int *weight, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
const uint16_t num_of_rows, // numCol of A
|
||||
int *output)
|
||||
{
|
||||
for (int i = 0; i < num_of_rows; i++)
|
||||
{
|
||||
int ip_out = 0;
|
||||
for (int j = 0; j < dim_vec; j++)
|
||||
{
|
||||
ip_out += input[j] * weight[i * dim_vec + j];
|
||||
}
|
||||
output[i] = ip_out;
|
||||
}
|
||||
}
|
||||
|
||||
int* matmul_layer(Onnx__GraphProto* graph, const int *input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name)
|
||||
{
|
||||
//assert(graph != NULL && input != NULL && layer_name != "" );
|
||||
|
||||
Onnx__NodeProto* node = onnx_graph_get_node_by_name(graph, layer_name);
|
||||
const char* weight = node->input[1];
|
||||
|
||||
int64_t* shapeW = onnx_graph_get_dims_by_name(graph, weight);
|
||||
if(shapeW == NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
int64_t dimW = onnx_graph_get_dim_by_name(graph, weight);
|
||||
if(dimW < 0)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//assert(shapeW[0] == shapeInput[1]);
|
||||
|
||||
int64_t permW_t[] = {1, 0};
|
||||
int* W = onnx_graph_get_weights_by_name(graph, weight);
|
||||
if(W == NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
int* W_t = transpose(W, shapeW, dimW, permW_t);
|
||||
|
||||
int* output = (int*) malloc(sizeof(int)*shapeW[1]);
|
||||
if(output == NULL)
|
||||
{
|
||||
// No memory
|
||||
return NULL;
|
||||
}
|
||||
memset(output, 0, sizeof(sizeof(int)*shapeW[1]));
|
||||
matmul(input, W_t, shapeW[0], shapeW[1], output);
|
||||
|
||||
shapeOutput[0] = shapeInput[0];
|
||||
shapeOutput[1] = shapeW[1];
|
||||
|
||||
free(W_t);
|
||||
|
||||
return output;
|
||||
}
|
@@ -1,96 +0,0 @@
|
||||
#include "onnx.h"
|
||||
|
||||
void maxpool(const int *input,
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
int *output)
|
||||
{
|
||||
int16_t i_ch_in, i_x, i_y;
|
||||
int16_t k_x, k_y;
|
||||
|
||||
for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
|
||||
{
|
||||
for (i_y = 0; i_y < dim_im_out_y; i_y++)
|
||||
{
|
||||
for (i_x = 0; i_x < dim_im_out_x; i_x++)
|
||||
{
|
||||
//int max = FLT_MIN;
|
||||
int max = 0;
|
||||
for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
|
||||
{
|
||||
for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
|
||||
{
|
||||
if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
|
||||
{
|
||||
if (input[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max)
|
||||
{
|
||||
max = input[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
output[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int* maxpool_layer(Onnx__GraphProto* graph, int* input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name)
|
||||
{
|
||||
//assert(graph != NULL && input != NULL && layer_name != "" );
|
||||
|
||||
Onnx__NodeProto* node = onnx_graph_get_node_by_name(graph, layer_name);
|
||||
if(node == NULL)
|
||||
{
|
||||
// layer not found
|
||||
return NULL;
|
||||
}
|
||||
|
||||
uint16_t kernel_x = 1;
|
||||
uint16_t kernel_y = 1;
|
||||
uint16_t padding_x = 0;
|
||||
uint16_t padding_y = 0;
|
||||
uint16_t stride_x = 1;
|
||||
uint16_t stride_y = 1;
|
||||
|
||||
for(int i = 0; i < node->n_attribute; i++)
|
||||
{
|
||||
if( strcmp(node->attribute[i]->name, "kernel_shape") == 0 )
|
||||
{
|
||||
kernel_x = node->attribute[i]->ints[0];
|
||||
kernel_y = node->attribute[i]->ints[1];
|
||||
}
|
||||
if( strcmp(node->attribute[i]->name, "strides") == 0 )
|
||||
{
|
||||
stride_x = node->attribute[i]->ints[0];
|
||||
stride_y = node->attribute[i]->ints[1];
|
||||
}
|
||||
}
|
||||
|
||||
uint16_t out_x = (shapeInput[W_INDEX] - kernel_x + 2 * padding_x) / stride_x + 1;
|
||||
uint16_t out_y = (shapeInput[H_INDEX] - kernel_y + 2 * padding_y) / stride_y + 1;
|
||||
|
||||
int* output = (int*) malloc(sizeof(int)*out_x*out_y*shapeInput[C_INDEX]);
|
||||
if(output == NULL)
|
||||
{
|
||||
// No memory
|
||||
return NULL;
|
||||
}
|
||||
memset(output, 0, sizeof(int)*out_x*out_y*shapeInput[C_INDEX]);
|
||||
maxpool(input, shapeInput[W_INDEX], shapeInput[H_INDEX], shapeInput[C_INDEX], kernel_x, kernel_y, padding_x, padding_y, stride_x, stride_y, out_x, out_y, output);
|
||||
|
||||
shapeOutput[W_INDEX] = out_x;
|
||||
shapeOutput[H_INDEX] = out_y;
|
||||
shapeOutput[C_INDEX] = shapeInput[C_INDEX];
|
||||
|
||||
return output;
|
||||
}
|
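The output size above follows the usual pooling formula out = (in - kernel + 2 * padding) / stride + 1. As an illustrative check (the numbers are hypothetical, not taken from the example model): a 28 x 28 input pooled with a 2 x 2 kernel, stride 2 and no padding gives (28 - 2 + 0) / 2 + 1 = 14, i.e. a 14 x 14 output per channel.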
@@ -1,84 +0,0 @@
|
||||
#include <inttypes.h>
|
||||
#include "onnx.h"
|
||||
|
||||
int* onnx_model_run(Onnx__ModelProto* model, int* input, int64_t* shapeInput)
|
||||
{
|
||||
int64_t* shapeOutput = (int64_t*) malloc(sizeof(int64_t)*3);
|
||||
shapeOutput[0] = -1; shapeOutput[1] = -1; shapeOutput[2] = -1;
|
||||
|
||||
Onnx__NodeProto* node = onnx_graph_get_node_by_input(model->graph, model->graph->input[0]->name);
|
||||
|
||||
int i = 0;
|
||||
int* output = NULL;
|
||||
while(node != NULL)
|
||||
{
|
||||
printf("[%2d] %-10s %-20s ", i++, node->op_type, node->name);
|
||||
if(strcmp(node->op_type, "Conv") == 0)
|
||||
{
|
||||
output = conv2D_layer(model->graph, input, shapeInput, shapeOutput, node->name);
|
||||
}
|
||||
else if(strcmp(node->op_type, "Relu") == 0)
|
||||
{
|
||||
output = relu_layer(model->graph, input, shapeInput, shapeOutput, node->name);
|
||||
}
|
||||
else if(strcmp(node->op_type, "MaxPool") == 0)
|
||||
{
|
||||
output = maxpool_layer(model->graph, input, shapeInput, shapeOutput, node->name);
|
||||
}
|
||||
else if(strcmp(node->op_type, "Softmax") == 0)
|
||||
{
|
||||
output = softmax_layer(model->graph, input, shapeInput, shapeOutput, node->name);
|
||||
}
|
||||
else if(strcmp(node->op_type, "MatMul") == 0)
|
||||
{
|
||||
output = matmul_layer(model->graph, input, shapeInput, shapeOutput, node->name);
|
||||
}
|
||||
else if(strcmp(node->op_type, "Add") == 0)
|
||||
{
|
||||
output = add_layer(model->graph, input, shapeInput, shapeOutput, node->name);
|
||||
}
|
||||
else if(strcmp(node->op_type, "Identity") == 0)
|
||||
{
|
||||
node = onnx_graph_get_node_by_input(model->graph, node->output[0]);
|
||||
printf("\n");
|
||||
|
||||
continue;
|
||||
}
|
||||
else if(strcmp(node->op_type, "Transpose") == 0)
|
||||
{
|
||||
node = onnx_graph_get_node_by_input(model->graph, node->output[0]);
|
||||
printf("\n");
|
||||
|
||||
continue;
|
||||
}
|
||||
else if(strcmp(node->op_type, "Reshape") == 0)
|
||||
{
|
||||
shapeOutput[1] = shapeOutput[0] * shapeOutput[1] * shapeOutput[2];
|
||||
shapeOutput[2] = 1;
|
||||
shapeOutput[0] = 1;
|
||||
printf("[%2" PRId64 ", %2" PRId64 ", %2" PRId64 "] --> [%2" PRId64 ", %2" PRId64 ", %2" PRId64 "]\n", shapeInput[0], shapeInput[1], shapeInput[2], shapeOutput[0], shapeOutput[1], shapeOutput[2]);
|
||||
|
||||
// free(input);
|
||||
// input = output;
|
||||
memcpy(shapeInput, shapeOutput, sizeof(int64_t)*3);
|
||||
|
||||
node = onnx_graph_get_node_by_input(model->graph, node->output[0]);
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("Unsupported operand: %s\n", node->op_type);
|
||||
}
|
||||
printf("[%2" PRId64 ", %2" PRId64 ", %2" PRId64 "] --> [%2" PRId64 ", %2" PRId64 ", %2" PRId64 "]\n", shapeInput[0], shapeInput[1], shapeInput[2], shapeOutput[0], shapeOutput[1], shapeOutput[2]);
|
||||
|
||||
free(input);
|
||||
input = output;
|
||||
memcpy(shapeInput, shapeOutput, sizeof(int64_t)*3);
|
||||
|
||||
node = onnx_graph_get_node_by_input(model->graph, node->output[0]);
|
||||
}
|
||||
output = input;
|
||||
free(shapeOutput);
|
||||
|
||||
return output;
|
||||
}
|
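A minimal usage sketch of this execution loop follows. The model file name, input size and NWHC shape are illustrative assumptions, and it presumes an onnx_load_model() implementation is linked in (the file-based one in onnx-parser.c is commented out in this pack):

#include <stdlib.h>
#include "onnx.h"

int main(void)
{
    // Hypothetical model and input: a 28x28 single-channel image, NWHC order.
    Onnx__ModelProto* model = onnx_load_model("mnist.onnx");
    if (model == NULL)
    {
        return -1;
    }

    int64_t shape[3] = {28, 28, 1};                        // W, H, C
    int* input = (int*) malloc(sizeof(int) * 28 * 28 * 1);
    if (input == NULL)
    {
        return -1;
    }
    // ... fill `input` with fixed-point pixel data here ...

    // onnx_model_run() frees the intermediate buffers (including `input`)
    // as it walks the graph and returns the final layer's output buffer.
    int* output = onnx_model_run(model, input, shape);

    // `shape` now holds the output shape.
    free(output);
    onnx__model_proto__free_unpacked(model, NULL);
    return 0;
}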
@@ -1,284 +0,0 @@
|
||||
#include "onnx-parser.h"
|
||||
|
||||
const char* onnx_tensor_proto_data_type[] = {
|
||||
"Undefined",
|
||||
"FLOAT",
|
||||
"UINT8",
|
||||
"INT8",
|
||||
"UINT16",
|
||||
"INT16",
|
||||
"INT32",
|
||||
"INT64",
|
||||
"STRING",
|
||||
"BOOL",
|
||||
"FLOAT16",
|
||||
"DOUBLE",
|
||||
"UINT32",
|
||||
"UINT64",
|
||||
"COMPLEX64",
|
||||
"COMPLEX128"
|
||||
};
|
||||
/*
|
||||
Onnx__ModelProto* onnx_load_model(const char* onnx_file_name)
|
||||
{
|
||||
unsigned char* buffer;
|
||||
FILE *fp;
|
||||
|
||||
// Get File Size
|
||||
fp = fopen(onnx_file_name,"rb");
|
||||
fseek(fp, 0L, SEEK_END);
|
||||
int sz = ftell(fp);
|
||||
fseek(fp, 0L, SEEK_SET);
|
||||
// printf("File size %s is %d\n", onnx_file_name, sz);
|
||||
|
||||
// Read File
|
||||
buffer = (unsigned char*) malloc(sizeof(unsigned char) * sz);
|
||||
if(buffer == NULL)
|
||||
{
|
||||
printf("Failed to malloc %d bytes memory for %s\n", sz, onnx_file_name);
|
||||
return NULL;
|
||||
}
|
||||
fread(buffer, sz, 1, fp);
|
||||
|
||||
Onnx__ModelProto* model = onnx__model_proto__unpack(NULL, sz, buffer);
|
||||
free(buffer);
|
||||
fclose(fp);
|
||||
|
||||
return model;
|
||||
}
|
||||
*/
|
||||
void onnx_model_info(Onnx__ModelProto* model)
|
||||
{
|
||||
printf("---- Model info ----\n");
|
||||
printf("IR Version is %ld\n", model->ir_version);
|
||||
printf("Produceer name is %s\n", model->producer_name);
|
||||
printf("Produceer version is %s\n", model->producer_version);
|
||||
printf("Produceer version is %s\n", model->domain);
|
||||
}
|
||||
|
||||
void onnx_graph_info(Onnx__GraphProto* graph)
|
||||
{
|
||||
printf("---- Graph Info ----\n");
|
||||
|
||||
// Input
|
||||
printf("---- Graph Input Info ----\n");
|
||||
printf("Graph inputs number: %ld\n", graph->n_input);
|
||||
for(int i = 0; i < graph->n_input; i++)
|
||||
{
|
||||
onnx_graph_input_info(graph->input[i]);
|
||||
}
|
||||
|
||||
// Output
|
||||
printf("---- Graph Output Info ----\n");
|
||||
printf("Graph outputs number: %ld\n", graph->n_output);
|
||||
for(int i = 0; i < graph->n_output; i++)
|
||||
{
|
||||
onnx_graph_output_info(graph->output[i]);
|
||||
}
|
||||
|
||||
// Nodes
|
||||
printf("---- Graph Node Info ----\n");
|
||||
printf("Graph nodes number: %ld\n", graph->n_node);
|
||||
for(int i = 0; i < graph->n_node; i++)
|
||||
{
|
||||
onnx_graph_node_info(graph->node[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void onnx_graph_info_sorted(Onnx__GraphProto* graph)
|
||||
{
|
||||
printf("---- Graph Info ----\n");
|
||||
|
||||
// Input
|
||||
printf("---- Graph Input Info ----\n");
|
||||
printf("Graph inputs number: %ld\n", graph->n_input);
|
||||
for(int i = 0; i < graph->n_input; i++)
|
||||
{
|
||||
onnx_graph_input_info(graph->input[i]);
|
||||
}
|
||||
|
||||
// Output
|
||||
printf("---- Graph Output Info ----\n");
|
||||
printf("Graph outputs number: %ld\n", graph->n_output);
|
||||
for(int i = 0; i < graph->n_output; i++)
|
||||
{
|
||||
onnx_graph_output_info(graph->output[i]);
|
||||
}
|
||||
|
||||
// Nodes
|
||||
printf("---- Graph Node Info ----\n");
|
||||
printf("Graph nodes number: %ld\n", graph->n_node);
|
||||
Onnx__NodeProto* node = onnx_graph_get_node_by_input(graph, graph->input[0]->name);
|
||||
|
||||
while(node != NULL)
|
||||
{
|
||||
onnx_graph_node_info(node);
|
||||
node = onnx_graph_get_node_by_input(graph, node->output[0]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void onnx_graph_input_info(Onnx__ValueInfoProto* input)
|
||||
{
|
||||
printf("Input name %s\n", input->name);
|
||||
|
||||
Onnx__TypeProto* type = input->type;
|
||||
Onnx__TypeProto__Tensor* tensor_type = type->tensor_type;
|
||||
Onnx__TensorShapeProto* shape = tensor_type->shape;
|
||||
|
||||
printf("Input type %s\n", onnx_tensor_proto_data_type[tensor_type->elem_type]);
|
||||
printf("Input dimension %ld\n", shape->n_dim);
|
||||
|
||||
for(int i = 0; i < shape->n_dim; i++)
|
||||
{
|
||||
onnx_graph_value_tensor_shape_dimension_info(shape->dim[i]);
|
||||
if( i != shape->n_dim - 1)
|
||||
{
|
||||
printf(" x ");
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void onnx_graph_value_tensor_shape_dimension_info(Onnx__TensorShapeProto__Dimension* dim)
|
||||
{
|
||||
|
||||
switch (dim->value_case)
|
||||
{
|
||||
case ONNX__TENSOR_SHAPE_PROTO__DIMENSION__VALUE__NOT_SET:
|
||||
printf("?");
|
||||
break;
|
||||
case ONNX__TENSOR_SHAPE_PROTO__DIMENSION__VALUE_DIM_VALUE:
|
||||
printf("%ld",dim->dim_value);
|
||||
break;
|
||||
case ONNX__TENSOR_SHAPE_PROTO__DIMENSION__VALUE_DIM_PARAM:
|
||||
printf("%s",dim->dim_param);
|
||||
break;
|
||||
default:
|
||||
printf("?");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void onnx_graph_output_info(Onnx__ValueInfoProto* output)
|
||||
{
|
||||
printf("Output name %s\n", output->name);
|
||||
|
||||
Onnx__TypeProto* type = output->type;
|
||||
Onnx__TypeProto__Tensor* tensor_type = type->tensor_type;
|
||||
Onnx__TensorShapeProto* shape = tensor_type->shape;
|
||||
|
||||
printf("Output type %s\n", onnx_tensor_proto_data_type[tensor_type->elem_type]);
|
||||
printf("Output dimension %ld\n", shape->n_dim);
|
||||
|
||||
for(int i = 0; i < shape->n_dim; i++)
|
||||
{
|
||||
onnx_graph_value_tensor_shape_dimension_info(shape->dim[i]);
|
||||
if( i != shape->n_dim - 1)
|
||||
{
|
||||
printf(" x ");
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void onnx_graph_initializer_info(Onnx__TensorProto* initializer)
|
||||
{
|
||||
printf("%s: [", initializer->name);
|
||||
for(int i = 0; i < initializer->n_dims; i++)
|
||||
{
|
||||
printf("%ld, ", initializer->dims[i]);
|
||||
}
|
||||
printf("]\n");
|
||||
|
||||
printf("%s: [", initializer->name);
|
||||
for(int i = 0; i < initializer->n_float_data; i++)
|
||||
{
|
||||
printf("%f, ", initializer->float_data[i]);
|
||||
}
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
Onnx__NodeProto* onnx_graph_get_node_by_name(Onnx__GraphProto* graph, const char* node_name)
|
||||
{
|
||||
for(int i = 0; i < graph->n_node; i++)
|
||||
{
|
||||
Onnx__NodeProto* node = graph->node[i];
|
||||
if( strcmp(node->name, node_name) == 0)
{
return node;
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Onnx__NodeProto* onnx_graph_get_node_by_input(Onnx__GraphProto* graph, const char* input_name)
|
||||
{
|
||||
for(int i = 0; i < graph->n_node; i++)
|
||||
{
|
||||
Onnx__NodeProto* node = graph->node[i];
|
||||
for(int j = 0; j < node->n_input; j++)
|
||||
{
|
||||
if( strcmp(node->input[j], input_name) == 0)
|
||||
{
|
||||
return node;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int* onnx_graph_get_weights_by_name(Onnx__GraphProto* graph, const char* node_name)
|
||||
{
|
||||
Onnx__TensorProto** initializer = graph->initializer;
|
||||
|
||||
for(int i = 0; i < graph->n_initializer; i++)
|
||||
{
|
||||
if( strcmp(graph->initializer[i]->name, node_name) == 0)
|
||||
{
|
||||
return graph->initializer[i]->float_data;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
long* onnx_graph_get_dims_by_name(Onnx__GraphProto* graph, const char* node_name)
|
||||
{
|
||||
Onnx__TensorProto** initializer = graph->initializer;
|
||||
|
||||
for(int i = 0; i < graph->n_initializer; i++)
|
||||
{
|
||||
if( strcmp(graph->initializer[i]->name, node_name) == 0)
|
||||
{
|
||||
return graph->initializer[i]->dims;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
long onnx_graph_get_dim_by_name(Onnx__GraphProto* graph, const char* node_name)
|
||||
{
|
||||
Onnx__TensorProto** initializer = graph->initializer;
|
||||
|
||||
for(int i = 0; i < graph->n_initializer; i++)
|
||||
{
|
||||
if( strcmp(graph->initializer[i]->name, node_name) == 0)
|
||||
{
|
||||
return graph->initializer[i]->n_dims;
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
void onnx_graph_node_info(Onnx__NodeProto* node)
|
||||
{
|
||||
printf("%-12s: %-30s -> %-30s [%s]\n", node->op_type, node->input[0], node->output[0], node->name);
|
||||
}
|
@@ -1,27 +0,0 @@
|
||||
#ifndef __ONNX_PARSER_H__
|
||||
#define __ONNX_PARSER_H__
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "onnx.pb-c.h"
|
||||
|
||||
Onnx__ModelProto* onnx_load_model(const char* onnx_file_name);
|
||||
void onnx_model_info(Onnx__ModelProto* model);
|
||||
void onnx_graph_info(Onnx__GraphProto* graph);
|
||||
void onnx_graph_info_sorted(Onnx__GraphProto* graph);
|
||||
void onnx_graph_input_info(Onnx__ValueInfoProto* input);
|
||||
void onnx_graph_output_info(Onnx__ValueInfoProto* output);
|
||||
void onnx_graph_node_info(Onnx__NodeProto* node);
|
||||
void onnx_graph_initializer_info(Onnx__TensorProto* initializer);
|
||||
|
||||
Onnx__NodeProto* onnx_graph_get_node_by_name(Onnx__GraphProto* graph, const char* node_name);
|
||||
Onnx__NodeProto* onnx_graph_get_node_by_input(Onnx__GraphProto* graph, const char* input_name);
|
||||
long* onnx_graph_get_dims_by_name(Onnx__GraphProto* graph, const char* node_name);
|
||||
long onnx_graph_get_dim_by_name(Onnx__GraphProto* graph, const char* node_name);
|
||||
int* onnx_graph_get_weights_by_name(Onnx__GraphProto* graph, const char* node_name);
|
||||
|
||||
void onnx_graph_value_tensor_shape_dimension_info(Onnx__TensorShapeProto__Dimension* dim);
|
||||
|
||||
#endif //__ONNX_PARSER_H__
|
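As a hedged illustration of the lookup helpers declared above (the initializer name "W1" and the surrounding function are hypothetical, purely for demonstration):

#include "onnx-parser.h"

// Print the shape of one named initializer; "W1" is an illustrative name.
void dump_initializer_shape(Onnx__GraphProto* graph)
{
    int*  weights = onnx_graph_get_weights_by_name(graph, "W1");
    long* dims    = onnx_graph_get_dims_by_name(graph, "W1");
    long  n_dims  = onnx_graph_get_dim_by_name(graph, "W1");

    if (weights == NULL || dims == NULL || n_dims <= 0)
    {
        return;
    }
    for (long i = 0; i < n_dims; i++)
    {
        printf("dim[%ld] = %ld\n", i, dims[i]);
    }
}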
@@ -1,95 +0,0 @@
|
||||
#ifndef __ONNX_H__
|
||||
#define __ONNX_H__
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
//#include <int.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <onnx-parser.h>
|
||||
|
||||
#define ONNX_USE_NWHC
|
||||
|
||||
#ifdef ONNX_USE_NWHC
|
||||
// NWHC
|
||||
#define W_INDEX 0
|
||||
#define H_INDEX 1
|
||||
#define C_INDEX 2
|
||||
#else
|
||||
// NCWH
|
||||
#define C_INDEX 0
|
||||
#define W_INDEX 1
|
||||
#define H_INDEX 2
|
||||
#endif
|
||||
|
||||
// Model
|
||||
void onnx_tensor_info(const int* A, int64_t* shape, int64_t dim);
|
||||
int* onnx_model_run(Onnx__ModelProto* model, int* input, int64_t* shapeInput);
|
||||
|
||||
// Layers
|
||||
int* conv2D_layer(Onnx__GraphProto* graph, const int *input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name);
|
||||
int* relu_layer(Onnx__GraphProto* graph, const int *input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name);
|
||||
int* maxpool_layer(Onnx__GraphProto* graph, int* input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name);
|
||||
int* matmul_layer(Onnx__GraphProto* graph, const int *input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name);
|
||||
int* add_layer(Onnx__GraphProto* graph, const int *input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name);
|
||||
int* softmax_layer(Onnx__GraphProto* graph, const int *input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name);
|
||||
|
||||
// Operators
|
||||
int* transpose(const int* A, int64_t* shape, int64_t dim, int64_t* perm);
|
||||
|
||||
void conv2D(const int *input, // input image
|
||||
const uint16_t dim_im_in_x, // input image dimension x
|
||||
const uint16_t dim_im_in_y, // input image dimension y
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const int *weight, // kernel weights
|
||||
const uint16_t ch_im_out, // number of filters, i.e., output image channels
|
||||
const uint16_t dim_kernel_x, // filter kernel size x
|
||||
const uint16_t dim_kernel_y, // filter kernel size y
|
||||
const uint16_t padding_x, // padding sizes x
|
||||
const uint16_t padding_y, // padding sizes y
|
||||
const uint16_t stride_x, // stride x
|
||||
const uint16_t stride_y, // stride y
|
||||
const int *bias, // bias
|
||||
int *output, // output image
|
||||
const uint16_t dim_im_out_x, // output image dimension x
|
||||
const uint16_t dim_im_out_y // output image dimension y
|
||||
);
|
||||
|
||||
void relu(const int *input, uint32_t size, int* output);
|
||||
|
||||
void maxpool(const int *input,
|
||||
const uint16_t dim_im_in_x, // input image dimension x or W
|
||||
const uint16_t dim_im_in_y, // input image dimension y or H
|
||||
const uint16_t ch_im_in, // number of input image channels
|
||||
const uint16_t dim_kernel_x, // window kernel size
|
||||
const uint16_t dim_kernel_y, // window kernel size
|
||||
const uint16_t padding_x, // padding sizes
|
||||
const uint16_t padding_y, // padding sizes
|
||||
const uint16_t stride_x, // stride
|
||||
const uint16_t stride_y, // stride
|
||||
const uint16_t dim_im_out_x, // output image dimension x or W
|
||||
const uint16_t dim_im_out_y, // output image dimension y or H
|
||||
int *output);
|
||||
|
||||
void matmul(const int *input, // pointer to vector
|
||||
const int *weight, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
const uint16_t num_of_rows, // numCol of A
|
||||
int *output);
|
||||
|
||||
void add(const int *input, // pointer to vector
|
||||
const int *bias, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
int *output);
|
||||
|
||||
void dense(const int *input, // pointer to vector
|
||||
const int *weight, // pointer to matrix
|
||||
const uint16_t dim_vec, // length of the vector
|
||||
const uint16_t num_of_rows, // numCol of A
|
||||
const int *bias,
|
||||
int *output);
|
||||
|
||||
void softmax(const int *input, const uint32_t dim_vec, int *output);
|
||||
|
||||
#endif // __ONNX_H__
|
@@ -1,27 +0,0 @@
|
||||
#include "onnx.h"
|
||||
|
||||
void relu(const int *input, uint32_t size, int* output)
|
||||
{
|
||||
uint32_t i;
|
||||
memcpy(output, input, sizeof(int) * size);
|
||||
for (i = 0; i < size; i++)
|
||||
{
|
||||
if (output[i] < 0)
|
||||
output[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
int* relu_layer(Onnx__GraphProto* graph, const int *input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name)
|
||||
{
|
||||
//assert(graph != NULL && input != NULL && layer_name != "" );
|
||||
|
||||
int64_t len = shapeInput[0] * shapeInput[1] * shapeInput[2];
|
||||
int* output = (int*) malloc(sizeof(int)*len);
if(output == NULL)
{
// No memory
return NULL;
}
memset(output, 0, sizeof(int)*len);
|
||||
|
||||
relu(input, len, output);
|
||||
|
||||
memcpy(shapeOutput, shapeInput, sizeof(int64_t)*3); // ReLU keeps the input shape
|
||||
|
||||
return output;
|
||||
}
|
@@ -1,57 +0,0 @@
|
||||
#include "onnx.h"
|
||||
//#include<math.h>
|
||||
|
||||
|
||||
int abs_core(int x)
|
||||
{
|
||||
return x > 0?x:-x;
|
||||
}
|
||||
|
||||
int exp_core(int x)
|
||||
{
|
||||
x = 1 + (x << 8);
|
||||
|
||||
x *= x; x *= x; x *= x; x *= x;
|
||||
|
||||
x *= x; x *= x; x *= x; x *= x;
|
||||
|
||||
return x;
|
||||
|
||||
}
|
||||
|
||||
void softmax(const int *input, const uint32_t dim_vec, int *output)
|
||||
{
|
||||
long long sum = 0;
|
||||
|
||||
for(int i = 0; i < dim_vec; i++)
|
||||
{
|
||||
output[i] = input[i] >> 16;
|
||||
}
|
||||
|
||||
/*
|
||||
for(int i = 0; i < dim_vec; i++)
|
||||
{
|
||||
output[i] = abs_core(input[i] >> 16);
|
||||
sum = sum + (output[i]);
|
||||
}
|
||||
printf("sum = %ld\r\n" , sum);
|
||||
for(int i = 0; i < dim_vec; i++)
|
||||
{
|
||||
//output[i] = output[i] / (sum);
|
||||
output[i] = sum / output[i];
|
||||
//output[i] = output[i];
|
||||
}*/
|
||||
}
|
||||
|
||||
int* softmax_layer(Onnx__GraphProto* graph, const int *input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name)
|
||||
{
|
||||
//assert(graph != NULL && input != NULL && layer_name != "" && shapeInput[1] > 0);
|
||||
|
||||
int* output = (int*) malloc(sizeof(int)*shapeInput[1]);
if(output == NULL)
{
// No memory
return NULL;
}
memset(output, 0, sizeof(int)*shapeInput[1]);
|
||||
softmax(input, shapeInput[1], output);
|
||||
|
||||
memcpy(shapeOutput, shapeInput, sizeof(int64_t)*3); // softmax keeps the input shape
|
||||
|
||||
return output;
|
||||
}
|
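The fixed-point softmax above currently only rescales the logits (the normalisation loop is commented out). For reference only, a conventional floating-point softmax over the same interface would look roughly like the sketch below; it is not part of this pack, just an indication of what the quantised version approximates:

#include <math.h>
#include <stdint.h>

// Reference softmax: output[i] = exp(input[i]) / sum_j exp(input[j]).
// Subtracting the maximum first keeps expf() from overflowing.
void softmax_ref(const float *input, const uint32_t dim_vec, float *output)
{
    float max = input[0];
    for (uint32_t i = 1; i < dim_vec; i++)
    {
        if (input[i] > max)
            max = input[i];
    }

    float sum = 0.0f;
    for (uint32_t i = 0; i < dim_vec; i++)
    {
        output[i] = expf(input[i] - max);
        sum += output[i];
    }
    for (uint32_t i = 0; i < dim_vec; i++)
    {
        output[i] /= sum;
    }
}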
@@ -1,26 +0,0 @@
|
||||
#include "tos_k.h"
|
||||
|
||||
#ifdef __CC_ARM
|
||||
/* avoid the heap and heap-using library functions supplied by arm */
|
||||
#pragma import(__use_no_heap)
|
||||
#endif
|
||||
|
||||
void *malloc(size_t n)
|
||||
{
|
||||
return tos_mmheap_alloc(n);
|
||||
}
|
||||
|
||||
void *realloc(void *rmem, size_t newsize)
|
||||
{
|
||||
return tos_mmheap_realloc(rmem, newsize);
|
||||
}
|
||||
|
||||
void *calloc(size_t nelem, size_t elsize)
|
||||
{
|
||||
return tos_mmheap_calloc(nelem, elsize);
|
||||
}
|
||||
|
||||
void free(void *rmem)
|
||||
{
|
||||
tos_mmheap_free(rmem);
|
||||
}
|
@@ -1,97 +0,0 @@
|
||||
#include "onnx.h"
|
||||
|
||||
int* transpose(const int* A, int64_t* shape, int64_t dim, int64_t* perm)
|
||||
{
|
||||
// Get array size
|
||||
int elem = 1;
|
||||
for(int i = 0; i < dim; i++)
|
||||
{
|
||||
elem = elem * shape[i];
|
||||
}
|
||||
|
||||
// Malloc memory for B
|
||||
int* B = malloc(sizeof(int) * elem);
|
||||
if(B == NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
// Malloc memory for shapeB
|
||||
int* shapeB = malloc(sizeof(int) * dim);
|
||||
if( shapeB == NULL)
|
||||
{
|
||||
free(B);
return NULL;
|
||||
}
|
||||
for(int i = 0; i < dim; i++)
|
||||
{
|
||||
shapeB[i] = shape[perm[i]];
|
||||
}
|
||||
// Transpose
|
||||
for(int src = 0; src < elem; src++)
|
||||
{
|
||||
// Get transposed B array
|
||||
// A[1][0][3] -> B[3][1][0]
|
||||
int temp = src;
|
||||
int* indexA = malloc(sizeof(int) * dim);
|
||||
if(indexA == NULL)
|
||||
{
|
||||
free(B);
free(shapeB);
return NULL;
|
||||
}
|
||||
int* indexB = malloc(sizeof(int) * dim);
|
||||
if(indexB == NULL)
|
||||
{
|
||||
free(indexA);
free(B);
free(shapeB);
return NULL;
|
||||
}
|
||||
for(int i = dim-1; i >= 0; i--)
|
||||
{
|
||||
indexA[i] = temp % shape[i];
|
||||
temp = temp / shape[i];
|
||||
}
|
||||
for(int i = 0; i < dim; i++)
|
||||
{
|
||||
indexB[i] = indexA[perm[i]];
|
||||
}
|
||||
|
||||
// Get transposed B index
|
||||
// #15 A[1][0][3] -> B[3][1][0] #21
|
||||
int dst = 0;
|
||||
temp = 1;
|
||||
for(int i = dim - 1; i >= 0; i--)
|
||||
{
|
||||
dst = dst + indexB[i] * temp;
|
||||
temp = temp * shapeB[i];
|
||||
}
|
||||
|
||||
B[dst] = A[src];
|
||||
|
||||
free(indexA);
|
||||
free(indexB);
|
||||
}
|
||||
|
||||
free(shapeB);
|
||||
return B;
|
||||
}
|
||||
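A small, hypothetical usage check of transpose: permuting a 2 x 3 row-major array with perm = {1, 0} swaps the two axes, i.e. an ordinary matrix transpose.

#include <stdlib.h>
#include "onnx.h"

// Illustration only: transpose a 2x3 array into a 3x2 array.
void transpose_demo(void)
{
    int A[] = {1, 2, 3,
               4, 5, 6};
    int64_t shape[] = {2, 3};
    int64_t perm[]  = {1, 0};

    int* B = transpose(A, shape, 2, perm);
    // B is {1, 4, 2, 5, 3, 6}, i.e. the 3x2 transpose; the caller frees it.
    free(B);
}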
|
||||
int* transpose_layer(Onnx__GraphProto* graph, const int *input, int64_t* shapeInput, int64_t* shapeOutput, const char* layer_name)
|
||||
{
|
||||
//assert(graph != NULL && input != NULL && layer_name != "" );
|
||||
|
||||
Onnx__NodeProto* node = onnx_graph_get_node_by_name(graph, layer_name);
|
||||
if(node == NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int64_t perm_t[3];
|
||||
int64_t* perm = node->attribute[0]->ints;
|
||||
perm_t[0] = perm[1] - 1;
|
||||
perm_t[1] = perm[2] - 1;
|
||||
perm_t[2] = perm[3] - 1;
|
||||
|
||||
int* output = transpose(input, shapeInput, 3, perm_t);
|
||||
|
||||
shapeOutput[0] = shapeInput[perm_t[0]];
|
||||
shapeOutput[1] = shapeInput[perm_t[1]];
|
||||
shapeOutput[2] = shapeInput[perm_t[2]];
|
||||
|
||||
return output;
|
||||
}
|
@@ -1,26 +0,0 @@
|
||||
#include "tos_k.h"
|
||||
|
||||
#ifdef __CC_ARM
|
||||
/* avoid the heap and heap-using library functions supplied by arm */
|
||||
#pragma import(__use_no_heap)
|
||||
#endif
|
||||
|
||||
void *malloc(size_t n)
|
||||
{
|
||||
return tos_mmheap_alloc(n);
|
||||
}
|
||||
|
||||
void *realloc(void *rmem, size_t newsize)
|
||||
{
|
||||
return tos_mmheap_realloc(rmem, newsize);
|
||||
}
|
||||
|
||||
void *calloc(size_t nelem, size_t elsize)
|
||||
{
|
||||
return tos_mmheap_calloc(nelem, elsize);
|
||||
}
|
||||
|
||||
void free(void *rmem)
|
||||
{
|
||||
tos_mmheap_free(rmem);
|
||||
}
|
File diff suppressed because it is too large
@@ -1,981 +0,0 @@
|
||||
/* Generated by the protocol buffer compiler. DO NOT EDIT! */
|
||||
/* Generated from: src/onnx.proto */
|
||||
|
||||
#ifndef PROTOBUF_C_src_2fonnx_2eproto__INCLUDED
|
||||
#define PROTOBUF_C_src_2fonnx_2eproto__INCLUDED
|
||||
|
||||
#include <protobuf-c.h>
|
||||
|
||||
PROTOBUF_C__BEGIN_DECLS
|
||||
|
||||
#if PROTOBUF_C_VERSION_NUMBER < 1000000
|
||||
# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers.
|
||||
#elif 1002001 < PROTOBUF_C_MIN_COMPILER_VERSION
|
||||
# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c.
|
||||
#endif
|
||||
|
||||
|
||||
typedef struct _Onnx__AttributeProto Onnx__AttributeProto;
|
||||
typedef struct _Onnx__ValueInfoProto Onnx__ValueInfoProto;
|
||||
typedef struct _Onnx__NodeProto Onnx__NodeProto;
|
||||
typedef struct _Onnx__ModelProto Onnx__ModelProto;
|
||||
typedef struct _Onnx__StringStringEntryProto Onnx__StringStringEntryProto;
|
||||
typedef struct _Onnx__GraphProto Onnx__GraphProto;
|
||||
typedef struct _Onnx__TensorProto Onnx__TensorProto;
|
||||
typedef struct _Onnx__TensorProto__Segment Onnx__TensorProto__Segment;
|
||||
typedef struct _Onnx__TensorShapeProto Onnx__TensorShapeProto;
|
||||
typedef struct _Onnx__TensorShapeProto__Dimension Onnx__TensorShapeProto__Dimension;
|
||||
typedef struct _Onnx__TypeProto Onnx__TypeProto;
|
||||
typedef struct _Onnx__TypeProto__Tensor Onnx__TypeProto__Tensor;
|
||||
typedef struct _Onnx__OperatorSetIdProto Onnx__OperatorSetIdProto;
|
||||
|
||||
|
||||
/* --- enums --- */
|
||||
|
||||
/*
|
||||
* Note: this enum is structurally identical to the OpSchema::AttrType
|
||||
* enum defined in schema.h. If you rev one, you likely need to rev the other.
|
||||
*/
|
||||
typedef enum _Onnx__AttributeProto__AttributeType {
|
||||
ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE__UNDEFINED = 0,
|
||||
ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE__FLOAT = 1,
|
||||
ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE__INT = 2,
|
||||
ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE__STRING = 3,
|
||||
ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE__TENSOR = 4,
|
||||
ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE__GRAPH = 5,
|
||||
ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE__FLOATS = 6,
|
||||
ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE__INTS = 7,
|
||||
ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE__STRINGS = 8,
|
||||
ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE__TENSORS = 9,
|
||||
ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE__GRAPHS = 10
|
||||
PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(ONNX__ATTRIBUTE_PROTO__ATTRIBUTE_TYPE)
|
||||
} Onnx__AttributeProto__AttributeType;
|
||||
typedef enum _Onnx__TensorProto__DataType {
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__UNDEFINED = 0,
|
||||
/*
|
||||
* Basic types.
|
||||
*/
|
||||
/*
|
||||
* int
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__FLOAT = 1,
|
||||
/*
|
||||
* uint8_t
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__UINT8 = 2,
|
||||
/*
|
||||
* int8_t
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__INT8 = 3,
|
||||
/*
|
||||
* uint16_t
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__UINT16 = 4,
|
||||
/*
|
||||
* int16_t
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__INT16 = 5,
|
||||
/*
|
||||
* int32_t
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__INT32 = 6,
|
||||
/*
|
||||
* int64_t
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__INT64 = 7,
|
||||
/*
|
||||
* string
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__STRING = 8,
|
||||
/*
|
||||
* bool
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__BOOL = 9,
|
||||
/*
|
||||
* Advanced types
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__FLOAT16 = 10,
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__DOUBLE = 11,
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__UINT32 = 12,
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__UINT64 = 13,
|
||||
/*
|
||||
* complex with float32 real and imaginary components
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__COMPLEX64 = 14,
|
||||
/*
|
||||
* complex with float64 real and imaginary components
|
||||
*/
|
||||
ONNX__TENSOR_PROTO__DATA_TYPE__COMPLEX128 = 15
|
||||
PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(ONNX__TENSOR_PROTO__DATA_TYPE)
|
||||
} Onnx__TensorProto__DataType;
|
||||
/*
|
||||
* Versioning
|
||||
* ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md
|
||||
* To be compatible with both proto2 and proto3, we will use a version number
|
||||
* that is not defined by the default value but an explicit enum number.
|
||||
*/
|
||||
typedef enum _Onnx__Version {
|
||||
/*
|
||||
* proto3 requires the first enum value to be zero.
|
||||
* We add this just to appease the compiler.
|
||||
*/
|
||||
ONNX__VERSION___START_VERSION = 0,
|
||||
/*
|
||||
* The version field is always serialized and we will use it to store the
|
||||
* version that the graph is generated from. This helps us set up version
|
||||
* control.
|
||||
* For the IR, we are using simple numbers starting with with 0x00000001,
|
||||
* which was the version we published on Oct 10, 2017.
|
||||
*/
|
||||
ONNX__VERSION__IR_VERSION_2017_10_10 = 1,
|
||||
/*
|
||||
* IR_VERSION 2 published on Oct 30, 2017
|
||||
* - Added type discriminator to AttributeProto to support proto3 users
|
||||
*/
|
||||
ONNX__VERSION__IR_VERSION_2017_10_30 = 2,
|
||||
/*
|
||||
* IR VERSION 3 published on Nov 3, 2017
|
||||
* - For operator versioning:
|
||||
* - Added new message OperatorSetIdProto
|
||||
* - Added opset_import in ModelProto
|
||||
* - For vendor extensions, added domain in NodeProto
|
||||
*/
|
||||
ONNX__VERSION__IR_VERSION = 3
|
||||
PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(ONNX__VERSION)
|
||||
} Onnx__Version;
|
||||
|
||||
/* --- messages --- */
|
||||
|
||||
/*
|
||||
* Attributes
|
||||
* A named attribute containing either singular int, integer, string, graph,
|
||||
* and tensor values, or repeated int, integer, string, graph, and tensor values.
|
||||
* An AttributeProto MUST contain the name field, and *only one* of the
|
||||
* following content fields, effectively enforcing a C/C++ union equivalent.
|
||||
*/
|
||||
struct _Onnx__AttributeProto
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
/*
|
||||
* The name field MUST be present for this version of the IR.
|
||||
*/
|
||||
/*
|
||||
* namespace Attribute
|
||||
*/
|
||||
char *name;
|
||||
/*
|
||||
* if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function.
|
||||
* In this case, this AttributeProto does not contain data, and it's a reference of attribute
|
||||
* in parent scope.
|
||||
* NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph.
|
||||
*/
|
||||
char *ref_attr_name;
|
||||
/*
|
||||
* A human-readable documentation for this attribute. Markdown is allowed.
|
||||
*/
|
||||
char *doc_string;
|
||||
/*
|
||||
* The type field MUST be present for this version of the IR.
|
||||
* For 0.0.1 versions of the IR, this field was not defined, and
|
||||
* implementations needed to use has_field hueristics to determine
|
||||
* which value field was in use. For IR_VERSION 0.0.2 or later, this
|
||||
* field MUST be set and match the f|i|s|t|... field in use. This
|
||||
* change was made to accomodate proto3 implementations.
|
||||
*/
|
||||
/*
|
||||
* discriminator that indicates which field below is in use
|
||||
*/
|
||||
protobuf_c_boolean has_type;
|
||||
Onnx__AttributeProto__AttributeType type;
|
||||
/*
|
||||
* Exactly ONE of the following fields must be present for this version of the IR
|
||||
*/
|
||||
/*
|
||||
* int
|
||||
*/
|
||||
protobuf_c_boolean has_f;
|
||||
int f;
|
||||
/*
|
||||
* int
|
||||
*/
|
||||
protobuf_c_boolean has_i;
|
||||
int64_t i;
|
||||
/*
|
||||
* UTF-8 string
|
||||
*/
|
||||
protobuf_c_boolean has_s;
|
||||
ProtobufCBinaryData s;
|
||||
/*
|
||||
* tensor value
|
||||
*/
|
||||
Onnx__TensorProto *t;
|
||||
/*
|
||||
* graph
|
||||
*/
|
||||
Onnx__GraphProto *g;
|
||||
/*
|
||||
* list of floats
|
||||
*/
|
||||
size_t n_floats;
|
||||
int *floats;
|
||||
/*
|
||||
* list of ints
|
||||
*/
|
||||
size_t n_ints;
|
||||
int64_t *ints;
|
||||
/*
|
||||
* list of UTF-8 strings
|
||||
*/
|
||||
size_t n_strings;
|
||||
ProtobufCBinaryData *strings;
|
||||
/*
|
||||
* list of tensors
|
||||
*/
|
||||
size_t n_tensors;
|
||||
Onnx__TensorProto **tensors;
|
||||
/*
|
||||
* list of graph
|
||||
*/
|
||||
size_t n_graphs;
|
||||
Onnx__GraphProto **graphs;
|
||||
};
|
||||
#define ONNX__ATTRIBUTE_PROTO__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__attribute_proto__descriptor) \
|
||||
, NULL, NULL, NULL, 0,0, 0,0, 0,0, 0,{0,NULL}, NULL, NULL, 0,NULL, 0,NULL, 0,NULL, 0,NULL, 0,NULL }
|
||||
|
||||
|
||||
/*
|
||||
* Defines information on value, including the name, the type, and
|
||||
* the shape of the value.
|
||||
*/
|
||||
struct _Onnx__ValueInfoProto
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
/*
|
||||
* This field MUST be present in this version of the IR.
|
||||
*/
|
||||
/*
|
||||
* namespace Value
|
||||
*/
|
||||
char *name;
|
||||
/*
|
||||
* This field MUST be present in this version of the IR.
|
||||
*/
|
||||
Onnx__TypeProto *type;
|
||||
/*
|
||||
* A human-readable documentation for this value. Markdown is allowed.
|
||||
*/
|
||||
char *doc_string;
|
||||
};
|
||||
#define ONNX__VALUE_INFO_PROTO__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__value_info_proto__descriptor) \
|
||||
, NULL, NULL, NULL }
|
||||
|
||||
|
||||
/*
|
||||
* Nodes
|
||||
* Computation graphs are made up of a DAG of nodes, which represent what is
|
||||
* commonly called a "layer" or "pipeline stage" in machine learning frameworks.
|
||||
* For example, it can be a node of type "Conv" that takes in an image, a filter
|
||||
* tensor and a bias tensor, and produces the convolved output.
|
||||
*/
|
||||
struct _Onnx__NodeProto
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
/*
|
||||
* namespace Value
|
||||
*/
|
||||
size_t n_input;
|
||||
char **input;
|
||||
/*
|
||||
* namespace Value
|
||||
*/
|
||||
size_t n_output;
|
||||
char **output;
|
||||
/*
|
||||
* An optional identifier for this node in a graph.
|
||||
* This field MAY be absent in ths version of the IR.
|
||||
*/
|
||||
/*
|
||||
* namespace Node
|
||||
*/
|
||||
char *name;
|
||||
/*
|
||||
* The symbolic identifier of the Operator to execute.
|
||||
*/
|
||||
/*
|
||||
* namespace Operator
|
||||
*/
|
||||
char *op_type;
|
||||
/*
|
||||
* The domain of the OperatorSet that specifies the operator named by op_type.
|
||||
*/
|
||||
/*
|
||||
* namespace Domain
|
||||
*/
|
||||
char *domain;
|
||||
/*
|
||||
* Additional named attributes.
|
||||
*/
|
||||
size_t n_attribute;
|
||||
Onnx__AttributeProto **attribute;
|
||||
/*
|
||||
* A human-readable documentation for this node. Markdown is allowed.
|
||||
*/
|
||||
char *doc_string;
|
||||
};
|
||||
#define ONNX__NODE_PROTO__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__node_proto__descriptor) \
|
||||
, 0,NULL, 0,NULL, NULL, NULL, NULL, 0,NULL, NULL }
|
||||
|
||||
|
||||
/*
|
||||
* Models
|
||||
* ModelProto is a top-level file/container format for bundling a ML model and
|
||||
* associating its computation graph with metadata.
|
||||
* The semantics of the model are described by the associated GraphProto.
|
||||
*/
|
||||
struct _Onnx__ModelProto
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
/*
|
||||
* The version of the IR this model targets. See Version enum above.
|
||||
* This field MUST be present.
|
||||
*/
|
||||
protobuf_c_boolean has_ir_version;
|
||||
int64_t ir_version;
|
||||
/*
|
||||
* The OperatorSets this model relies on.
|
||||
* All ModelProtos MUST have at least one entry that
|
||||
* specifies which version of the ONNX OperatorSet is
|
||||
* being imported.
|
||||
* All nodes in the ModelProto's graph will bind against the operator
|
||||
* with the same-domain/same-op_type operator with the HIGHEST version
|
||||
* in the referenced operator sets.
|
||||
*/
|
||||
size_t n_opset_import;
|
||||
Onnx__OperatorSetIdProto **opset_import;
|
||||
/*
|
||||
* The name of the framework or tool used to generate this model.
|
||||
* This field SHOULD be present to indicate which implementation/tool/framework
|
||||
* emitted the model.
|
||||
*/
|
||||
char *producer_name;
|
||||
/*
|
||||
* The version of the framework or tool used to generate this model.
|
||||
* This field SHOULD be present to indicate which implementation/tool/framework
|
||||
* emitted the model.
|
||||
*/
|
||||
char *producer_version;
|
||||
/*
|
||||
* Domain name of the model.
|
||||
* We use reverse domain names as name space indicators. For example:
|
||||
* `com.facebook.fair` or `com.microsoft.cognitiveservices`
|
||||
* Together with `model_version` and GraphProto.name, this forms the unique identity of
|
||||
* the graph.
|
||||
*/
|
||||
char *domain;
|
||||
/*
|
||||
* The version of the graph encoded. See Version enum below.
|
||||
*/
|
||||
protobuf_c_boolean has_model_version;
|
||||
int64_t model_version;
|
||||
/*
|
||||
* A human-readable documentation for this model. Markdown is allowed.
|
||||
*/
|
||||
char *doc_string;
|
||||
/*
|
||||
* The parameterized graph that is evaluated to execute the model.
|
||||
*/
|
||||
Onnx__GraphProto *graph;
|
||||
/*
|
||||
* Named metadata values; keys should be distinct.
|
||||
*/
|
||||
size_t n_metadata_props;
|
||||
Onnx__StringStringEntryProto **metadata_props;
|
||||
};
|
||||
#define ONNX__MODEL_PROTO__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__model_proto__descriptor) \
|
||||
, 0,0, 0,NULL, NULL, NULL, NULL, 0,0, NULL, NULL, 0,NULL }
|
||||
|
||||
|
||||
/*
|
||||
* StringStringEntryProto follows the pattern for cross-proto-version maps.
|
||||
* See https://developers.google.com/protocol-buffers/docs/proto3#maps
|
||||
*/
|
||||
struct _Onnx__StringStringEntryProto
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
char *key;
|
||||
char *value;
|
||||
};
|
||||
#define ONNX__STRING_STRING_ENTRY_PROTO__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__string_string_entry_proto__descriptor) \
|
||||
, NULL, NULL }
|
||||
|
||||
|
||||
/*
|
||||
* Graphs
|
||||
* A graph defines the computational logic of a model and is comprised of a parameterized
|
||||
* list of nodes that form a directed acyclic graph based on their inputs and outputs.
|
||||
* This is the equivalent of the "network" or "graph" in many deep learning
|
||||
* frameworks.
|
||||
*/
|
||||
struct _Onnx__GraphProto
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
/*
|
||||
* The nodes in the graph, sorted topologically.
|
||||
*/
|
||||
size_t n_node;
|
||||
Onnx__NodeProto **node;
|
||||
/*
|
||||
* The name of the graph.
|
||||
*/
|
||||
/*
|
||||
* namespace Graph
|
||||
*/
|
||||
char *name;
|
||||
/*
|
||||
* A list of named tensor values, used to specify constant inputs of the graph.
|
||||
* Each TensorProto entry must have a distinct name (within the list) that
|
||||
* also appears in the input list.
|
||||
*/
|
||||
size_t n_initializer;
|
||||
Onnx__TensorProto **initializer;
|
||||
/*
|
||||
* A human-readable documentation for this graph. Markdown is allowed.
|
||||
*/
|
||||
char *doc_string;
|
||||
/*
|
||||
* The inputs and outputs of the graph.
|
||||
*/
|
||||
size_t n_input;
|
||||
Onnx__ValueInfoProto **input;
|
||||
size_t n_output;
|
||||
Onnx__ValueInfoProto **output;
|
||||
/*
|
||||
* Information for the values in the graph. The ValueInfoProto.name's
|
||||
* must be distinct. It is optional for a value to appear in value_info list.
|
||||
*/
|
||||
size_t n_value_info;
|
||||
Onnx__ValueInfoProto **value_info;
|
||||
};
|
||||
#define ONNX__GRAPH_PROTO__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__graph_proto__descriptor) \
|
||||
, 0,NULL, NULL, 0,NULL, NULL, 0,NULL, 0,NULL, 0,NULL }
|
||||
|
||||
|
||||
/*
|
||||
* For very large tensors, we may want to store them in chunks, in which
|
||||
* case the following fields will specify the segment that is stored in
|
||||
* the current TensorProto.
|
||||
*/
|
||||
struct _Onnx__TensorProto__Segment
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
protobuf_c_boolean has_begin;
|
||||
int64_t begin;
|
||||
protobuf_c_boolean has_end;
|
||||
int64_t end;
|
||||
};
|
||||
#define ONNX__TENSOR_PROTO__SEGMENT__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__tensor_proto__segment__descriptor) \
|
||||
, 0,0, 0,0 }
|
||||
|
||||
|
||||
/*
|
||||
* Tensors
|
||||
* A serialized tensor value.
|
||||
*/
|
||||
struct _Onnx__TensorProto
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
/*
|
||||
* The shape of the tensor.
|
||||
*/
|
||||
size_t n_dims;
|
||||
int64_t *dims;
|
||||
/*
|
||||
* The data type of the tensor.
|
||||
*/
|
||||
protobuf_c_boolean has_data_type;
|
||||
Onnx__TensorProto__DataType data_type;
|
||||
Onnx__TensorProto__Segment *segment;
|
||||
/*
|
||||
* For int and complex64 values
|
||||
* Complex64 tensors are encoded as a single array of floats,
|
||||
* with the real components appearing in odd numbered positions,
|
||||
* and the corresponding imaginary component apparing in the
|
||||
* subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i]
|
||||
* is encoded as [1.0, 2.0 ,3.0 ,4.0]
|
||||
* When this field is present, the data_type field MUST be FLOAT or COMPLEX64.
|
||||
*/
|
||||
size_t n_float_data;
|
||||
int *float_data;
|
||||
/*
|
||||
* For int32, uint8, int8, uint16, int16, bool, and float16 values
|
||||
* float16 values must be bit-wise converted to an uint16_t prior
|
||||
* to writing to the buffer.
|
||||
* When this field is present, the data_type field MUST be
|
||||
* INT32, INT16, INT8, UINT16, INT8, BOOL, or FLOAT16
|
||||
*/
|
||||
size_t n_int32_data;
|
||||
int32_t *int32_data;
|
||||
/*
|
||||
* For strings.
|
||||
* Each element of string_data is a UTF-8 encoded Unicode
|
||||
* string. No trailing null, no leading BOM. The protobuf "string"
|
||||
* scalar type is not used to match ML community conventions.
|
||||
* When this field is present, the data_type field MUST be STRING
|
||||
*/
|
||||
size_t n_string_data;
|
||||
ProtobufCBinaryData *string_data;
|
||||
/*
|
||||
* For int64.
|
||||
* When this field is present, the data_type field MUST be INT64
|
||||
*/
|
||||
size_t n_int64_data;
|
||||
int64_t *int64_data;
|
||||
/*
|
||||
* Optionally, a name for the tensor.
|
||||
*/
|
||||
/*
|
||||
* namespace Value
|
||||
*/
|
||||
char *name;
|
||||
/*
|
||||
* A human-readable documentation for this tensor. Markdown is allowed.
|
||||
*/
|
||||
char *doc_string;
|
||||
/*
|
||||
* Serializations can either use one of the fields above, or use this
|
||||
* raw bytes field. The only exception is the string case, where one is
|
||||
* required to store the content in the repeated bytes string_data field.
|
||||
* When this raw_data field is used to store tensor value, elements MUST
|
||||
* be stored in as fixed-width, little-endian order.
|
||||
* Floating-point data types MUST be stored in IEEE 754 format.
|
||||
* Complex64 elements must be written as two consecutive FLOAT values, real component first.
|
||||
* Complex128 elements must be written as two consecutive DOUBLE values, real component first.
|
||||
* Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false).
|
||||
* Note: the advantage of specific field rather than the raw_data field is
|
||||
* that in some cases (e.g. int data), protobuf does a better packing via
|
||||
* variable length storage, and may lead to smaller binary footprint.
|
||||
* When this field is present, the data_type field MUST NOT be STRING or UNDEFINED
|
||||
*/
|
||||
protobuf_c_boolean has_raw_data;
|
||||
ProtobufCBinaryData raw_data;
|
||||
/*
|
||||
* For double
|
||||
* Complex128 tensors are encoded as a single array of doubles,
|
||||
* with the real components appearing in odd numbered positions,
|
||||
* and the corresponding imaginary component apparing in the
|
||||
* subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i]
|
||||
* is encoded as [1.0, 2.0 ,3.0 ,4.0]
|
||||
* When this field is present, the data_type field MUST be DOUBLE or COMPLEX128
|
||||
*/
|
||||
size_t n_double_data;
|
||||
double *double_data;
|
||||
/*
|
||||
* For uint64 and uint32 values
|
||||
* When this field is present, the data_type field MUST be
|
||||
* UINT32 or UINT64
|
||||
*/
|
||||
size_t n_uint64_data;
|
||||
uint64_t *uint64_data;
|
||||
};
|
||||
#define ONNX__TENSOR_PROTO__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__tensor_proto__descriptor) \
|
||||
, 0,NULL, 0,0, NULL, 0,NULL, 0,NULL, 0,NULL, 0,NULL, NULL, NULL, 0,{0,NULL}, 0,NULL, 0,NULL }
|
||||
|
||||
|
||||
typedef enum {
|
||||
ONNX__TENSOR_SHAPE_PROTO__DIMENSION__VALUE__NOT_SET = 0,
|
||||
ONNX__TENSOR_SHAPE_PROTO__DIMENSION__VALUE_DIM_VALUE = 1,
|
||||
ONNX__TENSOR_SHAPE_PROTO__DIMENSION__VALUE_DIM_PARAM = 2,
|
||||
} Onnx__TensorShapeProto__Dimension__ValueCase;
|
||||
|
||||
struct _Onnx__TensorShapeProto__Dimension
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
/*
|
||||
* Standard denotation can optionally be used to denote tensor
|
||||
* dimensions with standard semantic descriptions to ensure
|
||||
* that operations are applied to the correct axis of a tensor.
|
||||
* Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition
|
||||
* for pre-defined dimension denotations.
|
||||
*/
|
||||
char *denotation;
|
||||
Onnx__TensorShapeProto__Dimension__ValueCase value_case;
|
||||
//#pragma anon_unions
|
||||
union {
|
||||
int64_t dim_value;
|
||||
/*
|
||||
* namespace Shape
|
||||
*/
|
||||
char *dim_param;
|
||||
};
|
||||
};
|
||||
#define ONNX__TENSOR_SHAPE_PROTO__DIMENSION__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__tensor_shape_proto__dimension__descriptor) \
|
||||
, NULL, ONNX__TENSOR_SHAPE_PROTO__DIMENSION__VALUE__NOT_SET, {0} }
|
||||
|
||||
|
||||
/*
|
||||
* Defines a tensor shape. A dimension can be either an integer value
|
||||
* or a symbolic variable. A symbolic variable represents an unknown
|
||||
* dimension.
|
||||
*/
|
||||
struct _Onnx__TensorShapeProto
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
size_t n_dim;
|
||||
Onnx__TensorShapeProto__Dimension **dim;
|
||||
};
|
||||
#define ONNX__TENSOR_SHAPE_PROTO__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__tensor_shape_proto__descriptor) \
|
||||
, 0,NULL }
|
||||
|
||||
|
||||
struct _Onnx__TypeProto__Tensor
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
/*
|
||||
* This field MUST NOT have the value of UNDEFINED
|
||||
* This field MUST be present for this version of the IR.
|
||||
*/
|
||||
protobuf_c_boolean has_elem_type;
|
||||
Onnx__TensorProto__DataType elem_type;
|
||||
Onnx__TensorShapeProto *shape;
|
||||
};
|
||||
#define ONNX__TYPE_PROTO__TENSOR__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__type_proto__tensor__descriptor) \
|
||||
, 0,0, NULL }
|
||||
|
||||
|
||||
typedef enum {
|
||||
ONNX__TYPE_PROTO__VALUE__NOT_SET = 0,
|
||||
ONNX__TYPE_PROTO__VALUE_TENSOR_TYPE = 1,
|
||||
} Onnx__TypeProto__ValueCase;
|
||||
|
||||
/*
|
||||
* Types
|
||||
* The standard ONNX data types.
|
||||
*/
|
||||
struct _Onnx__TypeProto
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
/*
|
||||
* An optional denotation can be used to denote the whole
|
||||
* type with a standard semantic description as to what is
|
||||
* stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition
|
||||
* for pre-defined type denotations.
|
||||
*/
|
||||
char *denotation;
|
||||
Onnx__TypeProto__ValueCase value_case;
|
||||
//#pragma anon_unions
|
||||
union {
|
||||
/*
|
||||
* The type of a tensor.
|
||||
*/
|
||||
Onnx__TypeProto__Tensor *tensor_type;
|
||||
};
|
||||
};
|
||||
#define ONNX__TYPE_PROTO__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__type_proto__descriptor) \
|
||||
, NULL, ONNX__TYPE_PROTO__VALUE__NOT_SET, {0} }
|
||||
|
||||
|
||||
/*
|
||||
* Operator Sets
|
||||
* OperatorSets are uniquely identified by a (domain, opset_version) pair.
|
||||
*/
|
||||
struct _Onnx__OperatorSetIdProto
|
||||
{
|
||||
ProtobufCMessage base;
|
||||
/*
|
||||
* The domain of the operator set being identified.
|
||||
* The empty string ("") or absence of this field implies the operator
|
||||
* set that is defined as part of the ONNX specification.
|
||||
* This field MUST be present in this version of the IR when referring to any other operator set.
|
||||
*/
|
||||
char *domain;
|
||||
/*
|
||||
* The version of the operator set being identified.
|
||||
* This field MUST be present in this version of the IR.
|
||||
*/
|
||||
protobuf_c_boolean has_version;
|
||||
int64_t version;
|
||||
};
|
||||
#define ONNX__OPERATOR_SET_ID_PROTO__INIT \
|
||||
{ PROTOBUF_C_MESSAGE_INIT (&onnx__operator_set_id_proto__descriptor) \
|
||||
, NULL, 0,0 }
|
||||
|
||||
|
||||
/* Onnx__AttributeProto methods */
|
||||
void onnx__attribute_proto__init
|
||||
(Onnx__AttributeProto *message);
|
||||
size_t onnx__attribute_proto__get_packed_size
|
||||
(const Onnx__AttributeProto *message);
|
||||
size_t onnx__attribute_proto__pack
|
||||
(const Onnx__AttributeProto *message,
|
||||
uint8_t *out);
|
||||
size_t onnx__attribute_proto__pack_to_buffer
|
||||
(const Onnx__AttributeProto *message,
|
||||
ProtobufCBuffer *buffer);
|
||||
Onnx__AttributeProto *
|
||||
onnx__attribute_proto__unpack
|
||||
(ProtobufCAllocator *allocator,
|
||||
size_t len,
|
||||
const uint8_t *data);
|
||||
void onnx__attribute_proto__free_unpacked
|
||||
(Onnx__AttributeProto *message,
|
||||
ProtobufCAllocator *allocator);
|
||||
/* Onnx__ValueInfoProto methods */
|
||||
void onnx__value_info_proto__init
|
||||
(Onnx__ValueInfoProto *message);
|
||||
size_t onnx__value_info_proto__get_packed_size
|
||||
(const Onnx__ValueInfoProto *message);
|
||||
size_t onnx__value_info_proto__pack
|
||||
(const Onnx__ValueInfoProto *message,
|
||||
uint8_t *out);
|
||||
size_t onnx__value_info_proto__pack_to_buffer
|
||||
(const Onnx__ValueInfoProto *message,
|
||||
ProtobufCBuffer *buffer);
|
||||
Onnx__ValueInfoProto *
|
||||
onnx__value_info_proto__unpack
|
||||
(ProtobufCAllocator *allocator,
|
||||
size_t len,
|
||||
const uint8_t *data);
|
||||
void onnx__value_info_proto__free_unpacked
|
||||
(Onnx__ValueInfoProto *message,
|
||||
ProtobufCAllocator *allocator);
|
||||
/* Onnx__NodeProto methods */
|
||||
void onnx__node_proto__init
|
||||
(Onnx__NodeProto *message);
|
||||
size_t onnx__node_proto__get_packed_size
|
||||
(const Onnx__NodeProto *message);
|
||||
size_t onnx__node_proto__pack
|
||||
(const Onnx__NodeProto *message,
|
||||
uint8_t *out);
|
||||
size_t onnx__node_proto__pack_to_buffer
|
||||
(const Onnx__NodeProto *message,
|
||||
ProtobufCBuffer *buffer);
|
||||
Onnx__NodeProto *
|
||||
onnx__node_proto__unpack
|
||||
(ProtobufCAllocator *allocator,
|
||||
size_t len,
|
||||
const uint8_t *data);
|
||||
void onnx__node_proto__free_unpacked
|
||||
(Onnx__NodeProto *message,
|
||||
ProtobufCAllocator *allocator);
|
||||
/* Onnx__ModelProto methods */
|
||||
void onnx__model_proto__init
|
||||
(Onnx__ModelProto *message);
|
||||
size_t onnx__model_proto__get_packed_size
|
||||
(const Onnx__ModelProto *message);
|
||||
size_t onnx__model_proto__pack
|
||||
(const Onnx__ModelProto *message,
|
||||
uint8_t *out);
|
||||
size_t onnx__model_proto__pack_to_buffer
|
||||
(const Onnx__ModelProto *message,
|
||||
ProtobufCBuffer *buffer);
|
||||
Onnx__ModelProto *
|
||||
onnx__model_proto__unpack
|
||||
(ProtobufCAllocator *allocator,
|
||||
size_t len,
|
||||
const uint8_t *data);
|
||||
void onnx__model_proto__free_unpacked
|
||||
(Onnx__ModelProto *message,
|
||||
ProtobufCAllocator *allocator);
|
||||
/* Onnx__StringStringEntryProto methods */
|
||||
void onnx__string_string_entry_proto__init
|
||||
(Onnx__StringStringEntryProto *message);
|
||||
size_t onnx__string_string_entry_proto__get_packed_size
|
||||
(const Onnx__StringStringEntryProto *message);
|
||||
size_t onnx__string_string_entry_proto__pack
|
||||
(const Onnx__StringStringEntryProto *message,
|
||||
uint8_t *out);
|
||||
size_t onnx__string_string_entry_proto__pack_to_buffer
|
||||
(const Onnx__StringStringEntryProto *message,
|
||||
ProtobufCBuffer *buffer);
|
||||
Onnx__StringStringEntryProto *
|
||||
onnx__string_string_entry_proto__unpack
|
||||
(ProtobufCAllocator *allocator,
|
||||
size_t len,
|
||||
const uint8_t *data);
|
||||
void onnx__string_string_entry_proto__free_unpacked
|
||||
(Onnx__StringStringEntryProto *message,
|
||||
ProtobufCAllocator *allocator);
|
||||
/* Onnx__GraphProto methods */
|
||||
void onnx__graph_proto__init
|
||||
(Onnx__GraphProto *message);
|
||||
size_t onnx__graph_proto__get_packed_size
|
||||
(const Onnx__GraphProto *message);
|
||||
size_t onnx__graph_proto__pack
|
||||
(const Onnx__GraphProto *message,
|
||||
uint8_t *out);
|
||||
size_t onnx__graph_proto__pack_to_buffer
|
||||
(const Onnx__GraphProto *message,
|
||||
ProtobufCBuffer *buffer);
|
||||
Onnx__GraphProto *
|
||||
onnx__graph_proto__unpack
|
||||
(ProtobufCAllocator *allocator,
|
||||
size_t len,
|
||||
const uint8_t *data);
|
||||
void onnx__graph_proto__free_unpacked
|
||||
(Onnx__GraphProto *message,
|
||||
ProtobufCAllocator *allocator);
|
||||
/* Onnx__TensorProto__Segment methods */
|
||||
void onnx__tensor_proto__segment__init
|
||||
(Onnx__TensorProto__Segment *message);
|
||||
/* Onnx__TensorProto methods */
|
||||
void onnx__tensor_proto__init
|
||||
(Onnx__TensorProto *message);
|
||||
size_t onnx__tensor_proto__get_packed_size
|
||||
(const Onnx__TensorProto *message);
|
||||
size_t onnx__tensor_proto__pack
|
||||
(const Onnx__TensorProto *message,
|
||||
uint8_t *out);
|
||||
size_t onnx__tensor_proto__pack_to_buffer
|
||||
(const Onnx__TensorProto *message,
|
||||
ProtobufCBuffer *buffer);
|
||||
Onnx__TensorProto *
|
||||
onnx__tensor_proto__unpack
|
||||
(ProtobufCAllocator *allocator,
|
||||
size_t len,
|
||||
const uint8_t *data);
|
||||
void onnx__tensor_proto__free_unpacked
|
||||
(Onnx__TensorProto *message,
|
||||
ProtobufCAllocator *allocator);
|
||||
/* Onnx__TensorShapeProto__Dimension methods */
|
||||
void onnx__tensor_shape_proto__dimension__init
|
||||
(Onnx__TensorShapeProto__Dimension *message);
|
||||
/* Onnx__TensorShapeProto methods */
|
||||
void onnx__tensor_shape_proto__init
|
||||
(Onnx__TensorShapeProto *message);
|
||||
size_t onnx__tensor_shape_proto__get_packed_size
|
||||
(const Onnx__TensorShapeProto *message);
|
||||
size_t onnx__tensor_shape_proto__pack
|
||||
(const Onnx__TensorShapeProto *message,
|
||||
uint8_t *out);
|
||||
size_t onnx__tensor_shape_proto__pack_to_buffer
|
||||
(const Onnx__TensorShapeProto *message,
|
||||
ProtobufCBuffer *buffer);
|
||||
Onnx__TensorShapeProto *
|
||||
onnx__tensor_shape_proto__unpack
|
||||
(ProtobufCAllocator *allocator,
|
||||
size_t len,
|
||||
const uint8_t *data);
|
||||
void onnx__tensor_shape_proto__free_unpacked
|
||||
(Onnx__TensorShapeProto *message,
|
||||
ProtobufCAllocator *allocator);
|
||||
/* Onnx__TypeProto__Tensor methods */
|
||||
void onnx__type_proto__tensor__init
|
||||
(Onnx__TypeProto__Tensor *message);
|
||||
/* Onnx__TypeProto methods */
|
||||
void onnx__type_proto__init
|
||||
(Onnx__TypeProto *message);
|
||||
size_t onnx__type_proto__get_packed_size
|
||||
(const Onnx__TypeProto *message);
|
||||
size_t onnx__type_proto__pack
|
||||
(const Onnx__TypeProto *message,
|
||||
uint8_t *out);
|
||||
size_t onnx__type_proto__pack_to_buffer
|
||||
(const Onnx__TypeProto *message,
|
||||
ProtobufCBuffer *buffer);
|
||||
Onnx__TypeProto *
|
||||
onnx__type_proto__unpack
|
||||
(ProtobufCAllocator *allocator,
|
||||
size_t len,
|
||||
const uint8_t *data);
|
||||
void onnx__type_proto__free_unpacked
|
||||
(Onnx__TypeProto *message,
|
||||
ProtobufCAllocator *allocator);
|
||||
/* Onnx__OperatorSetIdProto methods */
|
||||
void onnx__operator_set_id_proto__init
|
||||
(Onnx__OperatorSetIdProto *message);
|
||||
size_t onnx__operator_set_id_proto__get_packed_size
|
||||
(const Onnx__OperatorSetIdProto *message);
|
||||
size_t onnx__operator_set_id_proto__pack
|
||||
(const Onnx__OperatorSetIdProto *message,
|
||||
uint8_t *out);
|
||||
size_t onnx__operator_set_id_proto__pack_to_buffer
|
||||
(const Onnx__OperatorSetIdProto *message,
|
||||
ProtobufCBuffer *buffer);
|
||||
Onnx__OperatorSetIdProto *
|
||||
onnx__operator_set_id_proto__unpack
|
||||
(ProtobufCAllocator *allocator,
|
||||
size_t len,
|
||||
const uint8_t *data);
|
||||
void onnx__operator_set_id_proto__free_unpacked
|
||||
(Onnx__OperatorSetIdProto *message,
|
||||
ProtobufCAllocator *allocator);
|
||||
/* --- per-message closures --- */
|
||||
|
||||
typedef void (*Onnx__AttributeProto_Closure)
|
||||
(const Onnx__AttributeProto *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__ValueInfoProto_Closure)
|
||||
(const Onnx__ValueInfoProto *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__NodeProto_Closure)
|
||||
(const Onnx__NodeProto *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__ModelProto_Closure)
|
||||
(const Onnx__ModelProto *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__StringStringEntryProto_Closure)
|
||||
(const Onnx__StringStringEntryProto *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__GraphProto_Closure)
|
||||
(const Onnx__GraphProto *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__TensorProto__Segment_Closure)
|
||||
(const Onnx__TensorProto__Segment *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__TensorProto_Closure)
|
||||
(const Onnx__TensorProto *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__TensorShapeProto__Dimension_Closure)
|
||||
(const Onnx__TensorShapeProto__Dimension *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__TensorShapeProto_Closure)
|
||||
(const Onnx__TensorShapeProto *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__TypeProto__Tensor_Closure)
|
||||
(const Onnx__TypeProto__Tensor *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__TypeProto_Closure)
|
||||
(const Onnx__TypeProto *message,
|
||||
void *closure_data);
|
||||
typedef void (*Onnx__OperatorSetIdProto_Closure)
|
||||
(const Onnx__OperatorSetIdProto *message,
|
||||
void *closure_data);
|
||||
|
||||
/* --- services --- */
|
||||
|
||||
|
||||
/* --- descriptors --- */
|
||||
|
||||
extern const ProtobufCEnumDescriptor onnx__version__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__attribute_proto__descriptor;
|
||||
extern const ProtobufCEnumDescriptor onnx__attribute_proto__attribute_type__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__value_info_proto__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__node_proto__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__model_proto__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__string_string_entry_proto__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__graph_proto__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__tensor_proto__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__tensor_proto__segment__descriptor;
|
||||
extern const ProtobufCEnumDescriptor onnx__tensor_proto__data_type__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__tensor_shape_proto__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__tensor_shape_proto__dimension__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__type_proto__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__type_proto__tensor__descriptor;
|
||||
extern const ProtobufCMessageDescriptor onnx__operator_set_id_proto__descriptor;
|
||||
|
||||
PROTOBUF_C__END_DECLS
|
||||
|
||||
|
||||
#endif /* PROTOBUF_C_src_2fonnx_2eproto__INCLUDED */
|
File diff suppressed because it is too large
File diff suppressed because it is too large