From e356fc9c4a7b728b536e2881020503f9bdd703eb Mon Sep 17 00:00:00 2001
From: husein zolkepli <husein.zol05@gmail.com>
Date: Thu, 12 Nov 2020 16:17:46 +0800
Subject: [PATCH] added quantization notebooks

---
 .../quantize-constituency-model.ipynb         | 471 +++++++++++++
 .../quantize-dependency-model.ipynb           | 354 ++++++++++
 .../quantization/quantize-emotion-model.ipynb | 385 +++++++++++
 .../quantization/quantize-entity-model.ipynb  | 413 ++++++++++++
 ...uantize-paraphrase-model-transformer.ipynb | 271 ++++++++
 .../quantize-paraphrase-model.ipynb           | 358 ++++++++++
 session/quantization/quantize-pos-model.ipynb | 413 ++++++++++++
 .../quantize-relevancy-model.ipynb            | 397 +++++++++++
 .../quantize-sentiment-model.ipynb            | 409 ++++++++++++
 .../quantize-similarity-model.ipynb           | 623 ++++++++++++++++++
 .../quantization/quantize-stem-model.ipynb    | 197 ++++++
 .../quantize-subjectivity-model.ipynb         | 394 +++++++++++
 ...zation-abstractive-model-transformer.ipynb | 421 ++++++++++++
 ...tize-summarization-abstractive-model.ipynb | 386 +++++++++++
 .../quantize-toxicity-model.ipynb             | 394 +++++++++++
 .../quantize-true-case-model.ipynb            | 265 ++++++++
 16 files changed, 6151 insertions(+)
 create mode 100644 session/quantization/quantize-constituency-model.ipynb
 create mode 100644 session/quantization/quantize-dependency-model.ipynb
 create mode 100644 session/quantization/quantize-emotion-model.ipynb
 create mode 100644 session/quantization/quantize-entity-model.ipynb
 create mode 100644 session/quantization/quantize-paraphrase-model-transformer.ipynb
 create mode 100644 session/quantization/quantize-paraphrase-model.ipynb
 create mode 100644 session/quantization/quantize-pos-model.ipynb
 create mode 100644 session/quantization/quantize-relevancy-model.ipynb
 create mode 100644 session/quantization/quantize-sentiment-model.ipynb
 create mode 100644 session/quantization/quantize-similarity-model.ipynb
 create mode 100644 session/quantization/quantize-stem-model.ipynb
 create mode 100644 session/quantization/quantize-subjectivity-model.ipynb
 create mode 100644 session/quantization/quantize-summarization-abstractive-model-transformer.ipynb
 create mode 100644 session/quantization/quantize-summarization-abstractive-model.ipynb
 create mode 100644 session/quantization/quantize-toxicity-model.ipynb
 create mode 100644 session/quantization/quantize-true-case-model.ipynb

diff --git a/session/quantization/quantize-constituency-model.ipynb b/session/quantization/quantize-constituency-model.ipynb
new file mode 100644
index 00000000..fd4a3cd2
--- /dev/null
+++ b/session/quantization/quantize-constituency-model.ipynb
@@ -0,0 +1,471 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "S3_PATH_CONSTITUENCY = {\n",
+    "    'bert': {\n",
+    "        'model': 'v38/constituency/bert-base.pb',\n",
+    "        'dictionary': 'v38/constituency/vocab-bert-base.json',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'tiny-bert': {\n",
+    "        'model': 'v38/constituency/tiny-bert.pb',\n",
+    "        'dictionary': 'v38/constituency/vocab-tiny-bert.json',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "#     'albert': {\n",
+    "#         'model': 'v38/constituency/albert-base.pb',\n",
+    "#         'dictionary': 'v38/constituency/vocab-albert-base.json',\n",
+    "#         'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "#         'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "#     },\n",
+    "#     'tiny-albert': {\n",
+    "#         'model': 'v38/constituency/albert-tiny.pb',\n",
+    "#         'dictionary': 'v38/constituency/vocab-albert-tiny.json',\n",
+    "#         'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "#         'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "#     },\n",
+    "#     'xlnet': {\n",
+    "#         'model': 'v40/constituency/xlnet-base.pb',\n",
+    "#         'quantized': 'v40/constituency/xlnet-base.pb.quantized',\n",
+    "#         'dictionary': 'v40/constituency/vocab-xlnet-base.json',\n",
+    "#         'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "#         'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "#     },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert\n",
+      "tiny-bert\n"
+     ]
+    }
+   ],
+   "source": [
+    "for k in S3_PATH_CONSTITUENCY.keys():\n",
+    "    if k not in ['multinomial']:\n",
+    "        print(k)\n",
+    "        os.system(f\"wget https://f000.backblazeb2.com/file/malaya-model/{S3_PATH_CONSTITUENCY[k]['model']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['tiny-bert.pb',\n",
+       " 'albert-tiny-similarity.pb',\n",
+       " 'bert-base.pb',\n",
+       " 'xlnet-base-similarity.pb',\n",
+       " 'albert-base-similarity.pb',\n",
+       " 'bert-base-similarity.pb',\n",
+       " 'alxlnet-base-similarity.pb',\n",
+       " 'tiny-bert-similarity.pb']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[(<tf.Tensor 'import/bert/encoder/layer_8/attention/output/LayerNorm/batchnorm/mul_2:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/attention/output/LayerNorm/batchnorm/sub:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/attention/output/LayerNorm/batchnorm/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/intermediate/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/output/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/output/einsum/Reshape_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/output/add:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/output/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/output/LayerNorm/moments/SquaredDifference:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/output/LayerNorm/batchnorm/mul:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/output/LayerNorm/batchnorm/mul_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/output/LayerNorm/batchnorm/mul_2:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/output/LayerNorm/batchnorm/sub:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_8/output/LayerNorm/batchnorm/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/self/query/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/self/query/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/self/key/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/self/key/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/self/value/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/self/value/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/output/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/output/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/output/einsum/Reshape_2:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/output/add:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/output/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/output/LayerNorm/moments/SquaredDifference:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/output/LayerNorm/batchnorm/mul:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/output/LayerNorm/batchnorm/mul_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/output/LayerNorm/batchnorm/mul_2:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/output/LayerNorm/batchnorm/sub:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/attention/output/LayerNorm/batchnorm/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/intermediate/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/output/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/output/einsum/Reshape_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/output/add:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/output/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/output/LayerNorm/moments/SquaredDifference:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/output/LayerNorm/batchnorm/mul:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/output/LayerNorm/batchnorm/mul_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/output/LayerNorm/batchnorm/mul_2:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/output/LayerNorm/batchnorm/sub:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_9/output/LayerNorm/batchnorm/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/self/query/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/self/query/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/self/key/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/self/key/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/self/value/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/self/value/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/output/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/output/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/output/einsum/Reshape_2:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/output/add:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/output/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/output/LayerNorm/moments/SquaredDifference:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/output/LayerNorm/batchnorm/mul:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/output/LayerNorm/batchnorm/mul_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/output/LayerNorm/batchnorm/mul_2:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/output/LayerNorm/batchnorm/sub:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/attention/output/LayerNorm/batchnorm/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/intermediate/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/output/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/output/einsum/Reshape_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/output/add:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/output/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/output/LayerNorm/moments/SquaredDifference:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/output/LayerNorm/batchnorm/mul:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/output/LayerNorm/batchnorm/mul_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/output/LayerNorm/batchnorm/mul_2:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/output/LayerNorm/batchnorm/sub:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_10/output/LayerNorm/batchnorm/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/self/query/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/self/query/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/self/key/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/self/key/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/self/value/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/self/value/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/output/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/output/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/output/einsum/Reshape_2:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/output/add:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/output/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/output/LayerNorm/moments/SquaredDifference:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/output/LayerNorm/batchnorm/mul:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/output/LayerNorm/batchnorm/mul_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/output/LayerNorm/batchnorm/mul_2:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/output/LayerNorm/batchnorm/sub:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/attention/output/LayerNorm/batchnorm/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/intermediate/einsum/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/output/einsum/MatMul:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/output/einsum/Reshape_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/output/add:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/output/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/output/LayerNorm/moments/SquaredDifference:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/output/LayerNorm/batchnorm/mul:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/output/LayerNorm/batchnorm/mul_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/output/LayerNorm/batchnorm/mul_2:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/output/LayerNorm/batchnorm/sub:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/bert/encoder/layer_11/output/LayerNorm/batchnorm/add_1:0' shape=(?, ?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/Reshape:0' shape=(?, 312) dtype=float32>,),\n",
+       " (<tf.Tensor 'import/GatherV2:0' shape=(?, 312) dtype=float32>,)]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "with tf.gfile.GFile('tiny-bert.pb', \"rb\") as f:\n",
+    "    graph_def = tf.GraphDef()\n",
+    "    graph_def.ParseFromString(f.read())\n",
+    "# Import into a scratch graph; imported node names gain the default 'import/' prefix.\n",
+    "with tf.Graph().as_default() as graph:\n",
+    "    tf.import_graph_def(graph_def)\n",
+    "# Collect tensors whose last dimension is 312 (presumably to find the name used in `mapping`).\n",
+    "op = graph.get_operations()\n",
+    "x = []\n",
+    "for i in op:\n",
+    "    try:\n",
+    "        if i.values()[0].shape[-1] == 312:\n",
+    "        #if 'batchnorm/add_1' in i.values()[0].name:\n",
+    "            x.append(i.values())\n",
+    "    except Exception:\n",
+    "        # best-effort scan: skip ops that cannot be indexed this way (e.g. no outputs),\n",
+    "        pass  # but do not swallow KeyboardInterrupt like a bare except would\n",
+    "x[-100:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mapping = {'albert-base.pb': 'import/bert/encoder/transformer/group_0_11/layer_11/inner_group_0/LayerNorm_1/batchnorm/add_1:0',\n",
+    "          'albert-tiny.pb': 'import/bert/encoder/transformer/group_0_3/layer_3/inner_group_0/LayerNorm_1/batchnorm/add_1:0',\n",
+    "          'bert-base.pb': 'import/bert/encoder/layer_11/output/LayerNorm/batchnorm/add_1:0',\n",
+    "          'tiny-bert.pb': 'import/bert/encoder/layer_11/output/LayerNorm/batchnorm/add_1:0',\n",
+    "          'xlnet-base.pb': 'import/model/transformer/layer_11/ff/LayerNorm/batchnorm/add_1:0'}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'bert/encoder/transformer/group_0_11/layer_11/inner_group_0/LayerNorm_1/batchnorm/add_1'"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mapping[pbs[0]].replace('import/','').replace(':0','')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in [p for p in pbs if p in mapping]:  # glob('*.pb') may return models with no entry in mapping\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.GFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "    # mapping values carry the 'import/' scope and ':0' port; strip both to get the node name\n",
+    "    a = [mapping[pb].replace('import/','').replace(':0','')]\n",
+    "    # list that node as an output so strip_unused_nodes keeps it next to 'charts' and 'tags'\n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           ['input_ids', 'word_end_mask'],\n",
+    "                                           ['charts', 'tags'] + a, transforms)\n",
+    "    # write the result next to the source file as '<name>.pb.quantized'\n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_graph(frozen_graph_filename, **kwargs):\n",
+    "    \"\"\"Read a frozen GraphDef, rewrite RefSwitch/Assign* nodes to non-ref ops, import it into a new tf.Graph and return it (**kwargs is unused).\"\"\"\n",
+    "    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "        graph_def = tf.GraphDef()\n",
+    "        graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "    # to fix import T5, see https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "    for node in graph_def.node:\n",
+    "        if node.op == 'RefSwitch':\n",
+    "            node.op = 'Switch'\n",
+    "            for index in range(len(node.input)):\n",
+    "                if 'moving_' in node.input[index]:\n",
+    "                    node.input[index] = node.input[index] + '/read'\n",
+    "        elif node.op == 'AssignSub':\n",
+    "            node.op = 'Sub'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'AssignAdd':\n",
+    "            node.op = 'Add'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'Assign':\n",
+    "            node.op = 'Identity'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "            if 'validate_shape' in node.attr:\n",
+    "                del node.attr['validate_shape']\n",
+    "            if len(node.input) == 2:\n",
+    "                node.input[0] = node.input[1]\n",
+    "                del node.input[1]\n",
+    "\n",
+    "    with tf.Graph().as_default() as graph:\n",
+    "        tf.import_graph_def(graph_def)\n",
+    "    return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['albert-base.pb.quantized',\n",
+       " 'xlnet-base.pb.quantized',\n",
+       " 'albert-tiny.pb.quantized',\n",
+       " 'tiny-bert.pb.quantized',\n",
+       " 'bert-base.pb.quantized']"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-dependency-model.ipynb b/session/quantization/quantize-dependency-model.ipynb
new file mode 100644
index 00000000..cd3576dc
--- /dev/null
+++ b/session/quantization/quantize-dependency-model.ipynb
@@ -0,0 +1,354 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "S3_PATH_DEPENDENCY = {\n",
+    "    'bert': {\n",
+    "        'model': 'v34/dependency/bert-base-dependency.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'tiny-bert': {\n",
+    "        'model': 'v34/dependency/tiny-bert-dependency.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'albert': {\n",
+    "        'model': 'v34/dependency/albert-base-dependency.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'tiny-albert': {\n",
+    "        'model': 'v34/dependency/albert-tiny-dependency.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'xlnet': {\n",
+    "        'model': 'v34/dependency/xlnet-base-dependency.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "    'alxlnet': {\n",
+    "        'model': 'v34/dependency/alxlnet-base-dependency.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert\n",
+      "tiny-bert\n",
+      "albert\n",
+      "tiny-albert\n",
+      "xlnet\n",
+      "alxlnet\n"
+     ]
+    }
+   ],
+   "source": [
+    "for k in S3_PATH_DEPENDENCY.keys():\n",
+    "    if k not in ['multinomial']:\n",
+    "        print(k)\n",
+    "        os.system(f\"wget https://f000.backblazeb2.com/file/malaya-model/{S3_PATH_DEPENDENCY[k]['model']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['alxlnet-base-dependency.pb',\n",
+       " 'albert-tiny-dependency.pb',\n",
+       " 'albert-base-dependency.pb',\n",
+       " 'bert-base-dependency.pb',\n",
+       " 'xlnet-base-dependency.pb',\n",
+       " 'tiny-bert-dependency.pb']"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# with tf.gfile.GFile('xlnet-base.pb', \"rb\") as f:\n",
+    "#     graph_def = tf.GraphDef()\n",
+    "#     graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "# with tf.Graph().as_default() as graph:\n",
+    "#     tf.import_graph_def(graph_def)\n",
+    "\n",
+    "# op = graph.get_operations()\n",
+    "# x = []\n",
+    "# for i in op:\n",
+    "#     try:\n",
+    "#         #if i.values()[0].shape[-1] == 768:\n",
+    "#         if 'batchnorm/add_1' in i.values()[0].name:\n",
+    "#             x.append(i.values())\n",
+    "#     except:\n",
+    "#         pass\n",
+    "    \n",
+    "# x[-100:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-8-69933fee3918>:11: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.FastGFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "        \n",
+    "    if 'bert' in pb:\n",
+    "        inputs = ['Placeholder']\n",
+    "        a = ['dense/BiasAdd']\n",
+    "    if 'xlnet' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+    "        a = ['transpose_3']\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           inputs,\n",
+    "                                           ['logits', 'heads_seq'] + a, transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_graph(frozen_graph_filename, **kwargs):\n",
+    "    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "        graph_def = tf.GraphDef()\n",
+    "        graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "    # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "    # needed to import T5 graphs: rewrite RefSwitch/Assign* nodes into plain Switch/Sub/Add/Identity\n",
+    "    for node in graph_def.node:\n",
+    "        if node.op == 'RefSwitch':\n",
+    "            node.op = 'Switch'\n",
+    "            for index in range(len(node.input)):\n",
+    "                if 'moving_' in node.input[index]:\n",
+    "                    node.input[index] = node.input[index] + '/read'\n",
+    "        elif node.op == 'AssignSub':\n",
+    "            node.op = 'Sub'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'AssignAdd':\n",
+    "            node.op = 'Add'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'Assign':\n",
+    "            node.op = 'Identity'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "            if 'validate_shape' in node.attr:\n",
+    "                del node.attr['validate_shape']\n",
+    "            if len(node.input) == 2:\n",
+    "                node.input[0] = node.input[1]\n",
+    "                del node.input[1]\n",
+    "\n",
+    "    with tf.Graph().as_default() as graph:\n",
+    "        tf.import_graph_def(graph_def)\n",
+    "    return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['albert-base-dependency.pb.quantized',\n",
+       " 'albert-tiny-dependency.pb.quantized',\n",
+       " 'xlnet-base-dependency.pb.quantized',\n",
+       " 'bert-base-dependency.pb.quantized',\n",
+       " 'tiny-bert-dependency.pb.quantized',\n",
+       " 'alxlnet-base-dependency.pb.quantized']"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-emotion-model.ipynb b/session/quantization/quantize-emotion-model.ipynb
new file mode 100644
index 00000000..14686d0b
--- /dev/null
+++ b/session/quantization/quantize-emotion-model.ipynb
@@ -0,0 +1,385 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "S3_PATH_EMOTION = {\n",
+    "    'multinomial': {\n",
+    "        'model': 'v34/emotion/multinomial.pkl',\n",
+    "        'vector': 'v34/emotion/tfidf.pkl',\n",
+    "        'bpe': 'v34/emotion/bpe.model',\n",
+    "    },\n",
+    "    'bert': {\n",
+    "        'model': 'v34/emotion/bert-base-emotion.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'tiny-bert': {\n",
+    "        'model': 'v34/emotion/tiny-bert-emotion.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'albert': {\n",
+    "        'model': 'v34/emotion/albert-base-emotion.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'tiny-albert': {\n",
+    "        'model': 'v34/emotion/albert-tiny-emotion.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'xlnet': {\n",
+    "        'model': 'v34/emotion/xlnet-base-emotion.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "    'alxlnet': {\n",
+    "        'model': 'v34/emotion/alxlnet-base-emotion.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert\n",
+      "tiny-bert\n",
+      "albert\n",
+      "tiny-albert\n",
+      "xlnet\n",
+      "alxlnet\n"
+     ]
+    }
+   ],
+   "source": [
+    "for k in S3_PATH_EMOTION.keys():\n",
+    "    if k != 'multinomial':\n",
+    "        print(k)\n",
+    "        os.system(f\"wget https://f000.backblazeb2.com/file/malaya-model/{S3_PATH_EMOTION[k]['model']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['bert-base-emotion.pb',\n",
+       " 'xlnet-base-emotion.pb',\n",
+       " 'alxlnet-base-emotion.pb',\n",
+       " 'albert-base-emotion.pb',\n",
+       " 'tiny-bert-emotion.pb',\n",
+       " 'albert-tiny-emotion.pb']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-6-969a8c0cffd2>:11: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n",
+      "bert-base-emotion.pb ['Placeholder', 'Placeholder_1']\n",
+      "xlnet-base-emotion.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "alxlnet-base-emotion.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "albert-base-emotion.pb ['Placeholder', 'Placeholder_1']\n",
+      "tiny-bert-emotion.pb ['Placeholder', 'Placeholder_1']\n",
+      "albert-tiny-emotion.pb ['Placeholder', 'Placeholder_1']\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.FastGFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "    \n",
+    "    if 'bert' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1']\n",
+    "        outputs = ['dense/BiasAdd']\n",
+    "        \n",
+    "    if 'xlnet'in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+    "        outputs = ['transpose_3']\n",
+    "        \n",
+    "    print(pb, inputs)\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           inputs,\n",
+    "                                           ['logits', 'logits_seq'] + outputs, transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def load_graph(frozen_graph_filename, **kwargs):\n",
+    "#     with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "#         graph_def = tf.GraphDef()\n",
+    "#         graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "#     # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "#     # to fix import T5\n",
+    "#     for node in graph_def.node:\n",
+    "#         if node.op == 'RefSwitch':\n",
+    "#             node.op = 'Switch'\n",
+    "#             for index in range(len(node.input)):\n",
+    "#                 if 'moving_' in node.input[index]:\n",
+    "#                     node.input[index] = node.input[index] + '/read'\n",
+    "#         elif node.op == 'AssignSub':\n",
+    "#             node.op = 'Sub'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#         elif node.op == 'AssignAdd':\n",
+    "#             node.op = 'Add'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#         elif node.op == 'Assign':\n",
+    "#             node.op = 'Identity'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#             if 'validate_shape' in node.attr:\n",
+    "#                 del node.attr['validate_shape']\n",
+    "#             if len(node.input) == 2:\n",
+    "#                 node.input[0] = node.input[1]\n",
+    "#                 del node.input[1]\n",
+    "\n",
+    "#     with tf.Graph().as_default() as graph:\n",
+    "#         tf.import_graph_def(graph_def)\n",
+    "#     return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# g = load_graph('xlnet-base-emotion.pb.quantized')\n",
+    "# x = g.get_tensor_by_name('import/Placeholder:0')\n",
+    "# x_len = g.get_tensor_by_name('import/Placeholder_1:0')\n",
+    "# x_len2 = g.get_tensor_by_name('import/Placeholder_2:0')\n",
+    "# logits = g.get_tensor_by_name('import/logits:0')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x, x_len, logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test_sess = tf.InteractiveSession(graph = g)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]],\n",
+    "#                                   x_len2: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['bert-base-emotion.pb.quantized',\n",
+       " 'albert-base-emotion.pb.quantized',\n",
+       " 'xlnet-base-emotion.pb.quantized',\n",
+       " 'tiny-bert-emotion.pb.quantized',\n",
+       " 'alxlnet-base-emotion.pb.quantized',\n",
+       " 'albert-tiny-emotion.pb.quantized']"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-entity-model.ipynb b/session/quantization/quantize-entity-model.ipynb
new file mode 100644
index 00000000..7b2c81c6
--- /dev/null
+++ b/session/quantization/quantize-entity-model.ipynb
@@ -0,0 +1,413 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "S3_PATH_ENTITIES = {\n",
+    "    'bert': {\n",
+    "        'model': 'v34/entity/bert-base-entity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-entities.json',\n",
+    "    },\n",
+    "    'tiny-bert': {\n",
+    "        'model': 'v34/entity/tiny-bert-entity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-entities.json',\n",
+    "    },\n",
+    "    'albert': {\n",
+    "        'model': 'v34/entity/albert-base-entity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-entities.json',\n",
+    "    },\n",
+    "    'tiny-albert': {\n",
+    "        'model': 'v34/entity/albert-tiny-entity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-entities.json',\n",
+    "    },\n",
+    "    'xlnet': {\n",
+    "        'model': 'v34/entity/xlnet-base-entity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-entities.json',\n",
+    "    },\n",
+    "    'alxlnet': {\n",
+    "        'model': 'v34/entity/alxlnet-base-entity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-entities.json',\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert\n",
+      "tiny-bert\n",
+      "albert\n",
+      "tiny-albert\n",
+      "xlnet\n",
+      "alxlnet\n"
+     ]
+    }
+   ],
+   "source": [
+    "for k in S3_PATH_ENTITIES.keys():\n",
+    "    if k != 'multinomial':\n",
+    "        print(k)\n",
+    "        os.system(f\"wget https://f000.backblazeb2.com/file/malaya-model/{S3_PATH_ENTITIES[k]['model']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['xlnet-base-entity.pb',\n",
+       " 'alxlnet-base-entity.pb',\n",
+       " 'albert-tiny-entity.pb',\n",
+       " 'tiny-bert-entity.pb',\n",
+       " 'bert-base-entity.pb',\n",
+       " 'albert-base-entity.pb']"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-8-648ce936170b>:11: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n",
+      "xlnet-base-entity.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "alxlnet-base-entity.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "albert-tiny-entity.pb ['Placeholder', 'Placeholder_1']\n",
+      "tiny-bert-entity.pb ['Placeholder', 'Placeholder_1']\n",
+      "bert-base-entity.pb ['Placeholder', 'Placeholder_1']\n",
+      "albert-base-entity.pb ['Placeholder', 'Placeholder_1']\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.FastGFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "    \n",
+    "    if 'bert' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1']\n",
+    "        outputs = ['dense/BiasAdd']\n",
+    "    if 'xlnet'in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+    "        outputs = ['transpose_3']\n",
+    "        \n",
+    "    print(pb, inputs)\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           inputs,\n",
+    "                                           ['logits'] + outputs, transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_graph(frozen_graph_filename, **kwargs):\n",
+    "    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "        graph_def = tf.GraphDef()\n",
+    "        graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "    # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "    # needed to import T5 graphs: rewrite RefSwitch/Assign* nodes into plain Switch/Sub/Add/Identity\n",
+    "    for node in graph_def.node:\n",
+    "        if node.op == 'RefSwitch':\n",
+    "            node.op = 'Switch'\n",
+    "            for index in range(len(node.input)):\n",
+    "                if 'moving_' in node.input[index]:\n",
+    "                    node.input[index] = node.input[index] + '/read'\n",
+    "        elif node.op == 'AssignSub':\n",
+    "            node.op = 'Sub'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'AssignAdd':\n",
+    "            node.op = 'Add'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'Assign':\n",
+    "            node.op = 'Identity'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "            if 'validate_shape' in node.attr:\n",
+    "                del node.attr['validate_shape']\n",
+    "            if len(node.input) == 2:\n",
+    "                node.input[0] = node.input[1]\n",
+    "                del node.input[1]\n",
+    "\n",
+    "    with tf.Graph().as_default() as graph:\n",
+    "        tf.import_graph_def(graph_def)\n",
+    "    return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "g = load_graph('xlnet-base-entity.pb.quantized')\n",
+    "x = g.get_tensor_by_name('import/Placeholder:0')\n",
+    "x_len = g.get_tensor_by_name('import/Placeholder_1:0')\n",
+    "x_len2 = g.get_tensor_by_name('import/Placeholder_2:0')\n",
+    "logits = g.get_tensor_by_name('import/logits:0')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x, x_len, logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_sess = tf.InteractiveSession(graph = g)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 2.58 s, sys: 615 ms, total: 3.19 s\n",
+      "Wall time: 2.68 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([[2, 2, 2, 0, 0]], dtype=int32)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]],\n",
+    "                                  x_len2: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['bert-base-entity.pb.quantized',\n",
+       " 'tiny-bert-entity.pb.quantized',\n",
+       " 'alxlnet-base-entity.pb.quantized',\n",
+       " 'xlnet-base-entity.pb.quantized',\n",
+       " 'albert-tiny-entity.pb.quantized',\n",
+       " 'albert-base-entity.pb.quantized']"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-paraphrase-model-transformer.ipynb b/session/quantization/quantize-paraphrase-model-transformer.ipynb
new file mode 100644
index 00000000..8904584e
--- /dev/null
+++ b/session/quantization/quantize-paraphrase-model-transformer.ipynb
@@ -0,0 +1,271 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !wget https://f000.backblazeb2.com/file/malaya-model/v39/paraphrase/base.pb\n",
+    "# !wget https://f000.backblazeb2.com/file/malaya-model/v39/paraphrase/small.pb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['small.pb', 'base.pb']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow_text\n",
+    "import tf_sentencepiece"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-7-b31cde6b4607>:12: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n",
+      "small.pb\n",
+      "base.pb\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_constants(ignore_errors=true)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.FastGFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "        \n",
+    "    print(pb)\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           ['Placeholder', 'Placeholder_2'],\n",
+    "                                           ['greedy', 'beam', 'nucleus'], transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_graph(frozen_graph_filename, **kwargs):\n",
+    "    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "        graph_def = tf.GraphDef()\n",
+    "        graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "    # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "    # to fix import T5\n",
+    "    for node in graph_def.node:\n",
+    "        if node.op == 'RefSwitch':\n",
+    "            node.op = 'Switch'\n",
+    "            for index in range(len(node.input)):\n",
+    "                if 'moving_' in node.input[index]:\n",
+    "                    node.input[index] = node.input[index] + '/read'\n",
+    "        elif node.op == 'AssignSub':\n",
+    "            node.op = 'Sub'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'AssignAdd':\n",
+    "            node.op = 'Add'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'Assign':\n",
+    "            node.op = 'Identity'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "            if 'validate_shape' in node.attr:\n",
+    "                del node.attr['validate_shape']\n",
+    "            if len(node.input) == 2:\n",
+    "                node.input[0] = node.input[1]\n",
+    "                del node.input[1]\n",
+    "\n",
+    "    with tf.Graph().as_default() as graph:\n",
+    "        tf.import_graph_def(graph_def)\n",
+    "    return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['small.pb.quantized', 'base.pb.quantized']"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-paraphrase-model.ipynb b/session/quantization/quantize-paraphrase-model.ipynb
new file mode 100644
index 00000000..5cc9515c
--- /dev/null
+++ b/session/quantization/quantize-paraphrase-model.ipynb
@@ -0,0 +1,358 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2020-11-12 15:47:30--  https://f000.backblazeb2.com/file/malaya-model/v38/paraphrase/base.pb\n",
+      "Resolving f000.backblazeb2.com (f000.backblazeb2.com)... 104.153.233.177\n",
+      "Connecting to f000.backblazeb2.com (f000.backblazeb2.com)|104.153.233.177|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 \n",
+      "Length: 1252315113 (1.2G) [application/octet-stream]\n",
+      "Saving to: ‘base.pb’\n",
+      "\n",
+      "base.pb             100%[===================>]   1.17G  9.45MB/s    in 1m 58s  \n",
+      "\n",
+      "2020-11-12 15:49:31 (10.1 MB/s) - ‘base.pb’ saved [1252315113/1252315113]\n",
+      "\n",
+      "--2020-11-12 15:49:31--  https://f000.backblazeb2.com/file/malaya-model/v38/paraphrase/small.pb\n",
+      "Resolving f000.backblazeb2.com (f000.backblazeb2.com)... 104.153.233.177\n",
+      "Connecting to f000.backblazeb2.com (f000.backblazeb2.com)|104.153.233.177|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 \n",
+      "Length: 355193291 (339M) [application/octet-stream]\n",
+      "Saving to: ‘small.pb’\n",
+      "\n",
+      "small.pb            100%[===================>] 338.74M  11.3MB/s    in 31s     \n",
+      "\n",
+      "2020-11-12 15:50:04 (11.0 MB/s) - ‘small.pb’ saved [355193291/355193291]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!wget https://f000.backblazeb2.com/file/malaya-model/v38/paraphrase/base.pb\n",
+    "!wget https://f000.backblazeb2.com/file/malaya-model/v38/paraphrase/small.pb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['small.pb', 'base.pb']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow_text\n",
+    "import tf_sentencepiece"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-6-ed0c3462ae0b>:12: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n",
+      "small.pb\n",
+      "base.pb\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_constants(ignore_errors=true)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "#              'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.FastGFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "        \n",
+    "    print(pb)\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           ['inputs'],\n",
+    "                                           ['SentenceTokenizer_1/SentenceTokenizer/SentencepieceDetokenizeOp'], transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_graph(frozen_graph_filename, **kwargs):\n",
+    "    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "        graph_def = tf.GraphDef()\n",
+    "        graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "    # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "    # to fix import T5\n",
+    "    for node in graph_def.node:\n",
+    "        if node.op == 'RefSwitch':\n",
+    "            node.op = 'Switch'\n",
+    "            for index in range(len(node.input)):\n",
+    "                if 'moving_' in node.input[index]:\n",
+    "                    node.input[index] = node.input[index] + '/read'\n",
+    "        elif node.op == 'AssignSub':\n",
+    "            node.op = 'Sub'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'AssignAdd':\n",
+    "            node.op = 'Add'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'Assign':\n",
+    "            node.op = 'Identity'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "            if 'validate_shape' in node.attr:\n",
+    "                del node.attr['validate_shape']\n",
+    "            if len(node.input) == 2:\n",
+    "                node.input[0] = node.input[1]\n",
+    "                del node.input[1]\n",
+    "\n",
+    "    with tf.Graph().as_default() as graph:\n",
+    "        tf.import_graph_def(graph_def)\n",
+    "    return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# g = load_graph('base.pb.quantized')\n",
+    "# x = g.get_tensor_by_name('import/inputs:0')\n",
+    "# logits = g.get_tensor_by_name('import/SentenceTokenizer_1/SentenceTokenizer/SentencepieceDetokenizeOp:0')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x, x_len, logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test_sess = tf.InteractiveSession(graph = g)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: ['ringkasan: KUALA LUMPUR: Presiden Perancis Emmanuel Macron tidak menampakkan beliau seorang sosok yang bertamadun, selar Tun Dr Mahathir Mohamad menerusi kemas kini terbaharu di blognya. Bekas Perdana Menteri itu mendakwa, pemerintah tertinggi Perancis itu bersikap primitif kerana menuduh orang Islam terlibat dalam pembunuhan guru yang menghina Islam, malah menegaskan tindakan membunuh bukan ajaran Islam. Jelas Dr Mahathir, sejarah membuktikan bahawa orang Perancis pernah membunuh jutaan manusia, yang ramai mangsanya terdiri dari orang Islam.']})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['small.pb.quantized', 'base.pb.quantized']"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-pos-model.ipynb b/session/quantization/quantize-pos-model.ipynb
new file mode 100644
index 00000000..191df7f3
--- /dev/null
+++ b/session/quantization/quantize-pos-model.ipynb
@@ -0,0 +1,413 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "S3_PATH_POS = {\n",
+    "    'bert': {\n",
+    "        'model': 'v34/pos/bert-base-pos.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-pos.json',\n",
+    "    },\n",
+    "    'tiny-bert': {\n",
+    "        'model': 'v34/pos/tiny-bert-pos.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-pos.json',\n",
+    "    },\n",
+    "    'albert': {\n",
+    "        'model': 'v34/pos/albert-base-pos.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-pos.json',\n",
+    "    },\n",
+    "    'tiny-albert': {\n",
+    "        'model': 'v34/pos/albert-tiny-pos.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-pos.json',\n",
+    "    },\n",
+    "    'xlnet': {\n",
+    "        'model': 'v34/pos/xlnet-base-pos.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-pos.json',\n",
+    "    },\n",
+    "    'alxlnet': {\n",
+    "        'model': 'v34/pos/alxlnet-base-pos.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "        'setting': 'bert-bahasa/dictionary-pos.json',\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert\n",
+      "tiny-bert\n",
+      "albert\n",
+      "tiny-albert\n",
+      "xlnet\n",
+      "alxlnet\n"
+     ]
+    }
+   ],
+   "source": [
+    "for k in S3_PATH_POS.keys():\n",
+    "    if k != 'multinomial':\n",
+    "        print(k)\n",
+    "        os.system(f\"wget https://f000.backblazeb2.com/file/malaya-model/{S3_PATH_POS[k]['model']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['bert-base-pos.pb',\n",
+       " 'xlnet-base-pos.pb',\n",
+       " 'alxlnet-base-pos.pb',\n",
+       " 'albert-base-pos.pb',\n",
+       " 'tiny-bert-pos.pb',\n",
+       " 'albert-tiny-pos.pb']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-6-648ce936170b>:11: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n",
+      "bert-base-pos.pb ['Placeholder', 'Placeholder_1']\n",
+      "xlnet-base-pos.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "alxlnet-base-pos.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "albert-base-pos.pb ['Placeholder', 'Placeholder_1']\n",
+      "tiny-bert-pos.pb ['Placeholder', 'Placeholder_1']\n",
+      "albert-tiny-pos.pb ['Placeholder', 'Placeholder_1']\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.FastGFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "    \n",
+    "    if 'bert' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1']\n",
+    "        outputs = ['dense/BiasAdd']\n",
+    "    if 'xlnet' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+    "        outputs = ['transpose_3']\n",
+    "        \n",
+    "    print(pb, inputs)\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           inputs,\n",
+    "                                           ['logits'] + outputs, transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_graph(frozen_graph_filename, **kwargs):\n",
+    "    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "        graph_def = tf.GraphDef()\n",
+    "        graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "    # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "    # to fix import T5\n",
+    "    for node in graph_def.node:\n",
+    "        if node.op == 'RefSwitch':\n",
+    "            node.op = 'Switch'\n",
+    "            for index in range(len(node.input)):\n",
+    "                if 'moving_' in node.input[index]:\n",
+    "                    node.input[index] = node.input[index] + '/read'\n",
+    "        elif node.op == 'AssignSub':\n",
+    "            node.op = 'Sub'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'AssignAdd':\n",
+    "            node.op = 'Add'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'Assign':\n",
+    "            node.op = 'Identity'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "            if 'validate_shape' in node.attr:\n",
+    "                del node.attr['validate_shape']\n",
+    "            if len(node.input) == 2:\n",
+    "                node.input[0] = node.input[1]\n",
+    "                del node.input[1]\n",
+    "\n",
+    "    with tf.Graph().as_default() as graph:\n",
+    "        tf.import_graph_def(graph_def)\n",
+    "    return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "g = load_graph('xlnet-base-pos.pb.quantized')\n",
+    "x = g.get_tensor_by_name('import/Placeholder:0')\n",
+    "x_len = g.get_tensor_by_name('import/Placeholder_1:0')\n",
+    "x_len2 = g.get_tensor_by_name('import/Placeholder_2:0')\n",
+    "logits = g.get_tensor_by_name('import/logits:0')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x, x_len, logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_sess = tf.InteractiveSession(graph = g)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 2.62 s, sys: 318 ms, total: 2.93 s\n",
+      "Wall time: 2.45 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([[5, 1, 1, 1, 5]], dtype=int32)"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]],\n",
+    "                                  x_len2: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['alxlnet-base-pos.pb.quantized',\n",
+       " 'xlnet-base-pos.pb.quantized',\n",
+       " 'bert-base-pos.pb.quantized',\n",
+       " 'tiny-bert-pos.pb.quantized',\n",
+       " 'albert-tiny-pos.pb.quantized',\n",
+       " 'albert-base-pos.pb.quantized']"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-relevancy-model.ipynb b/session/quantization/quantize-relevancy-model.ipynb
new file mode 100644
index 00000000..3c210530
--- /dev/null
+++ b/session/quantization/quantize-relevancy-model.ipynb
@@ -0,0 +1,397 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "S3_PATH_RELEVANCY = {\n",
+    "    'bert': {\n",
+    "        'model': 'v40/relevancy/bert-base-relevancy.pb',\n",
+    "        'quantized': 'v40/relevancy/bert-base-relevancy.pb.quantized',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'tiny-bert': {\n",
+    "        'model': 'v40/relevancy/tiny-bert-relevancy.pb',\n",
+    "        'quantized': 'v40/relevancy/tiny-bert-relevancy.pb.quantized',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'albert': {\n",
+    "        'model': 'v40/relevancy/albert-base-relevancy.pb',\n",
+    "        'quantized': 'v40/relevancy/albert-base-relevancy.pb.quantized',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'tiny-albert': {\n",
+    "        'model': 'v40/relevancy/albert-tiny-relevancy.pb',\n",
+    "        'quantized': 'v40/relevancy/albert-tiny-relevancy.pb.quantized',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'xlnet': {\n",
+    "        'model': 'v40/relevancy/xlnet-base-relevancy.pb',\n",
+    "        'quantized': 'v40/relevancy/xlnet-base-relevancy.pb.quantized',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "    'alxlnet': {\n",
+    "        'model': 'v40/relevancy/alxlnet-base-relevancy.pb',\n",
+    "        'quantized': 'v40/relevancy/alxlnet-base-relevancy.pb.quantized',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert\n",
+      "tiny-bert\n",
+      "albert\n",
+      "tiny-albert\n",
+      "xlnet\n",
+      "alxlnet\n"
+     ]
+    }
+   ],
+   "source": [
+    "for k in S3_PATH_RELEVANCY.keys():\n",
+    "    if k != 'multinomial':\n",
+    "        print(k)\n",
+    "        os.system(f\"wget https://f000.backblazeb2.com/file/malaya-model/{S3_PATH_RELEVANCY[k]['model']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['bert-base-relevancy.pb',\n",
+       " 'tiny-bert-relevancy.pb',\n",
+       " 'xlnet-base-relevancy.pb',\n",
+       " 'albert-tiny-relevancy.pb',\n",
+       " 'albert-base-relevancy.pb',\n",
+       " 'alxlnet-base-relevancy.pb']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-6-969a8c0cffd2>:11: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n",
+      "bert-base-relevancy.pb ['Placeholder', 'Placeholder_1']\n",
+      "tiny-bert-relevancy.pb ['Placeholder', 'Placeholder_1']\n",
+      "xlnet-base-relevancy.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "albert-tiny-relevancy.pb ['Placeholder', 'Placeholder_1']\n",
+      "albert-base-relevancy.pb ['Placeholder', 'Placeholder_1']\n",
+      "alxlnet-base-relevancy.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "# Apply the transforms above to every frozen graph in `pbs` and save the result as '<name>.quantized'.\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.GFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "    # Input placeholders and the extra output node are chosen from the model family in the file name.\n",
+    "    if 'xlnet' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+    "        outputs = ['transpose_3']\n",
+    "    elif 'bert' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1']\n",
+    "        outputs = ['dense/BiasAdd']\n",
+    "    else:\n",
+    "        raise ValueError(f'cannot infer input/output nodes for {pb}')\n",
+    "    print(pb, inputs)\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def,\n",
+    "                                           inputs,\n",
+    "                                           ['logits', 'logits_seq'] + outputs, transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def load_graph(frozen_graph_filename, **kwargs):\n",
+    "#     with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "#         graph_def = tf.GraphDef()\n",
+    "#         graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "#     # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "#     # to fix import T5\n",
+    "#     for node in graph_def.node:\n",
+    "#         if node.op == 'RefSwitch':\n",
+    "#             node.op = 'Switch'\n",
+    "#             for index in range(len(node.input)):\n",
+    "#                 if 'moving_' in node.input[index]:\n",
+    "#                     node.input[index] = node.input[index] + '/read'\n",
+    "#         elif node.op == 'AssignSub':\n",
+    "#             node.op = 'Sub'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#         elif node.op == 'AssignAdd':\n",
+    "#             node.op = 'Add'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#         elif node.op == 'Assign':\n",
+    "#             node.op = 'Identity'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#             if 'validate_shape' in node.attr:\n",
+    "#                 del node.attr['validate_shape']\n",
+    "#             if len(node.input) == 2:\n",
+    "#                 node.input[0] = node.input[1]\n",
+    "#                 del node.input[1]\n",
+    "\n",
+    "#     with tf.Graph().as_default() as graph:\n",
+    "#         tf.import_graph_def(graph_def)\n",
+    "#     return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# g = load_graph('xlnet-base-relevancy.pb.quantized')\n",
+    "# x = g.get_tensor_by_name('import/Placeholder:0')\n",
+    "# x_len = g.get_tensor_by_name('import/Placeholder_1:0')\n",
+    "# x_len2 = g.get_tensor_by_name('import/Placeholder_2:0')\n",
+    "# logits = g.get_tensor_by_name('import/logits:0')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x, x_len, logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test_sess = tf.InteractiveSession(graph = g)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]],\n",
+    "#                                   x_len2: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['xlnet-base-relevancy.pb.quantized',\n",
+       " 'alxlnet-base-relevancy.pb.quantized',\n",
+       " 'bert-base-relevancy.pb.quantized',\n",
+       " 'albert-tiny-relevancy.pb.quantized',\n",
+       " 'tiny-bert-relevancy.pb.quantized',\n",
+       " 'albert-base-relevancy.pb.quantized']"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-sentiment-model.ipynb b/session/quantization/quantize-sentiment-model.ipynb
new file mode 100644
index 00000000..418ef0d5
--- /dev/null
+++ b/session/quantization/quantize-sentiment-model.ipynb
@@ -0,0 +1,409 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "S3_PATH_SENTIMENT = {\n",
+    "    'multinomial': {\n",
+    "        'model': 'v34/sentiment/multinomial.pkl',\n",
+    "        'vector': 'v34/sentiment/tfidf.pkl',\n",
+    "        'bpe': 'v34/sentiment/bpe.model',\n",
+    "    },\n",
+    "    'bert': {\n",
+    "        'model': 'v34/sentiment/bert-base-sentiment.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'tiny-bert': {\n",
+    "        'model': 'v34/sentiment/tiny-bert-sentiment.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "        'quantized': 'v40/sentiment/quantized-tiny-bert-sentiment.pb',\n",
+    "    },\n",
+    "    'albert': {\n",
+    "        'model': 'v34/sentiment/albert-base-sentiment.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'tiny-albert': {\n",
+    "        'model': 'v34/sentiment/albert-tiny-sentiment.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'xlnet': {\n",
+    "        'model': 'v34/sentiment/xlnet-base-sentiment.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "    'alxlnet': {\n",
+    "        'model': 'v34/sentiment/alxlnet-base-sentiment.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert\n",
+      "tiny-bert\n",
+      "albert\n",
+      "tiny-albert\n",
+      "xlnet\n",
+      "alxlnet\n"
+     ]
+    }
+   ],
+   "source": [
+    "for k in S3_PATH_SENTIMENT.keys():\n",
+    "    if k != 'multinomial':\n",
+    "        print(k)\n",
+    "        os.system(f\"wget https://f000.backblazeb2.com/file/malaya-model/{S3_PATH_SENTIMENT[k]['model']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['albert-tiny-sentiment.pb',\n",
+       " 'xlnet-base-sentiment.pb',\n",
+       " 'albert-base-sentiment.pb',\n",
+       " 'tiny-bert-sentiment.pb',\n",
+       " 'bert-base-sentiment.pb',\n",
+       " 'alxlnet-base-sentiment.pb']"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# with tf.gfile.GFile('alxlnet-base-sentiment.pb', \"rb\") as f:\n",
+    "#     graph_def = tf.GraphDef()\n",
+    "#     graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "# with tf.Graph().as_default() as graph:\n",
+    "#     tf.import_graph_def(graph_def)\n",
+    "\n",
+    "# op = graph.get_operations()\n",
+    "# x = []\n",
+    "# for i in op:\n",
+    "#     try:\n",
+    "#         #if 'pooler' in i.values()[0].name:\n",
+    "#         x.append(i.values())\n",
+    "#     except:\n",
+    "#         pass\n",
+    "    \n",
+    "# x[-100:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-13-969a8c0cffd2>:11: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n",
+      "albert-tiny-sentiment.pb ['Placeholder', 'Placeholder_1']\n",
+      "xlnet-base-sentiment.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "albert-base-sentiment.pb ['Placeholder', 'Placeholder_1']\n",
+      "tiny-bert-sentiment.pb ['Placeholder', 'Placeholder_1']\n",
+      "bert-base-sentiment.pb ['Placeholder', 'Placeholder_1']\n",
+      "alxlnet-base-sentiment.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "# Apply the transforms above to every frozen graph in `pbs` and save the result as '<name>.quantized'.\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.GFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "    # Input placeholders and the extra output node are chosen from the model family in the file name.\n",
+    "    if 'xlnet' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+    "        outputs = ['transpose_3']\n",
+    "    elif 'bert' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1']\n",
+    "        outputs = ['dense/BiasAdd']\n",
+    "    else:\n",
+    "        raise ValueError(f'cannot infer input/output nodes for {pb}')\n",
+    "    print(pb, inputs)\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def,\n",
+    "                                           inputs,\n",
+    "                                           ['logits', 'logits_seq'] + outputs, transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def load_graph(frozen_graph_filename, **kwargs):\n",
+    "#     with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "#         graph_def = tf.GraphDef()\n",
+    "#         graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "#     # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "#     # to fix import T5\n",
+    "#     for node in graph_def.node:\n",
+    "#         if node.op == 'RefSwitch':\n",
+    "#             node.op = 'Switch'\n",
+    "#             for index in range(len(node.input)):\n",
+    "#                 if 'moving_' in node.input[index]:\n",
+    "#                     node.input[index] = node.input[index] + '/read'\n",
+    "#         elif node.op == 'AssignSub':\n",
+    "#             node.op = 'Sub'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#         elif node.op == 'AssignAdd':\n",
+    "#             node.op = 'Add'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#         elif node.op == 'Assign':\n",
+    "#             node.op = 'Identity'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#             if 'validate_shape' in node.attr:\n",
+    "#                 del node.attr['validate_shape']\n",
+    "#             if len(node.input) == 2:\n",
+    "#                 node.input[0] = node.input[1]\n",
+    "#                 del node.input[1]\n",
+    "\n",
+    "#     with tf.Graph().as_default() as graph:\n",
+    "#         tf.import_graph_def(graph_def)\n",
+    "#     return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# g = load_graph('test.pb')\n",
+    "# x = g.get_tensor_by_name('import/Placeholder:0')\n",
+    "# x_len = g.get_tensor_by_name('import/Placeholder_1:0')\n",
+    "# logits = g.get_tensor_by_name('import/logits:0')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x, x_len, logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test_sess = tf.InteractiveSession(graph = g)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['albert-base-sentiment.pb.quantized',\n",
+       " 'xlnet-base-sentiment.pb.quantized',\n",
+       " 'albert-tiny-sentiment.pb.quantized',\n",
+       " 'bert-base-sentiment.pb.quantized',\n",
+       " 'alxlnet-base-sentiment.pb.quantized',\n",
+       " 'tiny-bert-sentiment.pb.quantized']"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-similarity-model.ipynb b/session/quantization/quantize-similarity-model.ipynb
new file mode 100644
index 00000000..6fab7a24
--- /dev/null
+++ b/session/quantization/quantize-similarity-model.ipynb
@@ -0,0 +1,623 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "S3_PATH_SIMILARITY = {\n",
+    "    'bert': {\n",
+    "        'model': 'v36/similarity/bert-base-similarity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'tiny-bert': {\n",
+    "        'model': 'v36/similarity/tiny-bert-similarity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'albert': {\n",
+    "        'model': 'v36/similarity/albert-base-similarity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'tiny-albert': {\n",
+    "        'model': 'v36/similarity/albert-tiny-similarity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'xlnet': {\n",
+    "        'model': 'v36/similarity/xlnet-base-similarity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "    'alxlnet': {\n",
+    "        'model': 'v36/similarity/alxlnet-base-similarity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert\n",
+      "tiny-bert\n",
+      "albert\n",
+      "tiny-albert\n",
+      "xlnet\n",
+      "alxlnet\n"
+     ]
+    }
+   ],
+   "source": [
+    "for k in S3_PATH_SIMILARITY.keys():\n",
+    "    if k != 'multinomial':\n",
+    "        print(k)\n",
+    "        os.system(f\"wget https://f000.backblazeb2.com/file/malaya-model/{S3_PATH_SIMILARITY[k]['model']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# with tf.gfile.GFile('tiny-bert-similarity.pb', \"rb\") as f:\n",
+    "#     graph_def = tf.GraphDef()\n",
+    "#     graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "# with tf.Graph().as_default() as graph:\n",
+    "#     tf.import_graph_def(graph_def)\n",
+    "\n",
+    "# op = graph.get_operations()\n",
+    "# x = []\n",
+    "# for i in op:\n",
+    "#     try:\n",
+    "#         if i.values()[0].shape[-1] == 312:\n",
+    "#         #if 'import/bert/encoder/layer_11/output/LayerNorm/batchnorm/add' in i.values()[0].name:\n",
+    "#             x.append(i.values())\n",
+    "#     except Exception as e:\n",
+    "#         pass\n",
+    "    \n",
+    "# x[-100:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mapping = {'albert-base-similarity.pb': 'import/bert/encoder/transformer/group_0_11/layer_11/inner_group_0/LayerNorm_1/batchnorm/add_1:0',\n",
+    "          'albert-tiny-similarity.pb': 'import/bert/encoder/transformer/group_0_3/layer_3/inner_group_0/LayerNorm_1/batchnorm/add_1:0',\n",
+    "          'bert-base-similarity.pb': 'import/bert/encoder/layer_11/output/LayerNorm/batchnorm/add_1:0',\n",
+    "          'tiny-bert-similarity.pb': 'import/bert/encoder/layer_3/output/LayerNorm/batchnorm/add_1:0',\n",
+    "          'xlnet-base-similarity.pb': 'import/model/transformer/layer_11/ff/LayerNorm/batchnorm/add_1:0',\n",
+    "          'alxlnet-base-similarity.pb': 'import/model/transformer/layer_shared_11/ff/LayerNorm/batchnorm/add_1:0'}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['albert-tiny-similarity.pb',\n",
+       " 'xlnet-base-similarity.pb',\n",
+       " 'albert-base-similarity.pb',\n",
+       " 'bert-base-similarity.pb',\n",
+       " 'alxlnet-base-similarity.pb',\n",
+       " 'tiny-bert-similarity.pb']"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-16-9f7922d092f8>:11: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n",
+      "albert-tiny-similarity.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "xlnet-base-similarity.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "albert-base-similarity.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "bert-base-similarity.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "alxlnet-base-similarity.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "tiny-bert-similarity.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.FastGFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "        \n",
+    "    inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+    "    \n",
+    "    if 'bert' in pb:\n",
+    "        outputs = ['logits', 'bert/pooler/dense/BiasAdd']\n",
+    "        \n",
+    "    if 'xlnet'in pb:\n",
+    "        outputs = ['logits', 'model_1/sequnece_summary/summary/BiasAdd']\n",
+    "        \n",
+    "    a = [mapping[pb].replace('import/','').replace(':0','')]\n",
+    "        \n",
+    "    print(pb, inputs)\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           inputs,\n",
+    "                                           outputs + a, transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_graph(frozen_graph_filename, **kwargs):\n",
+    "    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "        graph_def = tf.GraphDef()\n",
+    "        graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "    # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "    # to fix importing T5: rewrite ref/assign ops into stateless equivalents\n",
+    "    for node in graph_def.node:\n",
+    "        if node.op == 'RefSwitch':\n",
+    "            node.op = 'Switch'\n",
+    "            for index in range(len(node.input)):\n",
+    "                if 'moving_' in node.input[index]:\n",
+    "                    node.input[index] = node.input[index] + '/read'\n",
+    "        elif node.op == 'AssignSub':\n",
+    "            node.op = 'Sub'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'AssignAdd':\n",
+    "            node.op = 'Add'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'Assign':\n",
+    "            node.op = 'Identity'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "            if 'validate_shape' in node.attr:\n",
+    "                del node.attr['validate_shape']\n",
+    "            if len(node.input) == 2:\n",
+    "                node.input[0] = node.input[1]\n",
+    "                del node.input[1]\n",
+    "\n",
+    "    with tf.Graph().as_default() as graph:\n",
+    "        tf.import_graph_def(graph_def)\n",
+    "    return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "g = load_graph('tiny-bert-similarity.pb')\n",
+    "x = g.get_tensor_by_name('import/Placeholder:0')\n",
+    "segment_ids = g.get_tensor_by_name('import/Placeholder_1:0')\n",
+    "input_masks = g.get_tensor_by_name('import/Placeholder_2:0')\n",
+    "logits = g.get_tensor_by_name(mapping['tiny-bert-similarity.pb'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<tf.Tensor 'import/bert/encoder/layer_3/output/LayerNorm/batchnorm/add_1:0' shape=(?, 312) dtype=float32>"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_sess = tf.InteractiveSession(graph = g)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/malaya/function/__init__.py:50: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import malaya\n",
+    "\n",
+    "model = malaya.similarity.transformer(model = 'alxlnet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from malaya.text.bpe import xlnet_tokenization\n",
+    "import numpy as np\n",
+    "\n",
+    "r = xlnet_tokenization(model._tokenizer, ['benci', 'suka', 'hodoh la', 'sakai bodoh la la la la'])\n",
+    "batch_x = r[0]\n",
+    "batch_mask = r[1]\n",
+    "batch_segment = np.array(r[2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[1, 1, 2, 4, 4, 4, 4, 4, 4],\n",
+       "       [1, 1, 2, 4, 4, 4, 4, 4, 4],\n",
+       "       [1, 1, 1, 2, 4, 4, 4, 4, 4],\n",
+       "       [1, 1, 1, 1, 1, 1, 1, 1, 2]])"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "batch_segment[batch_segment == 0 ] = 1\n",
+    "batch_segment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4, 9)"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.array(batch_x).shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 70 ms, sys: 4.86 ms, total: 74.9 ms\n",
+      "Wall time: 15.9 ms\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(36, 312)"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "l = test_sess.run(logits, feed_dict = {x: batch_x,\n",
+    "                                  segment_ids: batch_segment,\n",
+    "                                  input_masks: batch_mask})\n",
+    "l.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(l.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4, 9, 312)"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "l.reshape((*np.array(batch_x).shape,-1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "l = test_sess.run(logits, feed_dict = {x: batch_x,\n",
+    "                                  segment_ids: batch_segment,\n",
+    "                                  input_masks: batch_mask})\n",
+    "l.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x, x_len, logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['albert-base-similarity.pb.quantized',\n",
+       " 'albert-tiny-similarity.pb.quantized',\n",
+       " 'bert-base-similarity.pb.quantized',\n",
+       " 'xlnet-base-similarity.pb.quantized',\n",
+       " 'tiny-bert-similarity.pb.quantized',\n",
+       " 'alxlnet-base-similarity.pb.quantized']"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-stem-model.ipynb b/session/quantization/quantize-stem-model.ipynb
new file mode 100644
index 00000000..1835d65f
--- /dev/null
+++ b/session/quantization/quantize-stem-model.ipynb
@@ -0,0 +1,197 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !wget https://f000.backblazeb2.com/file/malaya-model/v34/stem/model.pb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from tensorflow.contrib.seq2seq.python.ops import beam_search_ops\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['pretrained-speakernet.pb',\n",
+       " 'pretrained-vggvox-v1.pb',\n",
+       " 'pretrained-vggvox-v2.pb',\n",
+       " 'pretrained-deep-speaker.pb',\n",
+       " 'model.pb']"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-7-5104ad4c8d58>:11: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in ['model.pb']:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.FastGFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           ['Placeholder'],\n",
+    "                                           ['decode_1/greedy', 'decode_2/beam'], transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm model.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-subjectivity-model.ipynb b/session/quantization/quantize-subjectivity-model.ipynb
new file mode 100644
index 00000000..6a481b2d
--- /dev/null
+++ b/session/quantization/quantize-subjectivity-model.ipynb
@@ -0,0 +1,394 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "S3_PATH_SUBJECTIVE = {\n",
+    "    'multinomial': {\n",
+    "        'model': 'v34/subjective/multinomial.pkl',\n",
+    "        'vector': 'v34/subjective/tfidf.pkl',\n",
+    "        'bpe': 'v34/subjective/bpe.model',\n",
+    "    },\n",
+    "    'bert': {\n",
+    "        'model': 'v34/subjective/bert-base-subjective.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'tiny-bert': {\n",
+    "        'model': 'v34/subjective/tiny-bert-subjective.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'albert': {\n",
+    "        'model': 'v34/subjective/albert-base-subjective.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'tiny-albert': {\n",
+    "        'model': 'v34/subjective/albert-tiny-subjective.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'xlnet': {\n",
+    "        'model': 'v34/subjective/xlnet-base-subjective.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "    'alxlnet': {\n",
+    "        'model': 'v34/subjective/alxlnet-base-subjective.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert\n",
+      "tiny-bert\n",
+      "albert\n",
+      "tiny-albert\n",
+      "xlnet\n",
+      "alxlnet\n"
+     ]
+    }
+   ],
+   "source": [
+    "for k in S3_PATH_SUBJECTIVE.keys():\n",
+    "    if k != 'multinomial':\n",
+    "        print(k)\n",
+    "        os.system(f\"wget https://f000.backblazeb2.com/file/malaya-model/{S3_PATH_SUBJECTIVE[k]['model']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['albert-base-subjective.pb',\n",
+       " 'xlnet-base-subjective.pb',\n",
+       " 'albert-tiny-subjective.pb',\n",
+       " 'bert-base-subjective.pb',\n",
+       " 'alxlnet-base-subjective.pb',\n",
+       " 'tiny-bert-subjective.pb']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-6-969a8c0cffd2>:11: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n",
+      "albert-base-subjective.pb ['Placeholder', 'Placeholder_1']\n",
+      "xlnet-base-subjective.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "albert-tiny-subjective.pb ['Placeholder', 'Placeholder_1']\n",
+      "bert-base-subjective.pb ['Placeholder', 'Placeholder_1']\n",
+      "alxlnet-base-subjective.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "tiny-bert-subjective.pb ['Placeholder', 'Placeholder_1']\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.FastGFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "    \n",
+    "    if 'bert' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1']\n",
+    "        outputs = ['dense/BiasAdd']\n",
+    "        \n",
+    "    if 'xlnet'in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+    "        outputs = ['transpose_3']\n",
+    "        \n",
+    "    print(pb, inputs)\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           inputs,\n",
+    "                                           ['logits', 'logits_seq'] + outputs, transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def load_graph(frozen_graph_filename, **kwargs):\n",
+    "#     with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "#         graph_def = tf.GraphDef()\n",
+    "#         graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "#     # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "#     # to fix import T5\n",
+    "#     for node in graph_def.node:\n",
+    "#         if node.op == 'RefSwitch':\n",
+    "#             node.op = 'Switch'\n",
+    "#             for index in range(len(node.input)):\n",
+    "#                 if 'moving_' in node.input[index]:\n",
+    "#                     node.input[index] = node.input[index] + '/read'\n",
+    "#         elif node.op == 'AssignSub':\n",
+    "#             node.op = 'Sub'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#         elif node.op == 'AssignAdd':\n",
+    "#             node.op = 'Add'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#         elif node.op == 'Assign':\n",
+    "#             node.op = 'Identity'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#             if 'validate_shape' in node.attr:\n",
+    "#                 del node.attr['validate_shape']\n",
+    "#             if len(node.input) == 2:\n",
+    "#                 node.input[0] = node.input[1]\n",
+    "#                 del node.input[1]\n",
+    "\n",
+    "#     with tf.Graph().as_default() as graph:\n",
+    "#         tf.import_graph_def(graph_def)\n",
+    "#     return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# g = load_graph('xlnet-base-emotion.pb.quantized')\n",
+    "# x = g.get_tensor_by_name('import/Placeholder:0')\n",
+    "# x_len = g.get_tensor_by_name('import/Placeholder_1:0')\n",
+    "# x_len2 = g.get_tensor_by_name('import/Placeholder_2:0')\n",
+    "# logits = g.get_tensor_by_name('import/logits:0')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x, x_len, logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test_sess = tf.InteractiveSession(graph = g)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]],\n",
+    "#                                   x_len2: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['xlnet-base-subjective.pb.quantized',\n",
+       " 'alxlnet-base-subjective.pb.quantized',\n",
+       " 'albert-base-subjective.pb.quantized',\n",
+       " 'bert-base-subjective.pb.quantized',\n",
+       " 'albert-tiny-subjective.pb.quantized',\n",
+       " 'tiny-bert-subjective.pb.quantized']"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-summarization-abstractive-model-transformer.ipynb b/session/quantization/quantize-summarization-abstractive-model-transformer.ipynb
new file mode 100644
index 00000000..c16c6b16
--- /dev/null
+++ b/session/quantization/quantize-summarization-abstractive-model-transformer.ipynb
@@ -0,0 +1,421 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2020-11-01 22:13:38--  https://f000.backblazeb2.com/file/malaya-model/v39/summarization/base.pb\n",
+      "Resolving f000.backblazeb2.com (f000.backblazeb2.com)... 104.153.233.177\n",
+      "Connecting to f000.backblazeb2.com (f000.backblazeb2.com)|104.153.233.177|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 \n",
+      "Length: 831717841 (793M) [application/octet-stream]\n",
+      "Saving to: ‘base.pb’\n",
+      "\n",
+      "base.pb             100%[===================>] 793.19M  14.1MB/s    in 76s     \n",
+      "\n",
+      "2020-11-01 22:14:55 (10.5 MB/s) - ‘base.pb’ saved [831717841/831717841]\n",
+      "\n",
+      "--2020-11-01 22:14:56--  https://f000.backblazeb2.com/file/malaya-model/v39/summarization/small.pb\n",
+      "Resolving f000.backblazeb2.com (f000.backblazeb2.com)... 104.153.233.177\n",
+      "Connecting to f000.backblazeb2.com (f000.backblazeb2.com)|104.153.233.177|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 \n",
+      "Length: 378870799 (361M) [application/octet-stream]\n",
+      "Saving to: ‘small.pb’\n",
+      "\n",
+      "small.pb            100%[===================>] 361.32M  12.2MB/s    in 38s     \n",
+      "\n",
+      "2020-11-01 22:15:35 (9.61 MB/s) - ‘small.pb’ saved [378870799/378870799]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!wget https://f000.backblazeb2.com/file/malaya-model/v39/summarization/base.pb\n",
+    "!wget https://f000.backblazeb2.com/file/malaya-model/v39/summarization/small.pb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['small.pb', 'base.pb']"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow_text\n",
+    "import tf_sentencepiece"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "small.pb\n",
+      "base.pb\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_constants(ignore_errors=true)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.GFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "        \n",
+    "    print(pb)\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           ['Placeholder', 'Placeholder_2'],\n",
+    "                                           ['greedy', 'beam', 'nucleus'], transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_graph(frozen_graph_filename, **kwargs):\n",
+    "    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "        graph_def = tf.GraphDef()\n",
+    "        graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "    # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "    # to fix import T5\n",
+    "    for node in graph_def.node:\n",
+    "        if node.op == 'RefSwitch':\n",
+    "            node.op = 'Switch'\n",
+    "            for index in range(len(node.input)):\n",
+    "                if 'moving_' in node.input[index]:\n",
+    "                    node.input[index] = node.input[index] + '/read'\n",
+    "        elif node.op == 'AssignSub':\n",
+    "            node.op = 'Sub'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'AssignAdd':\n",
+    "            node.op = 'Add'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'Assign':\n",
+    "            node.op = 'Identity'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "            if 'validate_shape' in node.attr:\n",
+    "                del node.attr['validate_shape']\n",
+    "            if len(node.input) == 2:\n",
+    "                node.input[0] = node.input[1]\n",
+    "                del node.input[1]\n",
+    "\n",
+    "    with tf.Graph().as_default() as graph:\n",
+    "        tf.import_graph_def(graph_def)\n",
+    "    return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "downloading frozen /home/husein/Malaya/summarize/transformer/small model\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "362MB [00:29, 12.4MB/s]                          \n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "downloading frozen /home/husein/Malaya/summarize/transformer/small vocab\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "132%|██████████| 1.00/0.76 [00:01<00:00, 1.04s/MB]\n",
+      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/malaya/function/__init__.py:50: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.\n",
+      "\n",
+      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/malaya/function/__init__.py:65: The name tf.InteractiveSession is deprecated. Please use tf.compat.v1.InteractiveSession instead.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import malaya\n",
+    "\n",
+    "model = malaya.summarization.abstractive.transformer(model = 'small')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "string = 'KUALA LUMPUR: Presiden Perancis Emmanuel Macron tidak menampakkan beliau seorang sosok yang bertamadun, selar Tun Dr Mahathir Mohamad menerusi kemas kini terbaharu di blognya. Bekas Perdana Menteri itu mendakwa, pemerintah tertinggi Perancis itu bersikap primitif kerana menuduh orang Islam terlibat dalam pembunuhan guru yang menghina Islam, malah menegaskan tindakan membunuh bukan ajaran Islam. Jelas Dr Mahathir, sejarah membuktikan bahawa orang Perancis pernah membunuh jutaan manusia, yang ramai mangsanya terdiri dari orang Islam.'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Presiden Perancis tidak menampakkan figur seorang lelaki yang bertanggungjawab']"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.summarize([string])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "e = model._tokenizer.encode(f'ringkasan: {string}')\n",
+    "e = e + [1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "g = load_graph('small.pb.quantized')\n",
+    "x = g.get_tensor_by_name('import/Placeholder:0')\n",
+    "logits = g.get_tensor_by_name('import/greedy:0')\n",
+    "test_sess = tf.InteractiveSession(graph = g)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "l = test_sess.run(logits, feed_dict = {x: [e]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Presiden Perancis Emmanuel Macron tidak menampakkan sosok yang menyimpang, selar Tun M'"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model._tokenizer.decode(l[0].tolist())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['small.pb.quantized', 'base.pb.quantized']"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-summarization-abstractive-model.ipynb b/session/quantization/quantize-summarization-abstractive-model.ipynb
new file mode 100644
index 00000000..75f0248e
--- /dev/null
+++ b/session/quantization/quantize-summarization-abstractive-model.ipynb
@@ -0,0 +1,386 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2020-11-01 17:33:39--  https://f000.backblazeb2.com/file/malaya-model/v38/summarize/base.pb\n",
+      "Resolving f000.backblazeb2.com (f000.backblazeb2.com)... 104.153.233.177\n",
+      "Connecting to f000.backblazeb2.com (f000.backblazeb2.com)|104.153.233.177|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 \n",
+      "Length: 1252381816 (1.2G) [application/octet-stream]\n",
+      "Saving to: ‘base.pb’\n",
+      "\n",
+      "base.pb             100%[===================>]   1.17G  7.82MB/s    in 1m 50s  \n",
+      "\n",
+      "2020-11-01 17:35:31 (10.9 MB/s) - ‘base.pb’ saved [1252381816/1252381816]\n",
+      "\n",
+      "--2020-11-01 17:35:32--  https://f000.backblazeb2.com/file/malaya-model/v38/summarize/small.pb\n",
+      "Resolving f000.backblazeb2.com (f000.backblazeb2.com)... 104.153.233.177\n",
+      "Connecting to f000.backblazeb2.com (f000.backblazeb2.com)|104.153.233.177|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 \n",
+      "Length: 355538101 (339M) [application/octet-stream]\n",
+      "Saving to: ‘small.pb’\n",
+      "\n",
+      "small.pb            100%[===================>] 339.07M  13.9MB/s    in 29s     \n",
+      "\n",
+      "2020-11-01 17:36:03 (11.7 MB/s) - ‘small.pb’ saved [355538101/355538101]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!wget https://f000.backblazeb2.com/file/malaya-model/v38/summarize/base.pb\n",
+    "!wget https://f000.backblazeb2.com/file/malaya-model/v38/summarize/small.pb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['small.pb', 'base.pb']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow_text\n",
+    "import tf_sentencepiece"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "small.pb\n",
+      "base.pb\n"
+     ]
+    }
+   ],
+   "source": [
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_constants(ignore_errors=true)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "#              'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    with tf.gfile.GFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "        \n",
+    "    print(pb)\n",
+    "    \n",
+    "    transformed_graph_def = TransformGraph(input_graph_def, \n",
+    "                                           ['inputs'],\n",
+    "                                           ['SentenceTokenizer_1/SentenceTokenizer/SentencepieceDetokenizeOp'], transforms)\n",
+    "    \n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_graph(frozen_graph_filename, **kwargs):\n",
+    "    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "        graph_def = tf.GraphDef()\n",
+    "        graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "    # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "    # to fix import T5\n",
+    "    for node in graph_def.node:\n",
+    "        if node.op == 'RefSwitch':\n",
+    "            node.op = 'Switch'\n",
+    "            for index in range(len(node.input)):\n",
+    "                if 'moving_' in node.input[index]:\n",
+    "                    node.input[index] = node.input[index] + '/read'\n",
+    "        elif node.op == 'AssignSub':\n",
+    "            node.op = 'Sub'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'AssignAdd':\n",
+    "            node.op = 'Add'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "        elif node.op == 'Assign':\n",
+    "            node.op = 'Identity'\n",
+    "            if 'use_locking' in node.attr:\n",
+    "                del node.attr['use_locking']\n",
+    "            if 'validate_shape' in node.attr:\n",
+    "                del node.attr['validate_shape']\n",
+    "            if len(node.input) == 2:\n",
+    "                node.input[0] = node.input[1]\n",
+    "                del node.input[1]\n",
+    "\n",
+    "    with tf.Graph().as_default() as graph:\n",
+    "        tf.import_graph_def(graph_def)\n",
+    "    return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "g = load_graph('base.pb.quantized')\n",
+    "x = g.get_tensor_by_name('import/inputs:0')\n",
+    "logits = g.get_tensor_by_name('import/SentenceTokenizer_1/SentenceTokenizer/SentencepieceDetokenizeOp:0')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x, x_len, logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_sess = tf.InteractiveSession(graph = g)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<tf.Tensor 'import/inputs:0' shape=(?,) dtype=string>"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 1min 5s, sys: 12.8 s, total: 1min 18s\n",
+      "Wall time: 14.8 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([b'Presiden Perancis Emmanuel Macron tidak menunjukkan dia seorang yang bertamadun, kata Dr Mahathir. Macron mengatakan kerajaannya bersikap primitif dalam menuduh orang Islam melakukan pembunuhan. Dr Mahathir: Sejarah membuktikan bahawa orang Perancis pernah membunuh berjuta-juta orang'],\n",
+       "      dtype=object)"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "test_sess.run(logits, feed_dict = {x: ['ringkasan: KUALA LUMPUR: Presiden Perancis Emmanuel Macron tidak menampakkan beliau seorang sosok yang bertamadun, selar Tun Dr Mahathir Mohamad menerusi kemas kini terbaharu di blognya. Bekas Perdana Menteri itu mendakwa, pemerintah tertinggi Perancis itu bersikap primitif kerana menuduh orang Islam terlibat dalam pembunuhan guru yang menghina Islam, malah menegaskan tindakan membunuh bukan ajaran Islam. Jelas Dr Mahathir, sejarah membuktikan bahawa orang Perancis pernah membunuh jutaan manusia, yang ramai mangsanya terdiri dari orang Islam.']})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['small.pb.quantized', 'base.pb.quantized']"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-toxicity-model.ipynb b/session/quantization/quantize-toxicity-model.ipynb
new file mode 100644
index 00000000..44e33da9
--- /dev/null
+++ b/session/quantization/quantize-toxicity-model.ipynb
@@ -0,0 +1,394 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "S3_PATH_TOXIC = {\n",
+    "    'multinomial': {\n",
+    "        'model': 'v34/toxicity/multinomial.pkl',\n",
+    "        'vector': 'v34/toxicity/tfidf.pkl',\n",
+    "        'bpe': 'v34/toxicity/bpe.model',\n",
+    "    },\n",
+    "    'bert': {\n",
+    "        'model': 'v34/toxicity/bert-base-toxicity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'tiny-bert': {\n",
+    "        'model': 'v34/toxicity/tiny-bert-toxicity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.bert.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.bert.model',\n",
+    "    },\n",
+    "    'albert': {\n",
+    "        'model': 'v34/toxicity/albert-base-toxicity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'tiny-albert': {\n",
+    "        'model': 'v34/toxicity/albert-tiny-toxicity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v10.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v10.model',\n",
+    "    },\n",
+    "    'xlnet': {\n",
+    "        'model': 'v34/toxicity/xlnet-base-toxicity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "    'alxlnet': {\n",
+    "        'model': 'v34/toxicity/alxlnet-base-toxicity.pb',\n",
+    "        'vocab': 'tokenizer/sp10m.cased.v9.vocab',\n",
+    "        'tokenizer': 'tokenizer/sp10m.cased.v9.model',\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert\n",
+      "tiny-bert\n",
+      "albert\n",
+      "tiny-albert\n",
+      "xlnet\n",
+      "alxlnet\n"
+     ]
+    }
+   ],
+   "source": [
+    "import subprocess\n",
+    "\n",
+    "# Fetch the frozen-graph (.pb) checkpoint of every entry in S3_PATH_TOXIC.\n",
+    "# 'multinomial' is skipped because its model is a .pkl file, not a .pb graph.\n",
+    "for k, paths in S3_PATH_TOXIC.items():\n",
+    "    if k == 'multinomial':\n",
+    "        continue\n",
+    "    print(k)\n",
+    "    url = f\"https://f000.backblazeb2.com/file/malaya-model/{paths['model']}\"\n",
+    "    # os.system() discarded wget's exit status, so a failed download went\n",
+    "    # unnoticed; check=True raises CalledProcessError instead. Passing an\n",
+    "    # argument list also avoids building a shell command string.\n",
+    "    subprocess.run(['wget', url], check=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['tiny-bert-toxicity.pb',\n",
+       " 'alxlnet-base-toxicity.pb',\n",
+       " 'albert-tiny-toxicity.pb',\n",
+       " 'xlnet-base-toxicity.pb',\n",
+       " 'albert-base-toxicity.pb',\n",
+       " 'bert-base-toxicity.pb']"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-8-969a8c0cffd2>:11: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n",
+      "tiny-bert-toxicity.pb ['Placeholder', 'Placeholder_1']\n",
+      "alxlnet-base-toxicity.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "albert-tiny-toxicity.pb ['Placeholder', 'Placeholder_1']\n",
+      "xlnet-base-toxicity.pb ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+      "albert-base-toxicity.pb ['Placeholder', 'Placeholder_1']\n",
+      "bert-base-toxicity.pb ['Placeholder', 'Placeholder_1']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Graph transforms handed to TransformGraph, applied in the order listed.\n",
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "# Transform every downloaded frozen graph and write it next to the original\n",
+    "# as '<name>.pb.quantized'.\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    # tf.gfile.FastGFile is deprecated in favour of tf.gfile.GFile.\n",
+    "    with tf.gfile.GFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "    # 'xlnet' is tested first: it also matches 'alxlnet', while 'bert' also\n",
+    "    # matches 'albert'. Any other file is rejected instead of silently reusing\n",
+    "    # the inputs/outputs left over from the previous iteration.\n",
+    "    if 'xlnet' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n",
+    "        outputs = ['transpose_3']\n",
+    "    elif 'bert' in pb:\n",
+    "        inputs = ['Placeholder', 'Placeholder_1']\n",
+    "        outputs = ['dense/BiasAdd']\n",
+    "    else:\n",
+    "        raise ValueError(f'unrecognised model file: {pb}')\n",
+    "\n",
+    "    print(pb, inputs)\n",
+    "\n",
+    "    transformed_graph_def = TransformGraph(input_graph_def,\n",
+    "                                           inputs,\n",
+    "                                           ['logits', 'logits_seq'] + outputs, transforms)\n",
+    "\n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def load_graph(frozen_graph_filename, **kwargs):\n",
+    "#     with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
+    "#         graph_def = tf.GraphDef()\n",
+    "#         graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "#     # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091\n",
+    "#     # to fix import T5\n",
+    "#     for node in graph_def.node:\n",
+    "#         if node.op == 'RefSwitch':\n",
+    "#             node.op = 'Switch'\n",
+    "#             for index in range(len(node.input)):\n",
+    "#                 if 'moving_' in node.input[index]:\n",
+    "#                     node.input[index] = node.input[index] + '/read'\n",
+    "#         elif node.op == 'AssignSub':\n",
+    "#             node.op = 'Sub'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#         elif node.op == 'AssignAdd':\n",
+    "#             node.op = 'Add'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#         elif node.op == 'Assign':\n",
+    "#             node.op = 'Identity'\n",
+    "#             if 'use_locking' in node.attr:\n",
+    "#                 del node.attr['use_locking']\n",
+    "#             if 'validate_shape' in node.attr:\n",
+    "#                 del node.attr['validate_shape']\n",
+    "#             if len(node.input) == 2:\n",
+    "#                 node.input[0] = node.input[1]\n",
+    "#                 del node.input[1]\n",
+    "\n",
+    "#     with tf.Graph().as_default() as graph:\n",
+    "#         tf.import_graph_def(graph_def)\n",
+    "#     return graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# g = load_graph('xlnet-base-toxicity.pb.quantized')\n",
+    "# x = g.get_tensor_by_name('import/Placeholder:0')\n",
+    "# x_len = g.get_tensor_by_name('import/Placeholder_1:0')\n",
+    "# x_len2 = g.get_tensor_by_name('import/Placeholder_2:0')\n",
+    "# logits = g.get_tensor_by_name('import/logits:0')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# x, x_len, logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test_sess = tf.InteractiveSession(graph = g)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]],\n",
+    "#                                   x_len2: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%time\n",
+    "# test_sess.run(logits, feed_dict = {x: [[1,2,3,3,4]], x_len: [[1,1,1,1,1]]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['tiny-bert-toxicity.pb.quantized',\n",
+       " 'bert-base-toxicity.pb.quantized',\n",
+       " 'alxlnet-base-toxicity.pb.quantized',\n",
+       " 'albert-tiny-toxicity.pb.quantized',\n",
+       " 'xlnet-base-toxicity.pb.quantized',\n",
+       " 'albert-base-toxicity.pb.quantized']"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/session/quantization/quantize-true-case-model.ipynb b/session/quantization/quantize-true-case-model.ipynb
new file mode 100644
index 00000000..a71fa319
--- /dev/null
+++ b/session/quantization/quantize-true-case-model.ipynb
@@ -0,0 +1,265 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2020-11-12 14:47:18--  https://f000.backblazeb2.com/file/malaya-model/v39/true-case/base.pb\n",
+      "Resolving f000.backblazeb2.com (f000.backblazeb2.com)... 104.153.233.177\n",
+      "Connecting to f000.backblazeb2.com (f000.backblazeb2.com)|104.153.233.177|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 \n",
+      "Length: 245299065 (234M) [application/octet-stream]\n",
+      "Saving to: ‘base.pb’\n",
+      "\n",
+      "base.pb             100%[===================>] 233.93M  13.4MB/s    in 19s     \n",
+      "\n",
+      "2020-11-12 14:47:39 (12.2 MB/s) - ‘base.pb’ saved [245299065/245299065]\n",
+      "\n",
+      "--2020-11-12 14:47:39--  https://f000.backblazeb2.com/file/malaya-model/v39/true-case/small.pb\n",
+      "Resolving f000.backblazeb2.com (f000.backblazeb2.com)... 104.153.233.177\n",
+      "Connecting to f000.backblazeb2.com (f000.backblazeb2.com)|104.153.233.177|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 \n",
+      "Length: 48764573 (47M) [application/octet-stream]\n",
+      "Saving to: ‘small.pb’\n",
+      "\n",
+      "small.pb            100%[===================>]  46.50M  13.4MB/s    in 4.5s    \n",
+      "\n",
+      "2020-11-12 14:47:46 (10.3 MB/s) - ‘small.pb’ saved [48764573/48764573]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!wget https://f000.backblazeb2.com/file/malaya-model/v39/true-case/base.pb\n",
+    "!wget https://f000.backblazeb2.com/file/malaya-model/v39/true-case/small.pb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.tools.graph_transforms import TransformGraph\n",
+    "from glob import glob\n",
+    "tf.set_random_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['small.pb', 'base.pb']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pbs = glob('*.pb')\n",
+    "pbs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow_text\n",
+    "import tf_sentencepiece"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From <ipython-input-6-ec53d8ce4e85>:11: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use tf.gfile.GFile.\n",
+      "small.pb\n",
+      "base.pb\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Graph transforms handed to TransformGraph, applied in the order listed.\n",
+    "transforms = ['add_default_attributes',\n",
+    "             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n",
+    "             'fold_batch_norms',\n",
+    "             'fold_old_batch_norms',\n",
+    "             'quantize_weights(fallback_min=-10, fallback_max=10)',\n",
+    "             'strip_unused_nodes',\n",
+    "             'sort_by_execution_order']\n",
+    "\n",
+    "# Transform every downloaded frozen graph and write it next to the original\n",
+    "# as '<name>.pb.quantized'.\n",
+    "for pb in pbs:\n",
+    "    input_graph_def = tf.GraphDef()\n",
+    "    # tf.gfile.FastGFile is deprecated in favour of tf.gfile.GFile.\n",
+    "    with tf.gfile.GFile(pb, 'rb') as f:\n",
+    "        input_graph_def.ParseFromString(f.read())\n",
+    "\n",
+    "    print(pb)\n",
+    "\n",
+    "    # Both models take a single 'Placeholder' input and keep the 'greedy' and\n",
+    "    # 'beam' output nodes.\n",
+    "    transformed_graph_def = TransformGraph(input_graph_def,\n",
+    "                                           ['Placeholder'],\n",
+    "                                           ['greedy', 'beam'], transforms)\n",
+    "\n",
+    "    with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n",
+    "        f.write(transformed_graph_def.SerializeToString())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['small.pb.quantized', 'base.pb.quantized']"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "quantized = glob('*.pb.quantized')\n",
+    "quantized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "rm: cannot remove '*.pb*': No such file or directory\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!rm *.pb*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(\n",
+    "#     graph_def_file='test.pb',\n",
+    "#     input_arrays=['Placeholder', 'Placeholder_1'],\n",
+    "#     input_shapes={'Placeholder' : [None, 512], 'Placeholder_1': [None, 512]},\n",
+    "#     output_arrays=['logits'],\n",
+    "# )\n",
+    "# # converter.allow_custom_ops=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# converter.experimental_new_converter = True\n",
+    "# tflite_model = converter.convert()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.target_spec.supported_types = [tf.float16]\n",
+    "# converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-float16.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, \n",
+    "#                                        tf.lite.OpsSet.SELECT_TF_OPS]\n",
+    "# converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
+    "# tflite_model = converter.convert()\n",
+    "\n",
+    "# with open('tiny-bert-sentiment-hybrid.tflite', 'wb') as f:\n",
+    "#     f.write(tflite_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# interpreter = tf.lite.Interpreter(model_path='tiny-bert-sentiment-hybrid.tflite')\n",
+    "# interpreter.allocate_tensors()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}