diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e5cec9d --- /dev/null +++ b/.gitignore @@ -0,0 +1,61 @@ +# These are some examples of commonly ignored file patterns. +# You should customize this list as applicable to your project. +# Learn more about .gitignore: +# https://www.atlassian.com/git/tutorials/saving-changes/gitignore + +# Node artifact files +node_modules/ +dist/ + +# Compiled Java class files +*.class + +# Compiled Python bytecode +*.py[cod] + +# Log files +*.log + +# Package files +*.jar + +# Maven +target/ +dist/ + +# JetBrains IDE +.idea/ + +# Unit test reports +TEST*.xml + +# Generated by MacOS +.DS_Store + +# Generated by Windows +Thumbs.db + +# Applications +*.app +*.exe +*.war + +# Large media files +*.mp4 +*.tiff +*.avi +*.flv +*.mov +*.wmv + +*.npz +*.txt +*.csv +*.config +*.graph +*.zip +*.qdrep +*.sqlite +*.json +*.graph +*.log diff --git a/README.md b/README.md new file mode 100644 index 0000000..57e9f12 --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ +# CoGNN + +## Experiment data + +Directory: data + +## Environment + +Directory: environment + +Read environment/README.md for more information. + +## Code + +Directory: software + +Read software/README.md for more information. + +## DOI by Zenodo + +TODO diff --git a/data/Figure_10_relative_error.xlsx b/data/Figure_10_relative_error.xlsx new file mode 100644 index 0000000..eeebe64 Binary files /dev/null and b/data/Figure_10_relative_error.xlsx differ diff --git a/data/Figure_11_beyond_pairwise.xlsx b/data/Figure_11_beyond_pairwise.xlsx new file mode 100644 index 0000000..06988b2 Binary files /dev/null and b/data/Figure_11_beyond_pairwise.xlsx differ diff --git a/data/Figure_12_overhead.xlsx b/data/Figure_12_overhead.xlsx new file mode 100644 index 0000000..21179cc Binary files /dev/null and b/data/Figure_12_overhead.xlsx differ diff --git a/data/Figure_2_motivation_PMC.xlsx b/data/Figure_2_motivation_PMC.xlsx new file mode 100644 index 0000000..1a36771 Binary files /dev/null and b/data/Figure_2_motivation_PMC.xlsx differ diff --git a/data/Figure_3_motivation_execution_time.xlsx b/data/Figure_3_motivation_execution_time.xlsx new file mode 100644 index 0000000..a68487d Binary files /dev/null and b/data/Figure_3_motivation_execution_time.xlsx differ diff --git a/data/Figure_4_motivation_occupancy.xlsx b/data/Figure_4_motivation_occupancy.xlsx new file mode 100644 index 0000000..9d53023 Binary files /dev/null and b/data/Figure_4_motivation_occupancy.xlsx differ diff --git a/data/Figure_5_MPS_performance.xlsx b/data/Figure_5_MPS_performance.xlsx new file mode 100644 index 0000000..502c693 Binary files /dev/null and b/data/Figure_5_MPS_performance.xlsx differ diff --git a/data/Figure_9_JCT_CDF.xlsx b/data/Figure_9_JCT_CDF.xlsx new file mode 100644 index 0000000..580748f Binary files /dev/null and b/data/Figure_9_JCT_CDF.xlsx differ diff --git a/environment/README.md b/environment/README.md new file mode 100644 index 0000000..30d4c4f --- /dev/null +++ b/environment/README.md @@ -0,0 +1,27 @@ +# Environment + +This folder contains the execution environment and softwares to be installed from source. + +## Files + +- execution-environment.txt: Hardware and software specifications. 
+ +- pytorch: + - [Github](https://github.com/pytorch/pytorch/tree/v1.8.1) + - [Install](https://github.com/pytorch/pytorch/tree/v1.8.1#from-source) + +- dgl: + - [Github](https://github.com/dmlc/dgl/tree/v0.7.0) + - [DGL Install](https://docs.dgl.ai/en/0.7.x/install/) + +- pytorch\_sparse: + - [Github](https://github.com/rusty1s/pytorch_sparse/tree/0.6.10) + - Install: python setup.py install + +- pytorch\_scatter: + - [Github](https://github.com/rusty1s/pytorch_scatter/tree/2.0.7) + - Install: python setup.py install + +- pytorch\_geometric: + - [Github](https://github.com/pyg-team/pytorch_geometric/tree/1.7.0) + - Install: python setup.py install diff --git a/software/README.md b/software/README.md new file mode 100644 index 0000000..eaceb5e --- /dev/null +++ b/software/README.md @@ -0,0 +1,17 @@ +# CoGNN + +This folder contains the system prototype of `CoGNN` and other comparison methods. We run each method `10` times and present the average results to isolate the effects of randomness. + +## Environment + +Add the path to this folder to `PYTHONPATH`. + +## Usage + +- Compile PyTorch for `CoGNN`, `PipeSwitch` or `MPS`. `CoGNN` and `PipeSwitch` could use the same modified PyTorch. + +- For `CoGNN` and `PipeSwitch`, start the server first. After a few seconds, start the client to send requests. + +- For `MPS`, enable the MPS server and run the execution script. + +More details are included in README under folders for each system. diff --git a/software/client/README.md b/software/client/README.md new file mode 100644 index 0000000..5ae9a34 --- /dev/null +++ b/software/client/README.md @@ -0,0 +1,15 @@ +# Client + +This folder contains client code. The client submits training tasks to `CoGNN` or `PipeSwitch` server. + +## Files + +- client.py: send training request and wait for the reply. + +- run\_client.sh: specify model list file and execute client code. 
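Each line of the model list file is expected to hold exactly three whitespace-separated fields (task name, graph dataset name, number of layers); client.py skips any line that does not. The task names correspond to the GNN architectures used elsewhere in the artifact (GCN, GraphSAGE, GAT, GIN); the dataset names and layer counts below are placeholders for illustration, not actual entries shipped with the artifact:

```
GCN        example_graph  2
GraphSAGE  example_graph  3
```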
+ +## Usage + +``` +./run_client.sh +``` diff --git a/software/client/client.py b/software/client/client.py new file mode 100644 index 0000000..d39bf01 --- /dev/null +++ b/software/client/client.py @@ -0,0 +1,91 @@ +import sys +import time +import struct +import statistics +import argparse + +from util.util import TcpClient, timestamp + +def send_request(client, task_name, data, layers): + timestamp('client', 'before_request_%s' % task_name) + + # Serialize task name + task_name_b = task_name.encode() + task_name_length = len(task_name_b) + task_name_length_b = struct.pack('I', task_name_length) + + # Serialize data + data_b = data.encode() + data_length = len(data_b) + data_length_b = struct.pack('I', data_length) + + # Serialize number of layers + layers_b = layers.encode() + layers_length = len(layers_b) + layers_length_b = struct.pack('I', layers_length) + + timestamp('client', 'after_serialization') + + # Send task name / data + client.send(task_name_length_b) + client.send(task_name_b) + client.send(data_length_b) + client.send(data_b) + client.send(layers_length_b) + client.send(layers_b) + + timestamp('client', 'after_request_%s' % task_name) + + +def recv_response(client): + reply_b = client.recv(4) + reply = reply_b.decode() + timestamp('client', 'after_reply') + + +def close_connection(client): + model_name_length = 0 + model_name_length_b = struct.pack('I', model_name_length) + client.send(model_name_length_b) + timestamp('client', 'close_connection') + + +def main(): + # Load model list (task & data) + model_list_file_name = sys.argv[1] + model_list = [] + with open(model_list_file_name) as f: + for line in f.readlines(): + if len(line.split()) != 3: + continue + model_list.append([line.split()[0], line.split()[1], line.split()[2]]) + print(model_list) + + # Send training request + client_train = [] + for i in range(len(model_list)): + client_train.append(TcpClient('localhost', 12345)) + send_request(client_train[i], model_list[i][0], model_list[i][1], model_list[i][2]) + +# time.sleep(4) + + timestamp('client', 'after_connect') + time_1 = time.time() + + # Recv training reply + for i in range(len(model_list)): + recv_response(client_train[i]) + + time_2 = time.time() + duration = (time_2 - time_1) * 1000 + + # Close connection + for i in range(len(model_list)): + close_connection(client_train[i]) + + time.sleep(1) + timestamp('**********', '**********') + + +if __name__ == '__main__': + main() diff --git a/software/client/run_client.sh b/software/client/run_client.sh new file mode 100755 index 0000000..6500596 --- /dev/null +++ b/software/client/run_client.sh @@ -0,0 +1,2 @@ +#!/bin/bash +CUDA_VISIBLE_DEVICES=0 python client.py ../data/model/mix_model.txt diff --git a/software/cognn/README.md b/software/cognn/README.md new file mode 100644 index 0000000..8de25bd --- /dev/null +++ b/software/cognn/README.md @@ -0,0 +1,27 @@ +# MPS + +This folder contains `CoGNN` code. + +## Files + +- main.py: accept connections from the client, create worker and scheduler processes. + +- frontend\_tcp.py: accept client requests and send replies. + +- frontend\_schedule.py: manage and schedule tasks according to the specified policy. + +- worker.py: dispatch the tasks to worker threads and recycle results. + +- worker\_common.py: add hooks for parameter transfering, attach the model to CUDA stream and execute it. + +- policy.py: functions for scheduling policies. + +- run\_server.sh: specify the model list to load and enable `CoGNN` server. 
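The requests accepted by frontend\_tcp.py use the same framing that client.py produces: each field is a 4-byte unsigned length (struct format `'I'`) followed by the encoded payload, sent in the order task name, dataset name, number of layers; a zero-length task name ends the connection handler, and the worker answers a finished job with the 4-byte reply `FNSH`. Below is a minimal sketch of the client side of this framing, using a raw socket instead of the artifact's `TcpClient` helper and placeholder field values:

```
import socket
import struct

def send_field(sock, value):
    # 4-byte unsigned length prefix, then the encoded payload
    payload = value.encode()
    sock.sendall(struct.pack('I', len(payload)))
    sock.sendall(payload)

sock = socket.create_connection(('localhost', 12345))
for field in ('GCN', 'example_graph', '2'):   # task, dataset, num_layers (placeholders)
    send_field(sock, field)
print(sock.recv(4).decode())                  # 'FNSH' once training completes
sock.sendall(struct.pack('I', 0))             # zero-length field closes the connection
```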
+ +## Usage + +``` +./run_server.sh +``` + +After enabling the server, wait seconds to load models before sending the training request. diff --git a/software/cognn/frontend_schedule.py b/software/cognn/frontend_schedule.py new file mode 100644 index 0000000..bf62211 --- /dev/null +++ b/software/cognn/frontend_schedule.py @@ -0,0 +1,146 @@ +import threading +import torch +import importlib +import time + +from util.util import timestamp +from cognn.policy import * + +class FrontendScheduleThd(threading.Thread): + def __init__(self, model_list, qin, worker_list): + super(FrontendScheduleThd, self).__init__() + self.model_list = model_list + self.qin = qin + self.worker_list = worker_list + self.cur_w_idx = 0 + + def run(self): + timestamp('schedule', 'start') + + # Load models + models = {} + para_bytes = {} + comp_bytes = {} + + for model_name in self.model_list: + hash_name = hash('{}_{}_{}'.format(model_name[0], model_name[1], int(model_name[2]))) + models[hash_name], para_bytes[hash_name], comp_bytes[hash_name] = self._load_model(model_name) + + timestamp('schedule', 'load_model') + + # Create CUDA stream + cuda_stream_for_parameter = torch.cuda.Stream() + timestamp('schedule', 'create_stream') + + job_list = [] + while True: + # Get request + agent, task_name, data_name, num_layers = self.qin.get() + job_list.append([agent, task_name, data_name, num_layers]) + timestamp('schedule', 'get_request') + if len(job_list) == len(models): + break + + # schedule according to policies + reorder_job_list = [] + if POLICY_OPTION == 0: # in-order maxinum memory consumption + reorder_job_list = policy_base(len(self.worker_list), para_bytes, comp_bytes, job_list) + elif POLICY_OPTION == 1: # lowest memory consumption first + reorder_job_list = policy_smcf(len(self.worker_list), para_bytes, comp_bytes, job_list) + elif POLICY_OPTION == 2: # balanced memory consumption + reorder_job_list = policy_bmc(len(self.worker_list), para_bytes, comp_bytes, job_list) + else: print('Wrong scheduling option!') + +# for group_iter in range(len(reorder_job_list)): +# print('num of elements: {}'.format(len(reorder_job_list[group_iter]))) +# print('after here') + + for group_iter in range(len(reorder_job_list)): + para_cache_offset, comp_cache_offset = 0, MEMORY_ALLOCATED # initialize offset of parameters and computations + for job in reorder_job_list[group_iter]: + agent, task_name, data_name, num_layers = job[0], job[1], job[2], job[3] + # get next worker to work on request + self.cur_w_idx %= len(self.worker_list) + new_pipe, _, param_trans_pipe_parent = self.worker_list[self.cur_w_idx] + self.cur_w_idx += 1 + + # send request to new worker + model_name = [] + for model in self.model_list: + if model[0] == task_name and model[1] == data_name and model[2] == num_layers: + model_name = model + + para_cache_size = para_bytes[hash('{}_{}_{}'.format(task_name, data_name, num_layers))] + para_cache_info = [para_cache_size, para_cache_offset] + + comp_cache_size = comp_bytes[hash('{}_{}_{}'.format(task_name, data_name, num_layers))] + comp_cache_offset -= comp_cache_size # update comp cache offset + comp_cache_info = [comp_cache_size, comp_cache_offset] + + new_pipe.send((agent, model_name, para_cache_info, comp_cache_info)) + timestamp('schedule', 'notify_new_worker') + + # allocate cache to streams + num_layers = int(model_name[2]) + timestamp('schedule', 'insert_cache') + with torch.cuda.stream(cuda_stream_for_parameter): + torch.cuda.insert_shared_cache_for_computation(para_cache_size, para_cache_offset) + + 
para_cache_offset += para_cache_size # update para cache offset + + # transfer parameters to gpu + batched_parameter_list = models[hash('{}_{}_{}'.format(task_name, data_name, num_layers))] + + self._transfer_parameter(new_pipe, + batched_parameter_list, + cuda_stream_for_parameter, + param_trans_pipe_parent) + timestamp('schedule', 'transfer_parameters') + + # clear status + with torch.cuda.stream(cuda_stream_for_parameter): + torch.cuda.clear_shared_cache() # pylint: disable=no-member + timestamp('schedule', 'clear_status') + + res_counter = 0 + while True: + self.cur_w_idx %= len(self.worker_list) + new_pipe, _, _ = self.worker_list[self.cur_w_idx] + self.cur_w_idx += 1 + # Recv response + if new_pipe.poll(): + res = new_pipe.recv() + res_counter += 1 + if res_counter == len(reorder_job_list[group_iter]): + break + + def _load_model(self, model_name): + # Import parameters + model_module = importlib.import_module('task.' + model_name[0]) + batched_parameter_list, comp_total_bytes = model_module.import_parameters(model_name[1], int(model_name[2])) + # Preprocess batches + processed_batched_parameter_list = [] + batched_parameter_total_bytes = 0 + for param, mod_list in batched_parameter_list: + if param is None: + processed_batched_parameter_list.append((None, mod_list)) + else: + processed_batched_parameter_list.append((param.pin_memory(), mod_list)) + batched_parameter_total_bytes += param.element_size() * param.nelement() + + return processed_batched_parameter_list, batched_parameter_total_bytes, comp_total_bytes + + def _transfer_parameter(self, pipe, + batched_parameter_list, + cuda_stream_for_parameter, + param_trans_pipe): + param_cuda_list = [] + for param, mod_list in batched_parameter_list: + with torch.cuda.stream(cuda_stream_for_parameter): + if param is not None: + param_cuda = param.cuda(non_blocking=True) + param_cuda_list.append(param_cuda) + e = torch.cuda.Event() + e.record() + e.synchronize() + param_trans_pipe.send(mod_list[0]) diff --git a/software/cognn/frontend_tcp.py b/software/cognn/frontend_tcp.py new file mode 100644 index 0000000..0402f8d --- /dev/null +++ b/software/cognn/frontend_tcp.py @@ -0,0 +1,37 @@ +import threading +import struct + +from util.util import timestamp + +class FrontendTcpThd(threading.Thread): + def __init__(self, qout, agent): + super(FrontendTcpThd, self).__init__() + self.qout = qout + self.agent = agent + + def run(self): + while True: + timestamp('tcp', 'listening') + + task_name_length_b = self.agent.recv(4) + task_name_length = struct.unpack('I', task_name_length_b)[0] + if task_name_length == 0: + break + task_name_b = self.agent.recv(task_name_length) + task_name = task_name_b.decode() + timestamp('tcp', 'get_task_name') + + data_length_b = self.agent.recv(4) + data_length = struct.unpack('I', data_length_b)[0] + data_b = self.agent.recv(data_length) + data_name = data_b.decode() + timestamp('tcp', 'get_data_name') + + layer_length_b = self.agent.recv(4) + layer_length = struct.unpack('I', layer_length_b)[0] + layer_b = self.agent.recv(layer_length) + num_layers = layer_b.decode() + timestamp('tcp', 'get_num_layers') + + self.qout.put((self.agent, task_name, data_name, num_layers)) + timestamp('tcp', 'enqueue_request') diff --git a/software/cognn/main.py b/software/cognn/main.py new file mode 100644 index 0000000..c61763d --- /dev/null +++ b/software/cognn/main.py @@ -0,0 +1,64 @@ +import sys +import time +from queue import Queue + +import torch +import torch.multiprocessing as mp + +from cognn.frontend_tcp import FrontendTcpThd 
+from cognn.frontend_schedule import FrontendScheduleThd +from cognn.worker import WorkerProc +from util.util import timestamp, TcpAgent, TcpServer + +def main(): + timestamp('frontend', 'start') + + # Load model list (task & data) + model_list_file_name = sys.argv[1] + num_workers = int(sys.argv[2]) + model_list = [] + with open(model_list_file_name) as f: + for line in f.readlines(): + if len(line.split()) != 3: + continue + model_list.append([line.split()[0], line.split()[1], line.split()[2]]) + + # Warm up CUDA and allocate shared cache + torch.randn(1024, device='cuda') + torch.cuda.allocate_shared_cache() + + # Create workers + worker_list = [] + for _ in range(num_workers): + p_parent, p_child = mp.Pipe() + param_trans_parent, param_trans_child = mp.Pipe() + worker = WorkerProc(model_list, p_child, param_trans_child) + worker.start() + torch.cuda.send_shared_cache() + worker_list.append((p_parent, worker, param_trans_parent)) + timestamp('frontend', 'create_worker') + + + # Create request queue and scheduler thread + requests_queue = Queue() + t_sch = FrontendScheduleThd(model_list, requests_queue, worker_list) + t_sch.start() + timestamp('frontend', 'start_schedule') + + # Accept connections + server = TcpServer('localhost', 12345) + timestamp('tcp', 'listen') + while True: + conn, _ = server.accept() + agent = TcpAgent(conn) + timestamp('tcp', 'connected') + t_tcp = FrontendTcpThd(requests_queue, agent) + t_tcp.start() + + # Wait for end + t_sch.join() + + +if __name__ == '__main__': + mp.set_start_method('spawn') + main() diff --git a/software/cognn/policy.py b/software/cognn/policy.py new file mode 100644 index 0000000..e6258a7 --- /dev/null +++ b/software/cognn/policy.py @@ -0,0 +1,74 @@ +import numpy as np +import collections + +MEMORY_ALLOCATED = 26 * 1024 * 1024 * 1024 +POLICY_OPTION = 0 # 0 BASE (in-order-maximum-memory-consumption), 1 LMCF (lowest-memory-consumption-first), 2 BMC (balanced-memory-consumption) + + +# in-order maximum memory consumption +def policy_base(num_workers, para_bytes, comp_bytes, job_list): + reorder_job_list = [] + group_mc, group_counter, element_counter = 0, -1, num_workers + for i in range(len(job_list)): + hash_name = hash('{}_{}_{}'.format(job_list[i][1], job_list[i][2], job_list[i][3])) + total_bytes = comp_bytes[hash_name] + para_bytes[hash_name] + group_mc += total_bytes + if group_mc >= MEMORY_ALLOCATED or element_counter == num_workers: + reorder_job_list.append([]) + group_mc, element_counter = total_bytes, 0 + group_counter += 1 + reorder_job_list[group_counter].append(job_list[i]) + element_counter += 1 + + return reorder_job_list + + +# lowest memory consumption first +def policy_smcf(num_workers, para_bytes, comp_bytes, job_list): + total_bytes = np.zeros((len(job_list)), dtype='int64') + for i in range(len(job_list)): + hash_name = hash('{}_{}_{}'.format(job_list[i][1], job_list[i][2], job_list[i][3])) + total_bytes[i] = comp_bytes[hash_name] + para_bytes[hash_name] + ascending = np.argsort(total_bytes) + + reorder_job_list = [] + group_mc, group_counter, element_counter = 0, -1, num_workers + for i in range(ascending.shape[0]): + group_mc += total_bytes[ascending[i]] + if group_mc >= MEMORY_ALLOCATED or element_counter == num_workers: + reorder_job_list.append([]) + group_mc, element_counter = total_bytes[ascending[i]], 0 + group_counter += 1 + + reorder_job_list[group_counter].append(job_list[ascending[i]]) + element_counter += 1 + + return reorder_job_list + + +# balanced memory consumption +def policy_bmc(num_workers, 
para_bytes, comp_bytes, job_list): + total_bytes = np.zeros((len(job_list)), dtype='int64') + for i in range(len(job_list)): + hash_name = hash('{}_{}_{}'.format(job_list[i][1], job_list[i][2], job_list[i][3])) + total_bytes[i] = comp_bytes[hash_name] + para_bytes[hash_name] + ascending = np.argsort(total_bytes) + + ascDeque = collections.deque() + for i in range(ascending.shape[0]): + ascDeque.append(ascending[i]) + + reorder_job_list = [] + group_mc, group_counter, element_counter = 0, -1, num_workers + for i in range(ascending.shape[0]): + tmpIdx = ascDeque.pop() if i % 2 == 1 else ascDeque.popleft() + group_mc += total_bytes[tmpIdx] + if group_mc >= MEMORY_ALLOCATED or element_counter == num_workers: + reorder_job_list.append([]) + group_mc, element_counter = total_bytes[tmpIdx], 0 + group_counter += 1 + + reorder_job_list[group_counter].append(job_list[tmpIdx]) + element_counter += 1 + + return reorder_job_list diff --git a/software/cognn/run_server.sh b/software/cognn/run_server.sh new file mode 100755 index 0000000..9b03b35 --- /dev/null +++ b/software/cognn/run_server.sh @@ -0,0 +1,3 @@ +#!/bin/bash +worker=2 +CUDA_VISIBLE_DEVICES=0 python main.py ../data/model/mix_model.txt ${worker} diff --git a/software/cognn/worker.py b/software/cognn/worker.py new file mode 100644 index 0000000..e6a5c94 --- /dev/null +++ b/software/cognn/worker.py @@ -0,0 +1,43 @@ +from queue import Queue +from multiprocessing import Process +import torch +import time + +from cognn.worker_common import ModelSummary +from util.util import timestamp + +class WorkerProc(Process): + def __init__(self, model_list, pipe, param_trans_pipe): + super(WorkerProc, self).__init__() + self.model_list = model_list + self.pipe = pipe + self.param_trans_pipe = param_trans_pipe + + def run(self): + timestamp('worker', 'start') + + # Warm up CUDA and get shared cache + torch.randn(1024, device='cuda') + time.sleep(1) + torch.cuda.recv_shared_cache() # pylint: disable=no-member + timestamp('worker', 'share_gpu_memory') + + while True: # dispatch workers for task execution + agent, model_name, para_cache_info, comp_cache_info = self.pipe.recv() + model_summary = ModelSummary(model_name, para_cache_info, comp_cache_info, self.param_trans_pipe) + timestamp('worker', 'import models') + timestamp('worker', 'after importing models') + + # start doing training + with torch.cuda.stream(model_summary.cuda_stream_for_computation): + output = model_summary.execute() + print('output: {}'.format(output)) +# print ('Training time: {} ms'.format(output)) + del output + + self.pipe.send('FNSH') + agent.send(b'FNSH') + torch.cuda.clear_shared_cache() + + timestamp('worker_comp_thd', 'complete') + model_summary.reset_initialized(model_summary.model) diff --git a/software/cognn/worker_common.py b/software/cognn/worker_common.py new file mode 100644 index 0000000..762d669 --- /dev/null +++ b/software/cognn/worker_common.py @@ -0,0 +1,64 @@ +import importlib + +import torch + +### Class +class ModelSummary(): + def __init__(self, model_name, para_cache_info, comp_cache_info, param_trans_pipe): + """ """ + self.task_name, self.data_name, self.num_layers = model_name[0], model_name[1], int(model_name[2]) + self.para_cache_size, self.para_cache_offset = para_cache_info[0], para_cache_info[1] + self.comp_cache_size, self.comp_cache_offset = comp_cache_info[0], comp_cache_info[1] + self.param_trans_pipe = param_trans_pipe + self.load_model() + + def execute(self): + return self.func(self.model, self.data) + + def reset_initialized(self, mod): + if 
hasattr(mod, 'initialized'): + mod.initialized = False + for child in mod.children(): + self.reset_initialized(child) + + def insert_lock_hook(self, shape_summary_list): + """ """ + for _, _, _, mod_sublist in shape_summary_list: + mod = mod_sublist[0] + mod.initialized = False + def hook_wait_for_parameter_lock(mod, input): + if not mod.initialized: + complete_name = self.param_trans_pipe.recv() + if complete_name != mod.fullname: + raise Exception('Invalid complete trans') + mod.initialized = True + mod.register_forward_pre_hook(hook_wait_for_parameter_lock) + + def load_model(self): + model_module = importlib.import_module('task.' + self.task_name) + self.model, self.func, self.shape_summary_list = model_module.import_task(self.data_name, self.num_layers) + _, self.data = model_module.import_model(self.data_name, self.num_layers) + # Eliminate parameters and buffers + self.reset_initialized(self.model) + + # Insert locks for waiting parameters and add pre_forward_hook to wait for locks + self.insert_lock_hook(self.shape_summary_list) + + # Allocate fake memory for parameters + self.cuda_stream_for_parameter = torch.cuda.Stream() + self.cuda_stream_for_computation = torch.cuda.Stream() + + print('{} {} {}'.format(self.task_name, self.data_name, self.num_layers)) + with torch.cuda.stream(self.cuda_stream_for_parameter): + torch.cuda.insert_shared_cache_for_computation(self.para_cache_size, self.para_cache_offset) + + with torch.cuda.stream(self.cuda_stream_for_computation): + torch.cuda.insert_shared_cache_for_computation(self.comp_cache_size, self.comp_cache_offset) + + with torch.cuda.stream(self.cuda_stream_for_parameter): + for shape_list, param_list, buf_list, _ in self.shape_summary_list: + for shape, p in zip(shape_list[:len(param_list)], param_list): + p.data = torch.empty(shape, device='cuda') + for shape, b in zip(shape_list[len(param_list):], buf_list): + mod, key = b + mod._buffers[key] = torch.empty(shape, device='cuda') diff --git a/software/data/README.md b/software/data/README.md new file mode 100644 index 0000000..8ba9708 --- /dev/null +++ b/software/data/README.md @@ -0,0 +1,9 @@ +# Data + +This folder contains model lists and graph datasets. + +## Graph datasets + +- Download the graph datasets from [Google Drive](https://drive.google.com/file/d/1Sf_l6YW0qMT0a3K0JcJ_NHDD4V7jyWds/view?usp=sharing). + +- Put the decompressed graph datasets to the `graph` directory. diff --git a/software/mps/CUDACachingAllocator.cpp b/software/mps/CUDACachingAllocator.cpp new file mode 100644 index 0000000..5236e0d --- /dev/null +++ b/software/mps/CUDACachingAllocator.cpp @@ -0,0 +1,1142 @@ +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +C10_DEFINE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback); + +namespace cuda { +namespace CUDACachingAllocator { +// +// Yet another caching allocator for CUDA device allocations. +// +// - Allocations are associated with a stream. Once freed, blocks can be +// re-allocated on the same stream, but not on any other stream. +// - The allocator attempts to find the smallest cached block that will fit the +// requested size. If the block is larger than the requested size, it may be +// split. If no block is found, the allocator will delegate to cudaMalloc. +// - If the cudaMalloc fails, the allocator will free all cached blocks that +// are not split and retry the allocation. 
+// - Large (>1MB) and small allocations are stored in separate pools. +// Small requests are packed into 2MB buffers. Large requests will use the +// smallest available free block or allocate a new block using cudaMalloc. +// To reduce fragmentation, requests between 1MB and 10MB will allocate and +// split a 20MB block, if no free block of sufficient size is available. +// +// With this allocator, allocations and frees should logically be considered +// "usages" of the memory segment associated with streams, just like kernel +// launches. The programmer must insert the proper synchronization if memory +// segments are used from multiple streams. +// +// The library provides a recordStream() function to help insert the correct +// synchronization when allocations are used on multiple streams. This will +// ensure that the block is not reused before each recorded stream completes +// work. +// + + +namespace { + +using stream_set = std::unordered_set; + +constexpr size_t kMinBlockSize = 512; // all sizes are rounded to at least 512 bytes +constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB +constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks +constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks +constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer +constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB + +typedef std::bitset(StatType::NUM_TYPES)> StatTypes; + +void update_stat(Stat& stat, int64_t amount) { + stat.current += amount; + + TORCH_INTERNAL_ASSERT(stat.current >= 0, "Negative tracked stat in CUDA allocator (likely logic error)."); + + stat.peak = std::max(stat.current, stat.peak); + if (amount > 0) { + stat.allocated += amount; + } + if (amount < 0) { + stat.freed += -amount; + } +} + +void reset_accumulated_stat(Stat& stat) { + stat.allocated = 0; + stat.freed = 0; +} + +void reset_peak_stat(Stat& stat) { + stat.peak = stat.current; +} + +void update_stat_array(StatArray& stat_array, int64_t amount, const StatTypes& stat_types) { + for (size_t stat_type = 0; stat_type < stat_types.size(); ++stat_type) { + if (stat_types[stat_type]) { + update_stat(stat_array[stat_type], amount); + } + } +} + +struct Block; +typedef bool (*Comparison)(const Block*, const Block*); +typedef std::set BlockPool; + +struct Block { + int device; // gpu + cudaStream_t stream; // allocation stream + stream_set stream_uses; // streams on which the block was used + size_t size; // block size in bytes + BlockPool* pool; // owning memory pool + void* ptr; // memory address + bool allocated; // in-use flag + Block* prev; // prev block if split from a larger allocation + Block* next; // next block if split from a larger allocation + int event_count; // number of outstanding CUDA events + + Block(int device, cudaStream_t stream, size_t size, BlockPool* pool, void* ptr) : + device(device), stream(stream), stream_uses(), size(size), pool(pool), + ptr(ptr), allocated(0), prev(nullptr), next(nullptr), event_count(0) { } + + // constructor for search key + Block(int device, cudaStream_t stream, size_t size) : + device(device), stream(stream), stream_uses(), size(size), pool(nullptr), + ptr(nullptr), allocated(0), prev(nullptr), next(nullptr), event_count(0) { } + + bool is_split() const { + return (prev != nullptr) || (next != nullptr); + } +}; + +static bool BlockComparator(const Block* a, const Block* b) +{ + if (a->stream != b->stream) 
{ + return (uintptr_t)a->stream < (uintptr_t)b->stream; + } + if (a->size != b->size) { + return a->size < b->size; + } + return (uintptr_t)a->ptr < (uintptr_t)b->ptr; +} + +static std::string format_size(uint64_t size) { + std::ostringstream os; + os.precision(2); + os << std::fixed; + if (size <= 1024) { + os << size << " bytes"; + } else if (size <= 1048576) { + os << (size / 1024.0); + os << " KiB"; + } else if (size <= 1073741824ULL) { + os << size / 1048576.0; + os << " MiB"; + } else { + os << size / 1073741824.0; + os << " GiB"; + } + return os.str(); +} + +struct AllocParams { + AllocParams(int device, size_t size, cudaStream_t stream, BlockPool* pool, size_t alloc_size, + DeviceStats& stats) : + search_key(device, stream, size), + pool(pool), + alloc_size(alloc_size), + block(nullptr), + err(cudaSuccess) {} + + int device() { return search_key.device; } + cudaStream_t stream() { return search_key.stream; } + size_t size() { return search_key.size; } + + Block search_key; + BlockPool* pool; + size_t alloc_size; + Block* block; + StatTypes stat_types; + cudaError_t err; +}; + +} // namespace + +class DeviceCachingAllocator { + + private: + + // lock around all operations + mutable std::recursive_mutex mutex; + + // device statistics + DeviceStats stats; + + // unallocated cached blocks larger than 1 MB + BlockPool large_blocks; + + // unallocated cached blocks 1 MB or smaller + BlockPool small_blocks; + + // allocated or in use by a stream + std::unordered_set active_blocks; + + // outstanding cuda events + std::deque> cuda_events; + + // record used memory. + size_t total_allocated_memory = 0; + + size_t allowed_memory_maximum = 0; + + bool set_fraction = false; + + public: + + DeviceCachingAllocator() : + large_blocks(BlockComparator), + small_blocks(BlockComparator) {} + + // All public methods (except the above) acquire the allocator mutex. + // Thus, do not call a public method from another public method. + + Block* malloc(int device, size_t size, cudaStream_t stream) + { + std::unique_lock lock(mutex); + + // process outstanding cudaEvents + process_events(); + + size = round_size(size); + auto& pool = get_pool(size); + const size_t alloc_size = get_allocation_size(size); + AllocParams params(device, size, stream, &pool, alloc_size, stats); + params.stat_types[static_cast(StatType::AGGREGATE)] = true; + params.stat_types[static_cast(get_stat_type_for_pool(pool))] = true; + + bool block_found = + // Search pool + get_free_block(params) + // Trigger callbacks and retry search + || (trigger_free_memory_callbacks(params) && get_free_block(params)) + // Attempt allocate + || alloc_block(params, false) + // Free all non-split cached blocks and retry alloc. + || (free_cached_blocks() && alloc_block(params, true)); + + if (!block_found) { + // For any error code other than cudaErrorMemoryAllocation, + // alloc_block should have thrown an exception already. + TORCH_INTERNAL_ASSERT(params.err == cudaErrorMemoryAllocation); + + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + std::string allowed_info; + + if (set_fraction) { + allowed_info = format_size(allowed_memory_maximum) + " allowed; "; + } + + stats.num_ooms += 1; + + // "total capacity": total global memory on GPU + // "allowed": memory is allowed to use, which set by fraction. 
+ // "already allocated": memory allocated by the program using the + // caching allocator + // "free": free memory as reported by the CUDA API + // "cached": memory held by the allocator but not used by the program + // + // The "allocated" amount does not include memory allocated outside + // of the caching allocator, such as memory allocated by other programs + // or memory held by the driver. + // + // The sum of "allocated" + "free" + "cached" may be less than the + // total capacity due to memory held by the driver and usage by other + // programs. + // + // Note that at this point free_cached_blocks has already returned all + // possible "cached" memory to the driver. The only remaining "cached" + // memory is split from a larger block that is partially in-use. + TORCH_CHECK_WITH(CUDAOutOfMemoryError, false, + "CUDA out of memory. Tried to allocate ", format_size(alloc_size), + " (GPU ", device, "; ", + format_size(device_total), " total capacity; ", + format_size(stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current), + " already allocated; ", + format_size(device_free), " free; ", + allowed_info, + format_size(stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current), + " reserved in total by PyTorch)"); + } + + TORCH_INTERNAL_ASSERT(params.err == cudaSuccess && + params.block != nullptr && + params.block->ptr != nullptr); + Block* block = params.block; + Block* remaining = nullptr; + + const bool already_split = block->is_split(); + if (should_split(block, size)) { + remaining = block; + + block = new Block(device, stream, size, &pool, block->ptr); + block->prev = remaining->prev; + if (block->prev) { + block->prev->next = block; + } + block->next = remaining; + + remaining->prev = block; + remaining->ptr = static_cast(remaining->ptr) + size; + remaining->size -= size; + pool.insert(remaining); + + if (already_split) { + // An already-split inactive block is being shrunk by size bytes. + update_stat_array(stats.inactive_split_bytes, -block->size, params.stat_types); + } else { + // A new split inactive block is being created from a previously unsplit block, + // size remaining->size bytes. 
+ update_stat_array(stats.inactive_split_bytes, remaining->size, params.stat_types); + update_stat_array(stats.inactive_split, 1, params.stat_types); + } + } else if (already_split) { + // An already-split block is becoming active + update_stat_array(stats.inactive_split_bytes, -block->size, params.stat_types); + update_stat_array(stats.inactive_split, -1, params.stat_types); + } + + block->allocated = true; + active_blocks.insert(block); + + c10::reportMemoryUsageToProfiler( + block, block->size, c10::Device(c10::DeviceType::CUDA, device)); + + update_stat_array(stats.allocation, 1, params.stat_types); + update_stat_array(stats.allocated_bytes, block->size, params.stat_types); + update_stat_array(stats.active, 1, params.stat_types); + update_stat_array(stats.active_bytes, block->size, params.stat_types); + + return block; + } + + void free(Block* block) + { + std::lock_guard lock(mutex); + + block->allocated = false; + + c10::reportMemoryUsageToProfiler( + block, -block->size, c10::Device(c10::DeviceType::CUDA, block->device)); + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*(block->pool)))] = true; + update_stat_array(stats.allocation, -1, {stat_types}); + update_stat_array(stats.allocated_bytes, -block->size, {stat_types}); + + if (!block->stream_uses.empty()) { + insert_events(block); + } else { + free_block(block); + } + } + + void* getBaseAllocation(Block* block, size_t* outSize) { + std::lock_guard lock(mutex); + while (block->prev) { + block = block->prev; + } + void *basePtr = block->ptr; + if (outSize) { + size_t size = 0; + while (block) { + size += block->size; + block = block->next; + } + *outSize = size; + } + return basePtr; + } + + void recordStream(Block* block, cuda::CUDAStream stream) { + std::lock_guard lock(mutex); + if (stream.stream() == block->stream) { + // ignore uses on the allocation stream, since those don't require any + // special synchronization + return; + } + block->stream_uses.insert(stream); + } + + /** set memory fraction to limit maximum allocated memory **/ + void setMemoryFraction(double fraction) { + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + allowed_memory_maximum = static_cast(fraction * device_total); + set_fraction = true; + } + + /** returns cached blocks to the system allocator **/ + void emptyCache() { + std::lock_guard lock(mutex); + free_cached_blocks(); + } + + /** Retrieves info (total size + largest block) of the memory cache **/ + void cacheInfo(size_t* total, size_t* largest) + { + std::lock_guard lock(mutex); + if (*largest == 0) { // make an initial guess if a zero *largest is passed in + size_t tmp_bytes; + cudaMemGetInfo(largest, // Use free memory as an optimistic initial guess of *largest + &tmp_bytes); + } + cache_info_aux(large_blocks, total, largest); + cache_info_aux(small_blocks, total, largest); + } + + /** Returns a copy of the memory allocator stats **/ + DeviceStats getStats() { + std::lock_guard lock(mutex); + return stats; + } + + /** Resets the historical accumulation stats for the device **/ + void resetAccumulatedStats() { + std::lock_guard lock(mutex); + + for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { + reset_accumulated_stat(stats.allocation[statType]); + reset_accumulated_stat(stats.segment[statType]); + reset_accumulated_stat(stats.active[statType]); + reset_accumulated_stat(stats.inactive_split[statType]); + 
reset_accumulated_stat(stats.allocated_bytes[statType]); + reset_accumulated_stat(stats.reserved_bytes[statType]); + reset_accumulated_stat(stats.active_bytes[statType]); + reset_accumulated_stat(stats.inactive_split_bytes[statType]); + } + + stats.num_alloc_retries = 0; + stats.num_ooms = 0; + } + + /** Resets the historical peak stats for the device **/ + void resetPeakStats() { + std::lock_guard lock(mutex); + + for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { + reset_peak_stat(stats.allocation[statType]); + reset_peak_stat(stats.segment[statType]); + reset_peak_stat(stats.active[statType]); + reset_peak_stat(stats.inactive_split[statType]); + reset_peak_stat(stats.allocated_bytes[statType]); + reset_peak_stat(stats.reserved_bytes[statType]); + reset_peak_stat(stats.active_bytes[statType]); + reset_peak_stat(stats.inactive_split_bytes[statType]); + } + } + + /** Dump a complete snapshot of the memory held by the allocator. Potentially VERY expensive. **/ + std::vector snapshot() const { + std::lock_guard lock(mutex); + + std::vector result; + const auto all_blocks = get_all_blocks(); + + for (const Block* const head_block : all_blocks) { + if (head_block->prev != nullptr) { + continue; + } + result.emplace_back(); + SegmentInfo& segment_info = result.back(); + segment_info.device = head_block->device; + segment_info.address = reinterpret_cast(head_block->ptr); + segment_info.is_large = (head_block->pool == &large_blocks); + + const Block* block = head_block; + while (block != nullptr) { + segment_info.blocks.emplace_back(); + BlockInfo& block_info = segment_info.blocks.back(); + + block_info.size = block->size; + block_info.allocated = block->allocated; + block_info.active = block->allocated || (block->event_count > 0); + + segment_info.total_size += block_info.size; + if (block_info.allocated) { + segment_info.allocated_size += block_info.size; + } + if (block_info.active) { + segment_info.active_size += block_info.size; + } + + block = block->next; + } + } + + std::sort(result.begin(), result.end(), [](const SegmentInfo& a, const SegmentInfo& b) { + return a.address < b.address; + }); + + return result; + } + + static size_t round_size(size_t size) { + if (size < kMinBlockSize) { + return kMinBlockSize; + } else { + return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + } + } + + private: + + // All private methods do not acquire the allocator mutex. 
+ + std::vector get_all_blocks() const { + std::vector blocks; + blocks.insert(blocks.end(), small_blocks.begin(), small_blocks.end()); + blocks.insert(blocks.end(), large_blocks.begin(), large_blocks.end()); + blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end()); + return blocks; + } + + /** moves a block into a pool of cached free blocks */ + void free_block(Block* block) + { + TORCH_INTERNAL_ASSERT(!block->allocated && block->event_count == 0); + + size_t original_block_size = block->size; + + auto& pool = *block->pool; + int64_t net_change_inactive_split_blocks = 0; + int64_t net_change_inactive_split_size = 0; + + const std::array merge_candidates = {block->prev, block->next}; + for (Block* merge_candidate : merge_candidates) { + const int64_t subsumed_size = try_merge_blocks(block, merge_candidate, pool); + if (subsumed_size > 0) { + net_change_inactive_split_blocks -= 1; + net_change_inactive_split_size -= subsumed_size; + } + } + + active_blocks.erase(block); + pool.insert(block); + + if (block->is_split()) { + net_change_inactive_split_blocks += 1; + net_change_inactive_split_size += block->size; + } + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*(block->pool)))] = true; + update_stat_array(stats.inactive_split, net_change_inactive_split_blocks, stat_types); + update_stat_array(stats.inactive_split_bytes, net_change_inactive_split_size, stat_types); + update_stat_array(stats.active, -1, stat_types); + update_stat_array(stats.active_bytes, -original_block_size, stat_types); + } + + /** combine previously split blocks. returns the size of the subsumed block, or 0 on failure. */ + size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) + { + if (!src || src->allocated || src->event_count > 0) { + return 0; + } + + AT_ASSERT(dst->is_split() && src->is_split()); + + if (dst->prev == src) { + dst->ptr = src->ptr; + dst->prev = src->prev; + if (dst->prev) { + dst->prev->next = dst; + } + } else { + dst->next = src->next; + if (dst->next) { + dst->next->prev = dst; + } + } + + const size_t subsumed_size = src->size; + dst->size += subsumed_size; + pool.erase(src); + delete src; + + return subsumed_size; + } + + BlockPool& get_pool(size_t size) { + if (size <= kSmallSize) { + return small_blocks; + } else { + return large_blocks; + } + } + + StatType get_stat_type_for_pool(const BlockPool& pool) { + if (&pool == &small_blocks) { + return StatType::SMALL_POOL; + } else if (&pool == &large_blocks) { + return StatType::LARGE_POOL; + } else { + AT_ERROR("get_stat_type_for_pool: invalid pool"); + } + } + + bool should_split(const Block* block, size_t size) { + size_t remaining = block->size - size; + if (block->pool == &small_blocks) { + return remaining >= kMinBlockSize; + } else if (block->pool == &large_blocks) { + return remaining > kSmallSize; + } else { + AT_ERROR("should_split: invalid pool"); + } + } + + static size_t get_allocation_size(size_t size) { + if (size <= kSmallSize) { + return kSmallBuffer; + } else if (size < kMinLargeAlloc) { + return kLargeBuffer; + } else { + return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + } + } + + bool get_free_block(AllocParams& p) { + BlockPool& pool = *p.pool; + auto it = pool.lower_bound(&p.search_key); + if (it == pool.end() || (*it)->stream != p.stream()) + return false; + p.block = *it; + pool.erase(it); + return true; + } + + bool trigger_free_memory_callbacks(AllocParams& p) { + bool freed_memory = false; + for (const 
auto& name : FreeCudaMemoryCallbacksRegistry()->Keys()) { + freed_memory |= + FreeCudaMemoryCallbacksRegistry()->Create(name)->Execute(); + } + return freed_memory; + } + + bool alloc_block(AllocParams& p, bool isRetry) { + // Defensively checks for preexisting CUDA error state. + C10_CUDA_CHECK(cudaGetLastError()); + + size_t size = p.alloc_size; + void* ptr; + + if (isRetry) { + stats.num_alloc_retries += 1; + } + + if (set_fraction && total_allocated_memory + size > allowed_memory_maximum) { + p.err = cudaErrorMemoryAllocation; + return false; + } else { + p.err = cudaMallocManaged(&ptr, size); + if (p.err != cudaSuccess) { + if (p.err == cudaErrorMemoryAllocation) { + // If this is the first attempt (!isRetry), we can forgive and clear CUDA's + // internal error state. + // If this is the second attempt (isRetry), malloc's TORCH_CHECK_WITH will take + // over to throw a helpful exception. The user can choose to catch the exception, + // free some stuff in their script, and attempt their allocation again. + // In this case, we can also forgive and clear CUDA's internal error state. + cudaGetLastError(); + } else { + // If the error's unrelated to memory allocation, we should throw immediately. + C10_CUDA_CHECK(p.err); + } + return false; + } + } + + total_allocated_memory += size; + p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr); + update_stat_array(stats.segment, 1, p.stat_types); + update_stat_array(stats.reserved_bytes, size, p.stat_types); + + // p.block came from new, not cudaMalloc. It should not be nullptr here. + TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); + return true; + } + + bool free_cached_blocks() + { + // First ensure that all blocks that can't currently be allocated due to + // outstanding events are returned to the pool. + synchronize_and_free_events(); + + // Free all non-split cached blocks + free_blocks(large_blocks); + free_blocks(small_blocks); + return true; + } + + void free_blocks(BlockPool& blocks) + { + // Frees all non-split blocks + auto it = blocks.begin(); + while (it != blocks.end()) { + Block* block = *it; + if (!block->prev && !block->next) { + C10_CUDA_CHECK(cudaFree((void*)block->ptr)); + total_allocated_memory -= block->size; + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*(block->pool)))] = true; + update_stat_array(stats.segment, -1, stat_types); + update_stat_array(stats.reserved_bytes, -block->size, stat_types); + + auto cur = it; + ++it; + blocks.erase(cur); + delete block; + } else { + ++it; + } + } + } + + cudaEvent_t create_event_internal() { + cudaEvent_t event; + C10_CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + return event; + } + + void free_event_internal(cudaEvent_t event) { + C10_CUDA_CHECK(cudaEventDestroy(event)); + } + + void synchronize_and_free_events() { + // Synchronize on outstanding events and then free associated blocks. 
+ + for (auto& e : cuda_events) { + cudaEvent_t event = e.first; + Block* block = e.second; + + C10_CUDA_CHECK(cudaEventSynchronize(event)); + free_event_internal(event); + + block->event_count--; + if (block->event_count == 0) { + free_block(block); + } + } + + cuda_events.clear(); + } + + void insert_events(Block* block) + { + int prev_device; + C10_CUDA_CHECK(cudaGetDevice(&prev_device)); + + stream_set streams(std::move(block->stream_uses)); + AT_ASSERT(block->stream_uses.empty()); + for (auto it = streams.begin(); it != streams.end(); ++it) { + C10_CUDA_CHECK(cudaSetDevice(it->device_index())); + + cudaEvent_t event = create_event_internal(); + C10_CUDA_CHECK(cudaEventRecord(event, it->stream())); + + block->event_count++; + cuda_events.emplace_back(event, block); + } + + C10_CUDA_CHECK(cudaSetDevice(prev_device)); + } + + void process_events() + { + // Process outstanding cudaEvents. Events that are completed are removed + // from the queue, and the 'event_count' for the corresponding allocation + // is decremented. Stops at the first event which has not been completed. + // Since events on different devices or streams may occur out of order, + // the processing of some events may be delayed. + while (!cuda_events.empty()) { + auto& e = cuda_events.front(); + cudaEvent_t event = e.first; + Block* block = e.second; + + cudaError_t err = cudaEventQuery(event); + if (err == cudaErrorNotReady) { + // ignore and clear the error if not ready + cudaGetLastError(); + break; + } else if (err != cudaSuccess) { + C10_CUDA_CHECK(err); + } + + free_event_internal(event); + + block->event_count--; + if (block->event_count == 0) { + free_block(block); + } + cuda_events.pop_front(); + } + } + + // Accumulates sizes of all memory blocks for given device in given pool + void cache_info_aux(BlockPool& blocks, size_t* total, size_t* largest) + { + for (const auto& block : blocks) { + size_t blocksize = block->size; + *total += blocksize; + if (blocksize > *largest) { + *largest = blocksize; + } + } + } +}; + +class THCCachingAllocator { + + private: + + std::mutex mutex; + + // allocated blocks by device pointer + std::unordered_map allocated_blocks; + + // lock around calls to cudaFree (to prevent deadlocks with NCCL) + mutable std::mutex cuda_free_mutex; + + void add_allocated_block(Block* block) { + std::lock_guard lock(mutex); + allocated_blocks[block->ptr] = block; + } + + public: + + std::vector> device_allocator; + + std::mutex* getCudaFreeMutex() const { + return &cuda_free_mutex; + } + + Block* get_allocated_block(void *ptr, bool remove=false) { + std::lock_guard lock(mutex); + auto it = allocated_blocks.find(ptr); + if (it == allocated_blocks.end()) { + return nullptr; + } + Block* block = it->second; + if (remove) { + allocated_blocks.erase(it); + } + return block; + } + + void init(int device_count) { + int size = device_allocator.size(); + if (size < device_count) { + device_allocator.resize(device_count); + for (int i = size; i < device_count; i++) { + device_allocator[i] = std::unique_ptr(new DeviceCachingAllocator()); + } + } + } + + /** allocates a block which is safe to use from the provided stream */ + void malloc(void** devPtr, int device, size_t size, cudaStream_t stream) { + TORCH_INTERNAL_ASSERT( + 0 <= device && device < device_allocator.size(), + "Allocator not initialized for device ", + device, + ": did you call init?"); + Block* block = device_allocator[device]->malloc(device, size, stream); + add_allocated_block(block); + *devPtr = (void*)block->ptr; + } + + void free(void* 
ptr) { + if (!ptr) { + return; + } + Block* block = get_allocated_block(ptr, true /* remove */); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } + device_allocator[block->device]->free(block); + } + + void setMemoryFraction(double fraction, int device) { + TORCH_INTERNAL_ASSERT( + 0 <= device && device < device_allocator.size(), + "Allocator not initialized for device ", + device, + ": did you call init?"); + TORCH_INTERNAL_ASSERT( + 0 <= fraction && fraction <= 1, + "invalid fraction:", + fraction, + ". Please set within (0, 1)."); + int activated_device; + cudaGetDevice (&activated_device); + if (activated_device != device) { + cudaSetDevice(device); + } + device_allocator[device]->setMemoryFraction(fraction); + } + + void emptyCache() { + int count = device_allocator.size(); + for (int i = 0; i < count; i++) + device_allocator[i]->emptyCache(); + } + + void* getBaseAllocation(void* ptr, size_t* outSize) + { + Block* block = get_allocated_block(ptr); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } + return device_allocator[block->device]->getBaseAllocation(block, outSize); + } + + void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) { + // Empty tensor's storage().data() might be a null ptr. As there is no + // blocks associated with those tensors, it is fine to do nothing here. + if (!ptr.get()) { + return; + } + + // If a tensor is not allocated by this instance, simply skip + // This usually happens when CUDA tensors are shared across processes, + // we have implemented reference counting based sharing mechanism to + // guarantee tensors won't be accidentally freed by one process while + // they are still being used in another + if (ptr.get_deleter() != &raw_delete) + return; + + Block* block = get_allocated_block(ptr.get()); + // block must not be null reaching here + TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found"); + device_allocator[block->device]->recordStream(block, stream); + } + + std::vector snapshot() { + std::vector result; + int count = device_allocator.size(); + for (int i = 0; i < count; i++) { + auto snap = device_allocator[i]->snapshot(); + result.insert(result.end(), snap.begin(), snap.end()); + } + + return result; + } +}; + +THCCachingAllocator caching_allocator; + +// Returns whether to force all allocations to bypass the caching allocator and +// go straight to cudaMalloc. This setting is useful when debugging GPU memory +// errors, since the caching allocator foils cuda-memcheck. 
+bool forceUncachedAllocator() { + static bool force_uncached = + getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr; + return force_uncached; +} + +static void uncached_delete(void* ptr) { + C10_CUDA_CHECK(cudaFree(ptr)); +} + +// NB: I decided not to fold this into THCCachingAllocator, because the latter +// has a lot more methods and it wasn't altogether clear that they should +// actually be publicly exposed +struct CudaCachingAllocator : public Allocator { + DataPtr allocate(size_t size) const override { + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + if (forceUncachedAllocator()) { + C10_CUDA_CHECK(cudaMallocManaged(&r, size)); + return {r, r, &uncached_delete, Device(DeviceType::CUDA, device)}; + } + if (size != 0) { + caching_allocator.malloc(&r, device, size, cuda::getCurrentCUDAStream(device)); + } + return {r, r, &raw_delete, Device(DeviceType::CUDA, device)}; + } + DeleterFnPtr raw_deleter() const override { + return &raw_delete; + } +}; + +CudaCachingAllocator device_allocator; + +Allocator* get(void) +{ + return &device_allocator; +} + +void init(int device_count) { + caching_allocator.init(device_count); +} + +void setMemoryFraction(double fraction, int device) { + caching_allocator.setMemoryFraction(fraction, device); +} + +void emptyCache(void) { + caching_allocator.emptyCache(); +} + +void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) { + caching_allocator.device_allocator[dev_id]->cacheInfo(cachedAndFree, largestBlock); +} + +void* getBaseAllocation(void *ptr, size_t *size) +{ + return caching_allocator.getBaseAllocation(ptr, size); +} + +void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) +{ + caching_allocator.recordStream(ptr, stream); +} + +std::mutex* getFreeMutex() +{ + return caching_allocator.getCudaFreeMutex(); +} + +static inline void assertValidDevice(int device) { + int device_num = caching_allocator.device_allocator.size(); + TORCH_CHECK(0 <= device && device < device_num, "Invalid device argument."); +} + +DeviceStats getDeviceStats(int device) { + assertValidDevice(device); + return caching_allocator.device_allocator[device]->getStats(); +} + +void resetAccumulatedStats(int device) { + assertValidDevice(device); + caching_allocator.device_allocator[device]->resetAccumulatedStats(); +} + +void resetPeakStats(int device) { + assertValidDevice(device); + caching_allocator.device_allocator[device]->resetPeakStats(); +} + +std::vector snapshot() { + return caching_allocator.snapshot(); +} + +// +// In CUDA IPC, sender sends a tensor to receiver, getIpcDevPtr +// is called by the receiving process to map the CUDA memory from the sending +// process into its own address space. +// +// CUDA IPC only allows sharing a big memory block associated with a cudaIpcMemHandle_t +// and it can be opened only **once** per context per process. There can be +// multiple types of storage in the same IPC mem block, so we must cache the +// device ptr to construct typed storage as it comes. +// +// ipcMemHandle_to_devptr maps a cudaIpcMemHandle_t to a device pointer in the process +// that can be used to access the memory block in the sender process. +// It only saves a weak_ptr of the device pointer in the map, the shared_ptr +// will be used to reconstruct all storages in this CudaMalloc allocation. +// And it will deleted in cudaIpcCloseMemHandle when its reference count is 0. 
+// +namespace { + std::mutex IpcMutex; + std::unordered_map> ipcMemHandle_to_devptr; +} + +std::shared_ptr getIpcDevPtr(std::string handle) { + std::lock_guard lock(IpcMutex); + + auto iter = ipcMemHandle_to_devptr.find(handle); + if (iter != ipcMemHandle_to_devptr.end()) { + auto devptr = iter->second.lock(); + if (devptr) return devptr; + } + // This ipcMemHandle hasn't been opened, or already expired, open it to + // enable IPC access to that mem block. + void *dev = nullptr; + auto ipc_handle = reinterpret_cast(handle.c_str()); + C10_CUDA_CHECK(cudaIpcOpenMemHandle(&dev, *ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + // devPtr has to be deleted in same device when created. + int curr_device; + C10_CUDA_CHECK(cudaGetDevice(&curr_device)); + auto sp = std::shared_ptr( + dev, + [handle, curr_device](void *ptr) { + cuda::CUDAGuard device_guard(curr_device); + std::lock_guard deleter_lock(IpcMutex); + C10_CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); + ipcMemHandle_to_devptr.erase(handle);}); + std::weak_ptr wp = sp; + // To eliminate an additional search, we can use insert(). + // It doesn't overwrite when key already exists(ptr expired). + // But in the deleter for sp we erased the entry, + // this should be safe to do now. + ipcMemHandle_to_devptr.insert(iter, {handle, wp}); + + return sp; +} + +void* raw_alloc(size_t nbytes) { + if (nbytes == 0) { + return nullptr; + } + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + caching_allocator.malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device)); + return r; +} + +void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) { + if (nbytes == 0) { + return nullptr; + } + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + caching_allocator.malloc(&r, device, nbytes, stream); + return r; +} + +void raw_delete(void* ptr) { + caching_allocator.free(ptr); +} + +} // namespace CUDACachingAllocator + +}} // namespace c10::cuda diff --git a/software/mps/README.md b/software/mps/README.md new file mode 100644 index 0000000..0432276 --- /dev/null +++ b/software/mps/README.md @@ -0,0 +1,22 @@ +# MPS + +This folder contains `MPS` code and scripts, also modification to the PyTorch code (PyTorch v1.8.1). + +## Files + +- PyTorch file + - CUDACachingAllocator.cpp: $PYTORCH\_PATH/c10/cuda/CUDACachingAllocator.cpp + +- script + - enable\_mps.sh: set GPU to EXCLUSIVE mode and start MPS service. + - stop\_mps.sh: quit MPS service and set GPU to DEFAULT mode. + +- mps\_main.py: execute one training task. + +- run\_mps.sh: execute pairwise training tasks in order. + +## Usage + +``` +./run_mps.sh +``` diff --git a/software/mps/mps_main.py b/software/mps/mps_main.py new file mode 100644 index 0000000..e725051 --- /dev/null +++ b/software/mps/mps_main.py @@ -0,0 +1,34 @@ +import sys +import time +import importlib + +import torch + +def read_list(model_list_file_name): + model_list = [] + with open(model_list_file_name) as f: + for line in f.readlines(): + if len(line.split()) != 3: + continue + model_list.append([line.split()[0], line.split()[1], line.split()[2]]) + return model_list + + +def main(): + # Load model list (task & data) + model_list = read_list(sys.argv[1]) + model_id = int(sys.argv[2]) + + task_name = model_list[model_id][0] + data_name = model_list[model_id][1] + num_layers = int(model_list[model_id][2]) + + model_module = importlib.import_module('task.' 
+ task_name) + model, func, _ = model_module.import_task(data_name, num_layers) + _, data = model_module.import_model(data_name, num_layers) + output = func(model, data) +# print('Training time: {} ms'.format(output)) + + +if __name__ == '__main__': + main() diff --git a/software/mps/overwrite.sh b/software/mps/overwrite.sh new file mode 100755 index 0000000..68d605e --- /dev/null +++ b/software/mps/overwrite.sh @@ -0,0 +1,3 @@ +#!/bin/bash +PYTORCH_PATH=$1 +cp CUDACachingAllocator.cpp $PYTORCH_PATH/c10/cuda/CUDACachingAllocator.cpp diff --git a/software/mps/run_mps.sh b/software/mps/run_mps.sh new file mode 100755 index 0000000..fa2bbfd --- /dev/null +++ b/software/mps/run_mps.sh @@ -0,0 +1,16 @@ +#!/bin/bash +network=("GCN" "GraphSAGE" "GAT" "GIN" "mix") +for i in `seq 0 9`; +do + while true + do + process=`ps aux | grep mps_main | grep -v grep` + if [ "$process" == "" ]; then + taskId=`expr $i \* 2` + CUDA_VISIBLE_DEVICES=0 python mps_main.py ../data/model/${network[4]}_model.txt ${taskId} & + taskId=`expr ${taskId} + 1` + CUDA_VISIBLE_DEVICES=0 python mps_main.py ../data/model/${network[4]}_model.txt ${taskId} + break + fi + done +done diff --git a/software/mps/script/enable_mps.sh b/software/mps/script/enable_mps.sh new file mode 100755 index 0000000..4b7f52b --- /dev/null +++ b/software/mps/script/enable_mps.sh @@ -0,0 +1,4 @@ +#!/bin/bash +#export CUDA_VISIBLE_DEVICES="1" +nvidia-smi -i 0 -c EXCLUSIVE_PROCESS +nvidia-cuda-mps-control -d diff --git a/software/mps/script/stop_mps.sh b/software/mps/script/stop_mps.sh new file mode 100755 index 0000000..678adb4 --- /dev/null +++ b/software/mps/script/stop_mps.sh @@ -0,0 +1,3 @@ +#!/bin/bash +echo quit | nvidia-cuda-mps-control +nvidia-smi -i 0 -c DEFAULT diff --git a/software/pipeswitch/README.md b/software/pipeswitch/README.md new file mode 100644 index 0000000..0d8f663 --- /dev/null +++ b/software/pipeswitch/README.md @@ -0,0 +1,17 @@ +# PipeSwitch + +This folder contains the modified `PipeSwitch` used for comparison. + +## Files + +- run\_server.sh: specify the model list to load and start the `PipeSwitch` server. + +- others: manage and execute training models in a time-sharing manner. + +## Usage + +``` +./run_server.sh +``` + +After starting the server, wait a few seconds for the models to load before sending training requests.
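For reference, the request format the frontend expects can be read off `frontend_tcp.py` later in this patch: each of the task name, dataset name and layer count is sent as a 4-byte length prefix followed by the encoded string, and `main.py` listens on `localhost:12345`. The snippet below is only an illustrative stand-in for the real client code; the task and dataset names are placeholders.

```python
import socket
import struct

def send_field(sock, value):
    payload = value.encode()
    sock.sendall(struct.pack('I', len(payload)))  # 4-byte length prefix
    sock.sendall(payload)

with socket.create_connection(('localhost', 12345)) as sock:
    send_field(sock, 'gcn')      # task name (placeholder)
    send_field(sock, 'reddit')   # dataset name (placeholder)
    send_field(sock, '2')        # number of layers, sent as a string
    print(sock.recv(4))          # the worker reports completion with b'FNSH'
```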
diff --git a/software/pipeswitch/frontend_schedule.py b/software/pipeswitch/frontend_schedule.py new file mode 100644 index 0000000..9e9a074 --- /dev/null +++ b/software/pipeswitch/frontend_schedule.py @@ -0,0 +1,124 @@ +import threading +import torch +import importlib +import time + +from util.util import timestamp +from cognn.policy import * + +class FrontendScheduleThd(threading.Thread): + def __init__(self, model_list, qin, worker_list): + super(FrontendScheduleThd, self).__init__() + self.model_list = model_list + self.qin = qin + self.worker_list = worker_list + self.cur_w_idx = 0 + + def run(self): + timestamp('schedule', 'start') + + # Load models + models = {} + para_bytes = {} + comp_bytes = {} + + for model_name in self.model_list: + hash_name = hash('{}_{}_{}'.format(model_name[0], model_name[1], int(model_name[2]))) + models[hash_name], _, _ = self._load_model(model_name) + + timestamp('schedule', 'load_model') + + # Create CUDA stream + cuda_stream_for_parameter = torch.cuda.Stream() + timestamp('schedule', 'create_stream') + + job_list = [] + while True: + # Get request + agent, task_name, data_name, num_layers = self.qin.get() + job_list.append([agent, task_name, data_name, num_layers]) + timestamp('schedule', 'get_request') + if len(job_list) == len(models): + break + +# for group_iter in range(len(reorder_job_list)): +# print('num of elements: {}'.format(len(reorder_job_list[group_iter]))) +# print('after here') + para_cache_size, comp_cache_size = 1024 * 1024 * 1024, 25 * 1024 * 1024 * 1024 + for group_iter in range(len(reorder_job_list)): + for job in reorder_job_list[group_iter]: + agent, task_name, data_name, num_layers = job[0], job[1], job[2], job[3] + # get next worker to work on request + self.cur_w_idx %= len(self.worker_list) + new_pipe, _, param_trans_pipe_parent, _ = self.worker_list[self.cur_w_idx] + self.cur_w_idx += 1 + + # send request to new worker + model_name = [] + for model in self.model_list: + if model[0] == task_name and model[1] == data_name and model[2] == num_layers: + model_name = model + new_pipe.send((agent, model_name, para_cache_size, comp_cache_size)) + timestamp('schedule', 'notify_new_worker') + + # allocate cache to streams + num_layers = int(model_name[2]) + timestamp('schedule', 'insert_cache') + with torch.cuda.stream(cuda_stream_for_parameter): + torch.cuda.insert_shared_cache_for_computation(para_cache_size, 0) + + # transfer parameters to gpu + batched_parameter_list = models[hash('{}_{}_{}'.format(task_name, data_name, num_layers))] + self._transfer_parameter(new_pipe, + batched_parameter_list, + cuda_stream_for_parameter, + param_trans_pipe_parent) + timestamp('schedule', 'transfer_parameters') + + # clear status + with torch.cuda.stream(cuda_stream_for_parameter): + torch.cuda.clear_shared_cache() # pylint: disable=no-member + timestamp('schedule', 'clear_status') + + res_counter = 0 + while True: + self.cur_w_idx %= len(self.worker_list) + new_pipe, _, _, _ = self.worker_list[self.cur_w_idx] + self.cur_w_idx += 1 + # Recv response + if new_pipe.poll(): + res = new_pipe.recv() + res_counter += 1 + if res_counter == len(reorder_job_list[group_iter]): + break + + def _load_model(self, model_name): + # Import parameters + model_module = importlib.import_module('task.' 
+ model_name[0]) + batched_parameter_list, comp_total_bytes = model_module.import_parameters(model_name[1], int(model_name[2])) + # Preprocess batches + processed_batched_parameter_list = [] + batched_parameter_total_bytes = 0 + for param, mod_list in batched_parameter_list: + if param is None: + processed_batched_parameter_list.append((None, mod_list)) + else: + processed_batched_parameter_list.append((param.pin_memory(), mod_list)) + batched_parameter_total_bytes += param.element_size() * param.nelement() + + return processed_batched_parameter_list, batched_parameter_total_bytes, comp_total_bytes + + def _transfer_parameter(self, pipe, + batched_parameter_list, + cuda_stream_for_parameter, + param_trans_pipe): + param_cuda_list = [] + for param, mod_list in batched_parameter_list: + with torch.cuda.stream(cuda_stream_for_parameter): + if param is not None: + param_cuda = param.cuda(non_blocking=True) + param_cuda_list.append(param_cuda) + e = torch.cuda.Event() + e.record() + e.synchronize() + param_trans_pipe.send(mod_list[0]) diff --git a/software/pipeswitch/frontend_tcp.py b/software/pipeswitch/frontend_tcp.py new file mode 100644 index 0000000..0402f8d --- /dev/null +++ b/software/pipeswitch/frontend_tcp.py @@ -0,0 +1,37 @@ +import threading +import struct + +from util.util import timestamp + +class FrontendTcpThd(threading.Thread): + def __init__(self, qout, agent): + super(FrontendTcpThd, self).__init__() + self.qout = qout + self.agent = agent + + def run(self): + while True: + timestamp('tcp', 'listening') + + task_name_length_b = self.agent.recv(4) + task_name_length = struct.unpack('I', task_name_length_b)[0] + if task_name_length == 0: + break + task_name_b = self.agent.recv(task_name_length) + task_name = task_name_b.decode() + timestamp('tcp', 'get_task_name') + + data_length_b = self.agent.recv(4) + data_length = struct.unpack('I', data_length_b)[0] + data_b = self.agent.recv(data_length) + data_name = data_b.decode() + timestamp('tcp', 'get_data_name') + + layer_length_b = self.agent.recv(4) + layer_length = struct.unpack('I', layer_length_b)[0] + layer_b = self.agent.recv(layer_length) + num_layers = layer_b.decode() + timestamp('tcp', 'get_num_layers') + + self.qout.put((self.agent, task_name, data_name, num_layers)) + timestamp('tcp', 'enqueue_request') diff --git a/software/pipeswitch/main.py b/software/pipeswitch/main.py new file mode 100644 index 0000000..ba551fe --- /dev/null +++ b/software/pipeswitch/main.py @@ -0,0 +1,64 @@ +import sys +import time +from queue import Queue + +import torch +import torch.multiprocessing as mp + +from cognn.frontend_tcp import FrontendTcpThd +from cognn.frontend_schedule import FrontendScheduleThd +from cognn.worker import WorkerProc +from util.util import timestamp, TcpAgent, TcpServer + +def main(): + timestamp('frontend', 'start') + + # Load model list (task & data) + model_list_file_name = sys.argv[1] + model_list = [] + with open(model_list_file_name) as f: + for line in f.readlines(): + if len(line.split()) != 3: + continue + model_list.append([line.split()[0], line.split()[1], line.split()[2]]) + + # Warm up CUDA and allocate shared cache + torch.randn(1024, device='cuda') + torch.cuda.allocate_shared_cache() + + # Create workers + num_workers = 1 + worker_list = [] + for _ in range(num_workers): + p_parent, p_child = mp.Pipe() + param_trans_parent, param_trans_child = mp.Pipe() + worker = WorkerProc(model_list, p_child, param_trans_child) + worker.start() + torch.cuda.send_shared_cache() + worker_list.append((p_parent, 
worker, param_trans_parent)) + timestamp('frontend', 'create_worker') + + + # Create request queue and scheduler thread + requests_queue = Queue() + t_sch = FrontendScheduleThd(model_list, requests_queue, worker_list) + t_sch.start() + timestamp('frontend', 'start_schedule') + + # Accept connections + server = TcpServer('localhost', 12345) + timestamp('tcp', 'listen') + while True: + conn, _ = server.accept() + agent = TcpAgent(conn) + timestamp('tcp', 'connected') + t_tcp = FrontendTcpThd(requests_queue, agent) + t_tcp.start() + + # Wait for end + t_sch.join() + + +if __name__ == '__main__': + mp.set_start_method('spawn') + main() diff --git a/software/pipeswitch/run_server.sh b/software/pipeswitch/run_server.sh new file mode 100755 index 0000000..b98db88 --- /dev/null +++ b/software/pipeswitch/run_server.sh @@ -0,0 +1,2 @@ +#!/bin/bash +CUDA_VISIBLE_DEVICES=0 python main.py ../data/model/mix_model.txt diff --git a/software/pipeswitch/worker.py b/software/pipeswitch/worker.py new file mode 100644 index 0000000..776f594 --- /dev/null +++ b/software/pipeswitch/worker.py @@ -0,0 +1,45 @@ +from queue import Queue +from multiprocessing import Process +import torch +import time + +from cognn.worker_common import ModelSummary +from util.util import timestamp + +class WorkerProc(Process): + def __init__(self, model_list, pipe, param_trans_pipe): + super(WorkerProc, self).__init__() + self.model_list = model_list + self.pipe = pipe + self.param_trans_pipe = param_trans_pipe + + def run(self): + timestamp('worker', 'start') + + # Warm up CUDA and get shared cache + torch.randn(1024, device='cuda') + time.sleep(1) + torch.cuda.recv_shared_cache() # pylint: disable=no-member + timestamp('worker', 'share_gpu_memory') + + while True: # dispatch workers for task execution + agent, model_name, para_cache_info, comp_cache_info = self.pipe.recv() + model_summary = ModelSummary(model_name, para_cache_size, comp_cache_size, self.param_trans_pipe) + timestamp('worker', 'import models') + timestamp('worker', 'after importing models') + + # start doing training + try: + with torch.cuda.stream(model_summary.cuda_stream_for_computation): + output = model_summary.execute() + print('output: {}'.format(output)) +# print ('Training time: {} ms'.format(output)) + del output + + self.pipe.send('FNSH') + agent.send(b'FNSH') + + except Exception as e: + complete_queue.put('FNSH') + timestamp('worker_comp_thd', 'complete') + model_summary.reset_initialized(model_summary.model) diff --git a/software/pipeswitch/worker_common.py b/software/pipeswitch/worker_common.py new file mode 100644 index 0000000..5e43ce3 --- /dev/null +++ b/software/pipeswitch/worker_common.py @@ -0,0 +1,63 @@ +import importlib + +import torch + +### Class +class ModelSummary(): + def __init__(self, model_name, para_cache_size, comp_cache_size, param_trans_pipe): + """ """ + self.task_name, self.data_name, self.num_layers = model_name[0], model_name[1], int(model_name[2]) + self.para_cache_size, self.comp_cache_size = para_cache_size, comp_cache_size + self.param_trans_pipe = param_trans_pipe + self.load_model() + + def execute(self): + return self.func(self.model, self.data) + + def reset_initialized(self, mod): + if hasattr(mod, 'initialized'): + mod.initialized = False + for child in mod.children(): + self.reset_initialized(child) + + def insert_lock_hook(self, shape_summary_list): + """ """ + for _, _, _, mod_sublist in shape_summary_list: + mod = mod_sublist[0] + mod.initialized = False + def hook_wait_for_parameter_lock(mod, input): + if not 
mod.initialized: + complete_name = self.param_trans_pipe.recv() + if complete_name != mod.fullname: + raise Exception('Invalid complete trans') + mod.initialized = True + mod.register_forward_pre_hook(hook_wait_for_parameter_lock) + + def load_model(self): + model_module = importlib.import_module('task.' + self.task_name) + self.model, self.func, self.shape_summary_list = model_module.import_task(self.data_name, self.num_layers) + _, self.data = model_module.import_model(self.data_name, self.num_layers) + # Eliminate parameters and buffers + self.reset_initialized(self.model) + + # Insert locks for waiting parameters and add pre_forward_hook to wait for locks + self.insert_lock_hook(self.shape_summary_list) + + # Allocate fake memory for parameters + self.cuda_stream_for_parameter = torch.cuda.Stream() + self.cuda_stream_for_computation = torch.cuda.Stream() + + print('{} {} {}'.format(self.task_name, self.data_name, self.num_layers)) + with torch.cuda.stream(self.cuda_stream_for_parameter): + torch.cuda.insert_shared_cache_for_computation(self.para_cache_size, 0) + + with torch.cuda.stream(self.cuda_stream_for_computation): + torch.cuda.insert_shared_cache_for_computation(self.comp_cache_size, self.para_cache_size) + + with torch.cuda.stream(self.cuda_stream_for_parameter): + for shape_list, param_list, buf_list, _ in self.shape_summary_list: + for shape, p in zip(shape_list[:len(param_list)], param_list): + p.data = torch.empty(shape, device='cuda') + for shape, b in zip(shape_list[len(param_list):], buf_list): + mod, key = b + mod._buffers[key] = torch.empty(shape, device='cuda') diff --git a/software/plugin/CUDACachingAllocator.cpp b/software/plugin/CUDACachingAllocator.cpp new file mode 100644 index 0000000..884c55e --- /dev/null +++ b/software/plugin/CUDACachingAllocator.cpp @@ -0,0 +1,1367 @@ +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include // CoGNN +#include // CoGNN +#include // CoGNN +#include // CoGNN +#include // CoGNN +#include // CoGNN +#include // CoGNN +#include // CoGNN +#define PORT 9010 // CoGNN +#define SIZE_SHARED_CACHE (26 * 1024UL * 1024UL * 1024UL) // CoGNN + +namespace c10 { + +C10_DEFINE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback); + + +namespace cuda { +namespace CUDACachingAllocator { + +// +// Yet another caching allocator for CUDA device allocations. +// +// - Allocations are associated with a stream. Once freed, blocks can be +// re-allocated on the same stream, but not on any other stream. +// - The allocator attempts to find the smallest cached block that will fit the +// requested size. If the block is larger than the requested size, it may be +// split. If no block is found, the allocator will delegate to cudaMalloc. +// - If the cudaMalloc fails, the allocator will free all cached blocks that +// are not split and retry the allocation. +// - Large (>1MB) and small allocations are stored in separate pools. +// Small requests are packed into 2MB buffers. Large requests will use the +// smallest available free block or allocate a new block using cudaMalloc. +// To reduce fragmentation, requests between 1MB and 10MB will allocate and +// split a 20MB block, if no free block of sufficient size is available. +// +// With this allocator, allocations and frees should logically be considered +// "usages" of the memory segment associated with streams, just like kernel +// launches. 
The programmer must insert the proper synchronization if memory +// segments are used from multiple streams. +// +// The library provides a recordStream() function to help insert the correct +// synchronization when allocations are used on multiple streams. This will +// ensure that the block is not reused before each recorded stream completes +// work. +// + + +namespace { + +using stream_set = std::unordered_set; + +constexpr size_t kMinBlockSize = 512; // all sizes are rounded to at least 512 bytes +constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB +constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks +constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks +constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer +constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB + +typedef std::bitset(StatType::NUM_TYPES)> StatTypes; + +void update_stat(Stat& stat, int64_t amount) { + stat.current += amount; + + TORCH_INTERNAL_ASSERT(stat.current >= 0, "Negative tracked stat in CUDA allocator (likely logic error)."); + + stat.peak = std::max(stat.current, stat.peak); + if (amount > 0) { + stat.allocated += amount; + } + if (amount < 0) { + stat.freed += -amount; + } +} + +void reset_accumulated_stat(Stat& stat) { + stat.allocated = 0; + stat.freed = 0; +} + +void reset_peak_stat(Stat& stat) { + stat.peak = stat.current; +} + +void update_stat_array(StatArray& stat_array, int64_t amount, const StatTypes& stat_types) { + for (size_t stat_type = 0; stat_type < stat_types.size(); ++stat_type) { + if (stat_types[stat_type]) { + update_stat(stat_array[stat_type], amount); + } + } +} + +struct Block; +typedef bool (*Comparison)(const Block*, const Block*); +typedef std::set BlockPool; + +struct Block { + int device; // gpu + cudaStream_t stream; // allocation stream + stream_set stream_uses; // streams on which the block was used + size_t size; // block size in bytes + BlockPool* pool; // owning memory pool + void* ptr; // memory address + bool allocated; // in-use flag + Block* prev; // prev block if split from a larger allocation + Block* next; // next block if split from a larger allocation + int event_count; // number of outstanding CUDA events + + Block(int device, cudaStream_t stream, size_t size, BlockPool* pool, void* ptr) : + device(device), stream(stream), stream_uses(), size(size), pool(pool), + ptr(ptr), allocated(0), prev(nullptr), next(nullptr), event_count(0) { } + + // constructor for search key + Block(int device, cudaStream_t stream, size_t size) : + device(device), stream(stream), stream_uses(), size(size), pool(nullptr), + ptr(nullptr), allocated(0), prev(nullptr), next(nullptr), event_count(0) { } + + bool is_split() const { + return (prev != nullptr) || (next != nullptr); + } +}; + +static bool BlockComparator(const Block* a, const Block* b) +{ + if (a->stream != b->stream) { + return (uintptr_t)a->stream < (uintptr_t)b->stream; + } + if (a->size != b->size) { + return a->size < b->size; + } + return (uintptr_t)a->ptr < (uintptr_t)b->ptr; +} + +static std::string format_size(uint64_t size) { + std::ostringstream os; + os.precision(2); + os << std::fixed; + if (size <= 1024) { + os << size << " bytes"; + } else if (size <= 1048576) { + os << (size / 1024.0); + os << " KiB"; + } else if (size <= 1073741824ULL) { + os << size / 1048576.0; + os << " MiB"; + } else { + os << size / 1073741824.0; + os << " GiB"; + } 
+ return os.str(); +} + +struct AllocParams { + AllocParams(int device, size_t size, cudaStream_t stream, BlockPool* pool, size_t alloc_size, + DeviceStats& stats) : + search_key(device, stream, size), + pool(pool), + alloc_size(alloc_size), + block(nullptr), + err(cudaSuccess) {} + + int device() { return search_key.device; } + cudaStream_t stream() { return search_key.stream; } + size_t size() { return search_key.size; } + + Block search_key; + BlockPool* pool; + size_t alloc_size; + Block* block; + StatTypes stat_types; + cudaError_t err; +}; + +} // namespace + +class DeviceCachingAllocator { + + private: + + // lock around all operations + mutable std::recursive_mutex mutex; + + // device statistics + DeviceStats stats; + + // unallocated cached blocks larger than 1 MB + BlockPool large_blocks; + + // unallocated cached blocks 1 MB or smaller + BlockPool small_blocks; + + // allocated or in use by a stream + std::unordered_set active_blocks; + + // outstanding cuda events + std::deque> cuda_events; + + // CoGNN: Pre-allocated shared GPU memory + void* COGNN_shared_ptr = NULL; + + // record used memory. + size_t total_allocated_memory = 0; + + size_t allowed_memory_maximum = 0; + + bool set_fraction = false; + + public: + + DeviceCachingAllocator() : + large_blocks(BlockComparator), + small_blocks(BlockComparator) {} + + // All public methods (except the above) acquire the allocator mutex. + // Thus, do not call a public method from another public method. + + Block* malloc(int device, size_t size, cudaStream_t stream) + { + std::unique_lock lock(mutex); + + // process outstanding cudaEvents + process_events(); + + size = round_size(size); + auto& pool = get_pool(size); + const size_t alloc_size = get_allocation_size(size); + AllocParams params(device, size, stream, &pool, alloc_size, stats); + params.stat_types[static_cast(StatType::AGGREGATE)] = true; + params.stat_types[static_cast(get_stat_type_for_pool(pool))] = true; + + bool block_found = + // Search pool + get_free_block(params) + // Trigger callbacks and retry search + || (trigger_free_memory_callbacks(params) && get_free_block(params)) + // Attempt allocate + || alloc_block(params, false) + // Free all non-split cached blocks and retry alloc. + || (free_cached_blocks() && alloc_block(params, true)); + + if (!block_found) { + // For any error code other than cudaErrorMemoryAllocation, + // alloc_block should have thrown an exception already. + TORCH_INTERNAL_ASSERT(params.err == cudaErrorMemoryAllocation); + + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + std::string allowed_info; + + if (set_fraction) { + allowed_info = format_size(allowed_memory_maximum) + " allowed; "; + } + + stats.num_ooms += 1; + + // "total capacity": total global memory on GPU + // "allowed": memory is allowed to use, which set by fraction. + // "already allocated": memory allocated by the program using the + // caching allocator + // "free": free memory as reported by the CUDA API + // "cached": memory held by the allocator but not used by the program + // + // The "allocated" amount does not include memory allocated outside + // of the caching allocator, such as memory allocated by other programs + // or memory held by the driver. + // + // The sum of "allocated" + "free" + "cached" may be less than the + // total capacity due to memory held by the driver and usage by other + // programs. 
+ // + // Note that at this point free_cached_blocks has already returned all + // possible "cached" memory to the driver. The only remaining "cached" + // memory is split from a larger block that is partially in-use. + TORCH_CHECK_WITH(CUDAOutOfMemoryError, false, + "CUDA out of memory. Tried to allocate ", format_size(alloc_size), + " (GPU ", device, "; ", + format_size(device_total), " total capacity; ", + format_size(stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current), + " already allocated; ", + format_size(device_free), " free; ", + allowed_info, + format_size(stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current), + " reserved in total by PyTorch)"); + } + + TORCH_INTERNAL_ASSERT(params.err == cudaSuccess && + params.block != nullptr && + params.block->ptr != nullptr); + Block* block = params.block; + Block* remaining = nullptr; + + const bool already_split = block->is_split(); + if (should_split(block, size)) { + remaining = block; + + block = new Block(device, stream, size, &pool, block->ptr); + block->prev = remaining->prev; + if (block->prev) { + block->prev->next = block; + } + block->next = remaining; + + remaining->prev = block; + remaining->ptr = static_cast(remaining->ptr) + size; + remaining->size -= size; + pool.insert(remaining); + + if (already_split) { + // An already-split inactive block is being shrunk by size bytes. + update_stat_array(stats.inactive_split_bytes, -block->size, params.stat_types); + } else { + // A new split inactive block is being created from a previously unsplit block, + // size remaining->size bytes. + update_stat_array(stats.inactive_split_bytes, remaining->size, params.stat_types); + update_stat_array(stats.inactive_split, 1, params.stat_types); + } + } else if (already_split) { + // An already-split block is becoming active + update_stat_array(stats.inactive_split_bytes, -block->size, params.stat_types); + update_stat_array(stats.inactive_split, -1, params.stat_types); + } + + block->allocated = true; + active_blocks.insert(block); + + c10::reportMemoryUsageToProfiler( + block, block->size, c10::Device(c10::DeviceType::CUDA, device)); + + update_stat_array(stats.allocation, 1, params.stat_types); + update_stat_array(stats.allocated_bytes, block->size, params.stat_types); + update_stat_array(stats.active, 1, params.stat_types); + update_stat_array(stats.active_bytes, block->size, params.stat_types); + + return block; + } + + void free(Block* block) + { + std::lock_guard lock(mutex); + + block->allocated = false; + + c10::reportMemoryUsageToProfiler( + block, -block->size, c10::Device(c10::DeviceType::CUDA, block->device)); + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*(block->pool)))] = true; + update_stat_array(stats.allocation, -1, {stat_types}); + update_stat_array(stats.allocated_bytes, -block->size, {stat_types}); + + if (!block->stream_uses.empty()) { + insert_events(block); + } else { + free_block(block); + } + } + + void* getBaseAllocation(Block* block, size_t* outSize) { + std::lock_guard lock(mutex); + while (block->prev) { + block = block->prev; + } + void *basePtr = block->ptr; + if (outSize) { + size_t size = 0; + while (block) { + size += block->size; + block = block->next; + } + *outSize = size; + } + return basePtr; + } + + void recordStream(Block* block, cuda::CUDAStream stream) { + std::lock_guard lock(mutex); + if (stream.stream() == block->stream) { + // ignore uses on the allocation stream, since those don't require any 
+ // special synchronization + return; + } + block->stream_uses.insert(stream); + } + + /** set memory fraction to limit maximum allocated memory **/ + void setMemoryFraction(double fraction) { + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + allowed_memory_maximum = static_cast(fraction * device_total); + set_fraction = true; + } + + /** returns cached blocks to the system allocator **/ + void emptyCache() { + std::lock_guard lock(mutex); + free_cached_blocks(); + } + + /* CoGNN: allocate shared GPU memory */ + void allocateSharedCache() { + std::lock_guard lock(mutex); + cudaError_t err = cudaMalloc(&COGNN_shared_ptr, SIZE_SHARED_CACHE); + if (err != cudaSuccess) { + perror("allocate_shared_cache"); + exit(EXIT_FAILURE); + } + } + + /* CoGNN: send shared GPU memory */ + void sendSharedCache() { + std::lock_guard lock(mutex); + cudaIpcMemHandle_t shared_cache_handle; + + // Pack CUDA pointer + cudaError_t err = cudaIpcGetMemHandle(&shared_cache_handle, COGNN_shared_ptr); + if (err != cudaSuccess) { + perror("pack_shared_cache"); + exit(EXIT_FAILURE); + } + + // Accept connection + int server_fd, conn_fd, valread; + int opt = 1; + struct sockaddr_in address; + int addrlen = sizeof(address); + if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) { + perror("socket failed"); + exit(EXIT_FAILURE); + } + if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt))) { + perror("setsockopt"); + exit(EXIT_FAILURE); + } + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons( PORT ); + if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) { + perror("bind failed"); + exit(EXIT_FAILURE); + } + if (listen(server_fd, 1) < 0) { + perror("listen"); + exit(EXIT_FAILURE); + } + if ((conn_fd = accept(server_fd, (struct sockaddr *)&address, (socklen_t*)&addrlen)) < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + // Send the packed pointer + write(conn_fd, (void*)(&shared_cache_handle), sizeof(cudaIpcMemHandle_t)); + + close(conn_fd); + close(server_fd); + } + + /* CoGNN: recv shared GPU memory */ + void recvSharedCache() { + std::lock_guard lock(mutex); + cudaIpcMemHandle_t shared_cache_handle; + + // Connect + int conn_fd = 0; + struct sockaddr_in serv_addr; + if ((conn_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + printf("\n Socket creation error \n"); + exit(EXIT_FAILURE); + } + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(PORT); + if(inet_pton(AF_INET, "127.0.0.1", &serv_addr.sin_addr) <= 0) { + printf("\nInvalid address/ Address not supported \n"); + exit(EXIT_FAILURE); + } + if (connect(conn_fd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { + printf("\nConnection Failed \n"); + exit(EXIT_FAILURE); + } + + // Receive packed pointer + read(conn_fd, (void*)(&shared_cache_handle), sizeof(cudaIpcMemHandle_t)); + + // Extract the pointer + cudaError_t err = cudaIpcOpenMemHandle(&COGNN_shared_ptr, shared_cache_handle, cudaIpcMemLazyEnablePeerAccess); + if (err != cudaSuccess) { + perror("extract_shared_cache"); + exit(EXIT_FAILURE); + } + + close(conn_fd); + } + + /* CoGNN: insert shared GPU memory to large block pool */ + void insertSharedCache(size_t size, size_t offset) { + std::lock_guard lock(mutex); + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + Block* block = new Block( + device, + cuda::getCurrentCUDAStream(device), + size, + &large_blocks, + static_cast(COGNN_shared_ptr) + offset); + // allocated_size += 
size; + large_blocks.insert(block); + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*(block->pool)))] = true; + update_stat_array(stats.reserved_bytes, size, stat_types); + return; + } + + /* CoGNN: clear shared GPU memory */ + void clearSharedCache() { + std::lock_guard lock(mutex); + + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + cudaStream_t stream = cuda::getCurrentCUDAStream(device); + + std::cout << "Begin Clear" << std::endl; + auto it = large_blocks.begin(); + while (it != large_blocks.end()) { + Block* block = *it; + if (block->stream == stream && !block->prev && !block->next) { + std::cout << "Clear" << ", " << block->ptr << ", " + << block->size << ", " << block->allocated << std::endl; + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*(block->pool)))] = true; + update_stat_array(stats.reserved_bytes, -block->size, stat_types); + + auto cur = it; + ++it; + large_blocks.erase(cur); + delete block; + } + else { + ++it; + } + } + std::cout << "End Clear" << std::endl; + // allocated_size = 0; + } + + /** Retrieves info (total size + largest block) of the memory cache **/ + void cacheInfo(size_t* total, size_t* largest) + { + std::lock_guard lock(mutex); + if (*largest == 0) { // make an initial guess if a zero *largest is passed in + size_t tmp_bytes; + cudaMemGetInfo(largest, // Use free memory as an optimistic initial guess of *largest + &tmp_bytes); + } + cache_info_aux(large_blocks, total, largest); + cache_info_aux(small_blocks, total, largest); + } + + /** Returns a copy of the memory allocator stats **/ + DeviceStats getStats() { + std::lock_guard lock(mutex); + return stats; + } + + /** Resets the historical accumulation stats for the device **/ + void resetAccumulatedStats() { + std::lock_guard lock(mutex); + + for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { + reset_accumulated_stat(stats.allocation[statType]); + reset_accumulated_stat(stats.segment[statType]); + reset_accumulated_stat(stats.active[statType]); + reset_accumulated_stat(stats.inactive_split[statType]); + reset_accumulated_stat(stats.allocated_bytes[statType]); + reset_accumulated_stat(stats.reserved_bytes[statType]); + reset_accumulated_stat(stats.active_bytes[statType]); + reset_accumulated_stat(stats.inactive_split_bytes[statType]); + } + + stats.num_alloc_retries = 0; + stats.num_ooms = 0; + } + + /** Resets the historical peak stats for the device **/ + void resetPeakStats() { + std::lock_guard lock(mutex); + + for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { + reset_peak_stat(stats.allocation[statType]); + reset_peak_stat(stats.segment[statType]); + reset_peak_stat(stats.active[statType]); + reset_peak_stat(stats.inactive_split[statType]); + reset_peak_stat(stats.allocated_bytes[statType]); + reset_peak_stat(stats.reserved_bytes[statType]); + reset_peak_stat(stats.active_bytes[statType]); + reset_peak_stat(stats.inactive_split_bytes[statType]); + } + } + + /** Dump a complete snapshot of the memory held by the allocator. Potentially VERY expensive. 
**/ + std::vector snapshot() const { + std::lock_guard lock(mutex); + + std::vector result; + const auto all_blocks = get_all_blocks(); + + for (const Block* const head_block : all_blocks) { + if (head_block->prev != nullptr) { + continue; + } + result.emplace_back(); + SegmentInfo& segment_info = result.back(); + segment_info.device = head_block->device; + segment_info.address = reinterpret_cast(head_block->ptr); + segment_info.is_large = (head_block->pool == &large_blocks); + + const Block* block = head_block; + while (block != nullptr) { + segment_info.blocks.emplace_back(); + BlockInfo& block_info = segment_info.blocks.back(); + + block_info.size = block->size; + block_info.allocated = block->allocated; + block_info.active = block->allocated || (block->event_count > 0); + + segment_info.total_size += block_info.size; + if (block_info.allocated) { + segment_info.allocated_size += block_info.size; + } + if (block_info.active) { + segment_info.active_size += block_info.size; + } + + block = block->next; + } + } + + std::sort(result.begin(), result.end(), [](const SegmentInfo& a, const SegmentInfo& b) { + return a.address < b.address; + }); + + return result; + } + + static size_t round_size(size_t size) { + if (size < kMinBlockSize) { + return kMinBlockSize; + } else { + return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + } + } + + private: + + // All private methods do not acquire the allocator mutex. + + std::vector get_all_blocks() const { + std::vector blocks; + blocks.insert(blocks.end(), small_blocks.begin(), small_blocks.end()); + blocks.insert(blocks.end(), large_blocks.begin(), large_blocks.end()); + blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end()); + return blocks; + } + + /** moves a block into a pool of cached free blocks */ + void free_block(Block* block) + { + TORCH_INTERNAL_ASSERT(!block->allocated && block->event_count == 0); + + size_t original_block_size = block->size; + + auto& pool = *block->pool; + int64_t net_change_inactive_split_blocks = 0; + int64_t net_change_inactive_split_size = 0; + + const std::array merge_candidates = {block->prev, block->next}; + for (Block* merge_candidate : merge_candidates) { + const int64_t subsumed_size = try_merge_blocks(block, merge_candidate, pool); + if (subsumed_size > 0) { + net_change_inactive_split_blocks -= 1; + net_change_inactive_split_size -= subsumed_size; + } + } + + active_blocks.erase(block); + pool.insert(block); + + if (block->is_split()) { + net_change_inactive_split_blocks += 1; + net_change_inactive_split_size += block->size; + } + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*(block->pool)))] = true; + update_stat_array(stats.inactive_split, net_change_inactive_split_blocks, stat_types); + update_stat_array(stats.inactive_split_bytes, net_change_inactive_split_size, stat_types); + update_stat_array(stats.active, -1, stat_types); + update_stat_array(stats.active_bytes, -original_block_size, stat_types); + } + + /** combine previously split blocks. returns the size of the subsumed block, or 0 on failure. 
*/ + size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) + { + if (!src || src->allocated || src->event_count > 0) { + return 0; + } + + AT_ASSERT(dst->is_split() && src->is_split()); + + if (dst->prev == src) { + dst->ptr = src->ptr; + dst->prev = src->prev; + if (dst->prev) { + dst->prev->next = dst; + } + } else { + dst->next = src->next; + if (dst->next) { + dst->next->prev = dst; + } + } + + const size_t subsumed_size = src->size; + dst->size += subsumed_size; + pool.erase(src); + delete src; + + return subsumed_size; + } + + BlockPool& get_pool(size_t size) { + if (size <= kSmallSize) { + return small_blocks; + } else { + return large_blocks; + } + } + + StatType get_stat_type_for_pool(const BlockPool& pool) { + if (&pool == &small_blocks) { + return StatType::SMALL_POOL; + } else if (&pool == &large_blocks) { + return StatType::LARGE_POOL; + } else { + AT_ERROR("get_stat_type_for_pool: invalid pool"); + } + } + + bool should_split(const Block* block, size_t size) { + size_t remaining = block->size - size; + if (block->pool == &small_blocks) { + return remaining >= kMinBlockSize; + } else if (block->pool == &large_blocks) { + return remaining > kSmallSize; + } else { + AT_ERROR("should_split: invalid pool"); + } + } + + static size_t get_allocation_size(size_t size) { + if (size <= kSmallSize) { + return kSmallBuffer; + } else if (size < kMinLargeAlloc) { + return kLargeBuffer; + } else { + return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + } + } + + bool get_free_block(AllocParams& p) { + BlockPool& pool = *p.pool; + auto it = pool.lower_bound(&p.search_key); + if (it == pool.end() || (*it)->stream != p.stream()) + return false; + p.block = *it; + pool.erase(it); + return true; + } + + bool trigger_free_memory_callbacks(AllocParams& p) { + bool freed_memory = false; + for (const auto& name : FreeCudaMemoryCallbacksRegistry()->Keys()) { + freed_memory |= + FreeCudaMemoryCallbacksRegistry()->Create(name)->Execute(); + } + return freed_memory; + } + + bool alloc_block(AllocParams& p, bool isRetry) { + // Defensively checks for preexisting CUDA error state. + C10_CUDA_CHECK(cudaGetLastError()); + + size_t size = p.alloc_size; + void* ptr; + + if (isRetry) { + stats.num_alloc_retries += 1; + } + + if (set_fraction && total_allocated_memory + size > allowed_memory_maximum) { + p.err = cudaErrorMemoryAllocation; + return false; + } else { + p.err = cudaMalloc(&ptr, size); + if (p.err != cudaSuccess) { + if (p.err == cudaErrorMemoryAllocation) { + // If this is the first attempt (!isRetry), we can forgive and clear CUDA's + // internal error state. + // If this is the second attempt (isRetry), malloc's TORCH_CHECK_WITH will take + // over to throw a helpful exception. The user can choose to catch the exception, + // free some stuff in their script, and attempt their allocation again. + // In this case, we can also forgive and clear CUDA's internal error state. + cudaGetLastError(); + } else { + // If the error's unrelated to memory allocation, we should throw immediately. + C10_CUDA_CHECK(p.err); + } + return false; + } + } + + total_allocated_memory += size; + p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr); + update_stat_array(stats.segment, 1, p.stat_types); + update_stat_array(stats.reserved_bytes, size, p.stat_types); + + // p.block came from new, not cudaMalloc. It should not be nullptr here. 
+ TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); + return true; + } + + bool free_cached_blocks() + { + // First ensure that all blocks that can't currently be allocated due to + // outstanding events are returned to the pool. + synchronize_and_free_events(); + + // Free all non-split cached blocks + free_blocks(large_blocks); + free_blocks(small_blocks); + return true; + } + + void free_blocks(BlockPool& blocks) + { + // Frees all non-split blocks + auto it = blocks.begin(); + while (it != blocks.end()) { + Block* block = *it; + if (!block->prev && !block->next) { + C10_CUDA_CHECK(cudaFree((void*)block->ptr)); + total_allocated_memory -= block->size; + + StatTypes stat_types; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(get_stat_type_for_pool(*(block->pool)))] = true; + update_stat_array(stats.segment, -1, stat_types); + update_stat_array(stats.reserved_bytes, -block->size, stat_types); + + auto cur = it; + ++it; + blocks.erase(cur); + delete block; + } else { + ++it; + } + } + } + + cudaEvent_t create_event_internal() { + cudaEvent_t event; + C10_CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + return event; + } + + void free_event_internal(cudaEvent_t event) { + C10_CUDA_CHECK(cudaEventDestroy(event)); + } + + void synchronize_and_free_events() { + // Synchronize on outstanding events and then free associated blocks. + + for (auto& e : cuda_events) { + cudaEvent_t event = e.first; + Block* block = e.second; + + C10_CUDA_CHECK(cudaEventSynchronize(event)); + free_event_internal(event); + + block->event_count--; + if (block->event_count == 0) { + free_block(block); + } + } + + cuda_events.clear(); + } + + void insert_events(Block* block) + { + int prev_device; + C10_CUDA_CHECK(cudaGetDevice(&prev_device)); + + stream_set streams(std::move(block->stream_uses)); + AT_ASSERT(block->stream_uses.empty()); + for (auto it = streams.begin(); it != streams.end(); ++it) { + C10_CUDA_CHECK(cudaSetDevice(it->device_index())); + + cudaEvent_t event = create_event_internal(); + C10_CUDA_CHECK(cudaEventRecord(event, it->stream())); + + block->event_count++; + cuda_events.emplace_back(event, block); + } + + C10_CUDA_CHECK(cudaSetDevice(prev_device)); + } + + void process_events() + { + // Process outstanding cudaEvents. Events that are completed are removed + // from the queue, and the 'event_count' for the corresponding allocation + // is decremented. Stops at the first event which has not been completed. + // Since events on different devices or streams may occur out of order, + // the processing of some events may be delayed. 
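The events retired by `process_events()` below are created in `insert_events()` when a block that was recorded as used on additional streams is freed; at the Python level that recording is done with `Tensor.record_stream()`, as the file-header comment explains. A small illustrative usage (stream and sizes are arbitrary):

```python
import torch

side_stream = torch.cuda.Stream()
x = torch.empty(1 << 20, device='cuda')   # allocated on the current stream

with torch.cuda.stream(side_stream):
    y = x * 2                              # x is consumed on a different stream

# Inform the caching allocator of the cross-stream use, so x's block is not
# handed out again until the event recorded on side_stream has completed.
x.record_stream(side_stream)
```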
+ while (!cuda_events.empty()) { + auto& e = cuda_events.front(); + cudaEvent_t event = e.first; + Block* block = e.second; + + cudaError_t err = cudaEventQuery(event); + if (err == cudaErrorNotReady) { + // ignore and clear the error if not ready + cudaGetLastError(); + break; + } else if (err != cudaSuccess) { + C10_CUDA_CHECK(err); + } + + free_event_internal(event); + + block->event_count--; + if (block->event_count == 0) { + free_block(block); + } + cuda_events.pop_front(); + } + } + + // Accumulates sizes of all memory blocks for given device in given pool + void cache_info_aux(BlockPool& blocks, size_t* total, size_t* largest) + { + for (const auto& block : blocks) { + size_t blocksize = block->size; + *total += blocksize; + if (blocksize > *largest) { + *largest = blocksize; + } + } + } +}; + +class THCCachingAllocator { + + private: + + std::mutex mutex; + + // allocated blocks by device pointer + std::unordered_map allocated_blocks; + + // lock around calls to cudaFree (to prevent deadlocks with NCCL) + mutable std::mutex cuda_free_mutex; + + void add_allocated_block(Block* block) { + std::lock_guard lock(mutex); + allocated_blocks[block->ptr] = block; + } + + public: + + std::vector> device_allocator; + + std::mutex* getCudaFreeMutex() const { + return &cuda_free_mutex; + } + + Block* get_allocated_block(void *ptr, bool remove=false) { + std::lock_guard lock(mutex); + auto it = allocated_blocks.find(ptr); + if (it == allocated_blocks.end()) { + return nullptr; + } + Block* block = it->second; + if (remove) { + allocated_blocks.erase(it); + } + return block; + } + + void init(int device_count) { + int size = device_allocator.size(); + if (size < device_count) { + device_allocator.resize(device_count); + for (int i = size; i < device_count; i++) { + device_allocator[i] = std::unique_ptr(new DeviceCachingAllocator()); + } + } + } + + /** allocates a block which is safe to use from the provided stream */ + void malloc(void** devPtr, int device, size_t size, cudaStream_t stream) { + TORCH_INTERNAL_ASSERT( + 0 <= device && device < device_allocator.size(), + "Allocator not initialized for device ", + device, + ": did you call init?"); + Block* block = device_allocator[device]->malloc(device, size, stream); + add_allocated_block(block); + *devPtr = (void*)block->ptr; + } + + void free(void* ptr) { + if (!ptr) { + return; + } + Block* block = get_allocated_block(ptr, true /* remove */); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } + device_allocator[block->device]->free(block); + } + + void setMemoryFraction(double fraction, int device) { + TORCH_INTERNAL_ASSERT( + 0 <= device && device < device_allocator.size(), + "Allocator not initialized for device ", + device, + ": did you call init?"); + TORCH_INTERNAL_ASSERT( + 0 <= fraction && fraction <= 1, + "invalid fraction:", + fraction, + ". 
Please set within (0, 1)."); + int activated_device; + cudaGetDevice (&activated_device); + if (activated_device != device) { + cudaSetDevice(device); + } + device_allocator[device]->setMemoryFraction(fraction); + } + + void emptyCache() { + int count = device_allocator.size(); + for (int i = 0; i < count; i++) + device_allocator[i]->emptyCache(); + } + + /* CoGNN: allocate shared GPU memory (devices) */ + void allocateSharedCache() { + int count = device_allocator.size(); + for (int i = 0; i < count; i++) + device_allocator[i]->allocateSharedCache(); + } + + /* CoGNN: send shared GPU memory (devices) */ + void sendSharedCache() { + int count = device_allocator.size(); + for (int i = 0; i < count; i++) + device_allocator[i]->sendSharedCache(); + } + + /* CoGNN: recv shared GPU memory (devices) */ + void recvSharedCache() { + int count = device_allocator.size(); + for (int i = 0; i < count; i++) + device_allocator[i]->recvSharedCache(); + } + + /* CoGNN: insert shared GPU memory to large block pool (devices) */ + void insertSharedCache(size_t size, size_t offset) { + int count = device_allocator.size(); + for (int i = 0; i < count; i++) + device_allocator[i]->insertSharedCache(size, offset); + } + + /* CoGNN: recv shared GPU memory (devices) */ + void clearSharedCache() { + int count = device_allocator.size(); + for (int i = 0; i < count; i++) + device_allocator[i]->clearSharedCache(); + } + + void* getBaseAllocation(void* ptr, size_t* outSize) + { + Block* block = get_allocated_block(ptr); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } + return device_allocator[block->device]->getBaseAllocation(block, outSize); + } + + void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) { + // Empty tensor's storage().data() might be a null ptr. As there is no + // blocks associated with those tensors, it is fine to do nothing here. + if (!ptr.get()) { + return; + } + + // If a tensor is not allocated by this instance, simply skip + // This usually happens when CUDA tensors are shared across processes, + // we have implemented reference counting based sharing mechanism to + // guarantee tensors won't be accidentally freed by one process while + // they are still being used in another + if (ptr.get_deleter() != &raw_delete) + return; + + Block* block = get_allocated_block(ptr.get()); + // block must not be null reaching here + TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found"); + device_allocator[block->device]->recordStream(block, stream); + } + + std::vector snapshot() { + std::vector result; + int count = device_allocator.size(); + for (int i = 0; i < count; i++) { + auto snap = device_allocator[i]->snapshot(); + result.insert(result.end(), snap.begin(), snap.end()); + } + + return result; + } +}; + +THCCachingAllocator caching_allocator; + +// Returns whether to force all allocations to bypass the caching allocator and +// go straight to cudaMalloc. This setting is useful when debugging GPU memory +// errors, since the caching allocator foils cuda-memcheck. 
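The CoGNN additions to `THCCachingAllocator` above (`allocateSharedCache`, `sendSharedCache`, `recvSharedCache`, `insertSharedCache`, `clearSharedCache`) back the `torch.cuda` calls used by `main.py`, `worker.py`, `worker_common.py` and `frontend_schedule.py` earlier in this patch. The sketch below condenses that handshake; it assumes the patched PyTorch from this repository and collapses the server and worker processes into two plain functions, omitting the per-stream contexts used in the real code.

```python
import torch

def server_side():
    # Frontend process: reserve the shared region (SIZE_SHARED_CACHE, 26 GiB) once and
    # publish its cudaIpcMemHandle_t over the local socket opened by sendSharedCache().
    torch.randn(1024, device='cuda')      # warm up CUDA first
    torch.cuda.allocate_shared_cache()
    torch.cuda.send_shared_cache()        # blocks until a worker connects

def worker_side(para_cache_size, comp_cache_size):
    # Worker process: map the same region, then insert a parameter segment and a
    # computation segment into the block pool at the given offsets.
    torch.randn(1024, device='cuda')
    torch.cuda.recv_shared_cache()
    torch.cuda.insert_shared_cache_for_computation(para_cache_size, 0)
    torch.cuda.insert_shared_cache_for_computation(comp_cache_size, para_cache_size)
    # ... training runs here, drawing allocations from the inserted segments ...
    torch.cuda.clear_shared_cache()       # drop the inserted segments from the pool
```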
+bool forceUncachedAllocator() { + static bool force_uncached = + getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr; + return force_uncached; +} + +static void uncached_delete(void* ptr) { + C10_CUDA_CHECK(cudaFree(ptr)); +} + +// NB: I decided not to fold this into THCCachingAllocator, because the latter +// has a lot more methods and it wasn't altogether clear that they should +// actually be publicly exposed +struct CudaCachingAllocator : public Allocator { + DataPtr allocate(size_t size) const override { + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + void* r = nullptr; + if (forceUncachedAllocator()) { + C10_CUDA_CHECK(cudaMalloc(&r, size)); + return {r, r, &uncached_delete, Device(DeviceType::CUDA, device)}; + } + if (size != 0) { + caching_allocator.malloc(&r, device, size, cuda::getCurrentCUDAStream(device)); + } + return {r, r, &raw_delete, Device(DeviceType::CUDA, device)}; + } + DeleterFnPtr raw_deleter() const override { + return &raw_delete; + } +}; + +CudaCachingAllocator device_allocator; + +Allocator* get(void) +{ + return &device_allocator; +} + +void init(int device_count) { + caching_allocator.init(device_count); +} + +void setMemoryFraction(double fraction, int device) { + caching_allocator.setMemoryFraction(fraction, device); +} + +void emptyCache(void) { + caching_allocator.emptyCache(); +} + +// CoGNN +void allocateSharedCache(void) { + caching_allocator.allocateSharedCache(); +} + +// CoGNN +void sendSharedCache(void) { + caching_allocator.sendSharedCache(); +} + +// CoGNN +void recvSharedCache(void) { + caching_allocator.recvSharedCache(); +} + +// CoGNN +void insertSharedCache(size_t size, size_t offset) { + caching_allocator.insertSharedCache(size, offset); +} + +// CoGNN +void clearSharedCache(void) { + caching_allocator.clearSharedCache(); +} + +void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) { + caching_allocator.device_allocator[dev_id]->cacheInfo(cachedAndFree, largestBlock); +} + +void* getBaseAllocation(void *ptr, size_t *size) +{ + return caching_allocator.getBaseAllocation(ptr, size); +} + +void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) +{ + caching_allocator.recordStream(ptr, stream); +} + +std::mutex* getFreeMutex() +{ + return caching_allocator.getCudaFreeMutex(); +} + +static inline void assertValidDevice(int device) { + int device_num = caching_allocator.device_allocator.size(); + TORCH_CHECK(0 <= device && device < device_num, "Invalid device argument."); +} + +DeviceStats getDeviceStats(int device) { + assertValidDevice(device); + return caching_allocator.device_allocator[device]->getStats(); +} + +void resetAccumulatedStats(int device) { + assertValidDevice(device); + caching_allocator.device_allocator[device]->resetAccumulatedStats(); +} + +void resetPeakStats(int device) { + assertValidDevice(device); + caching_allocator.device_allocator[device]->resetPeakStats(); +} + +std::vector snapshot() { + return caching_allocator.snapshot(); +} + +// +// In CUDA IPC, sender sends a tensor to receiver, getIpcDevPtr +// is called by the receiving process to map the CUDA memory from the sending +// process into its own address space. +// +// CUDA IPC only allows sharing a big memory block associated with a cudaIpcMemHandle_t +// and it can be opened only **once** per context per process. There can be +// multiple types of storage in the same IPC mem block, so we must cache the +// device ptr to construct typed storage as it comes. 
+//
+// ipcMemHandle_to_devptr maps a cudaIpcMemHandle_t to a device pointer in the process
+// that can be used to access the memory block in the sender process.
+// It only saves a weak_ptr of the device pointer in the map; the shared_ptr
+// will be used to reconstruct all storages in this CudaMalloc allocation.
+// And it will be deleted in cudaIpcCloseMemHandle when its reference count is 0.
+//
+namespace {
+  std::mutex IpcMutex;
+  std::unordered_map<std::string, std::weak_ptr<void>> ipcMemHandle_to_devptr;
+}
+
+std::shared_ptr<void> getIpcDevPtr(std::string handle) {
+  std::lock_guard<std::mutex> lock(IpcMutex);
+
+  auto iter = ipcMemHandle_to_devptr.find(handle);
+  if (iter != ipcMemHandle_to_devptr.end()) {
+    auto devptr = iter->second.lock();
+    if (devptr) return devptr;
+  }
+  // This ipcMemHandle hasn't been opened, or has already expired; open it to
+  // enable IPC access to that mem block.
+  void *dev = nullptr;
+  auto ipc_handle = reinterpret_cast<const cudaIpcMemHandle_t*>(handle.c_str());
+  C10_CUDA_CHECK(cudaIpcOpenMemHandle(&dev, *ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+  // devPtr has to be deleted on the same device it was created on.
+  int curr_device;
+  C10_CUDA_CHECK(cudaGetDevice(&curr_device));
+  auto sp = std::shared_ptr<void>(
+      dev,
+      [handle, curr_device](void *ptr) {
+        cuda::CUDAGuard device_guard(curr_device);
+        std::lock_guard<std::mutex> deleter_lock(IpcMutex);
+        C10_CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
+        ipcMemHandle_to_devptr.erase(handle);});
+  std::weak_ptr<void> wp = sp;
+  // To eliminate an additional search, we can use insert().
+  // It doesn't overwrite when the key already exists (ptr expired).
+  // But in the deleter for sp we erased the entry,
+  // so this should be safe to do now.
+  ipcMemHandle_to_devptr.insert(iter, {handle, wp});
+
+  return sp;
+}
+
+void* raw_alloc(size_t nbytes) {
+  if (nbytes == 0) {
+    return nullptr;
+  }
+  int device;
+  C10_CUDA_CHECK(cudaGetDevice(&device));
+  void* r = nullptr;
+  caching_allocator.malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
+  return r;
+}
+
+void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
+  if (nbytes == 0) {
+    return nullptr;
+  }
+  int device;
+  C10_CUDA_CHECK(cudaGetDevice(&device));
+  void* r = nullptr;
+  caching_allocator.malloc(&r, device, nbytes, stream);
+  return r;
+}
+
+void raw_delete(void* ptr) {
+  caching_allocator.free(ptr);
+}
+
+} // namespace CUDACachingAllocator
+
+}} // namespace c10::cuda
diff --git a/software/plugin/CUDACachingAllocator.h b/software/plugin/CUDACachingAllocator.h
new file mode 100644
index 0000000..6f721de
--- /dev/null
+++ b/software/plugin/CUDACachingAllocator.h
@@ -0,0 +1,136 @@
+#ifndef THC_DEVICE_ALLOCATOR_INC
+#define THC_DEVICE_ALLOCATOR_INC
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace c10 {
+
+class C10_CUDA_API CUDAOutOfMemoryError : public c10::Error {
+  using Error::Error;
+};
+
+// Caching allocator will execute every registered callback if it is unable to
+// find a block inside the already allocated area.
+class C10_CUDA_API FreeMemoryCallback {
+ public:
+  virtual ~FreeMemoryCallback() {};
+  virtual bool Execute() = 0;
+};
+
+C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
+#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
+  C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__);
+
+namespace cuda {
+
+// TODO: Turn this into an honest to goodness class. I briefly attempted to do
+// this, but it was a bit irritating to figure out how to also correctly
+// apply pimpl pattern so I didn't have to leak any internal implementation
+// details in the header (CUDACachingAllocator could be made a pimpl, but
+// you also need to appropriately define a class which is a subclass
+// of Allocator. Not impossible, but required a bit more surgery than
+// I wanted to do at the time.)
+//
+// Why is this using a namespace rather than old-style THCCachingAllocator_
+// prefix? Mostly because it made the HIPify rules easier to write; _ is
+// not counted as a word boundary, so you would otherwise have to list each
+// of these functions.
+
+namespace CUDACachingAllocator {
+
+struct Stat {
+  int64_t current = 0;
+  int64_t peak = 0;
+  int64_t allocated = 0;
+  int64_t freed = 0;
+};
+
+enum struct StatType : uint64_t {
+  AGGREGATE = 0,
+  SMALL_POOL = 1,
+  LARGE_POOL = 2,
+  NUM_TYPES = 3 // remember to update this whenever a new stat type is added
+};
+
+typedef std::array<Stat, static_cast<size_t>(StatType::NUM_TYPES)> StatArray;
+
+// Struct containing memory allocator summary statistics for a device.
+struct DeviceStats {
+  // COUNT: allocations requested by client code
+  StatArray allocation;
+  // COUNT: number of allocated segments from cudaMalloc().
+  StatArray segment;
+  // COUNT: number of active memory blocks (allocated or used by stream)
+  StatArray active;
+  // COUNT: number of inactive, split memory blocks (unallocated but can't be released via cudaFree)
+  StatArray inactive_split;
+
+  // SUM: bytes requested by client code
+  StatArray allocated_bytes;
+  // SUM: bytes reserved by this memory allocator (both free and used)
+  StatArray reserved_bytes;
+  // SUM: bytes within active memory blocks
+  StatArray active_bytes;
+  // SUM: bytes within inactive, split memory blocks
+  StatArray inactive_split_bytes;
+
+  // COUNT: total number of failed calls to CUDA malloc necessitating cache flushes.
+  int64_t num_alloc_retries = 0;
+
+  // COUNT: total number of OOMs (i.e. failed calls to CUDA after cache flush)
+  int64_t num_ooms = 0;
+};
+
+// Struct containing info of an allocation block (i.e. a fractional part of a cudaMalloc).
+struct BlockInfo {
+  int64_t size = 0;
+  bool allocated = false;
+  bool active = false;
+};
+
+// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc).
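+// Each SegmentInfo returned by snapshot() carries the per-block breakdown below;
+// Module.cpp converts it into a list of dicts and memory.py exposes it as
+// torch.cuda.memory_snapshot().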
+struct SegmentInfo { + int64_t device = 0; + int64_t address = 0; + int64_t total_size = 0; + int64_t allocated_size = 0; + int64_t active_size = 0; + bool is_large = false; + std::vector blocks; +}; + +C10_CUDA_API void* raw_alloc(size_t nbytes); +C10_CUDA_API void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream); +C10_CUDA_API void raw_delete(void* ptr); + +C10_CUDA_API Allocator* get(); +C10_CUDA_API void init(int device_count); +C10_CUDA_API void setMemoryFraction(double fraction, int device); +C10_CUDA_API void emptyCache(); +C10_CUDA_API void allocateSharedCache(); // CoGNN +C10_CUDA_API void sendSharedCache(); // CoGNN +C10_CUDA_API void recvSharedCache(); // CoGNN +C10_CUDA_API void insertSharedCache(size_t size, size_t offset); // CoGNN +C10_CUDA_API void clearSharedCache(); // CoGNN +C10_CUDA_API void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock); +C10_CUDA_API void* getBaseAllocation(void *ptr, size_t *size); +C10_CUDA_API void recordStream(const DataPtr&, CUDAStream stream); +C10_CUDA_API DeviceStats getDeviceStats(int device); +C10_CUDA_API void resetAccumulatedStats(int device); +C10_CUDA_API void resetPeakStats(int device); +C10_CUDA_API std::vector snapshot(); + +C10_CUDA_API std::mutex* getFreeMutex(); + +C10_CUDA_API std::shared_ptr getIpcDevPtr(std::string handle); +} // namespace CUDACachingAllocator + +}} // namespace c10::cuda + +#endif diff --git a/software/plugin/Module.cpp b/software/plugin/Module.cpp new file mode 100644 index 0000000..e54fcf1 --- /dev/null +++ b/software/plugin/Module.cpp @@ -0,0 +1,670 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef USE_NCCL +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef WIN32 +#include +#endif + +using namespace torch; + +THCState *state = nullptr; +static bool in_bad_fork = false; // True for children forked after cuda init + +#ifndef WIN32 +// Called in the forked child if cuda has already been initialized +static void forked_child() { + in_bad_fork = true; + torch::utils::set_run_yet_variable_to_false(); + state = nullptr; +} +#endif + +// Should be called before the first cuda call. +// Note: This is distinct from initExtension because a stub cuda implementation +// has some working functions (e.g. device_count) but cannot fully initialize. 
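+// poison_fork() registers forked_child() via std::call_once + pthread_atfork(),
+// so a child forked after CUDA initialization sees in_bad_fork == true and
+// re-initialization is rejected at the Python level.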
+static void poison_fork() { +#ifndef WIN32 + static std::once_flag flag; + std::call_once(flag, []{ pthread_atfork(nullptr, nullptr, forked_child); }); +#endif +} + +//////////////////////////////////////////////////////////////////////////////// +// CUDA management methods +//////////////////////////////////////////////////////////////////////////////// + +void THCPModule_setDevice(int device) +{ + c10::cuda::set_device(static_cast(device)); +} + +PyObject * THCPModule_setDevice_wrap(PyObject *self, PyObject *arg) +{ + HANDLE_TH_ERRORS + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to setDevice"); + int64_t device = THPUtils_unpackLong(arg); + + torch::utils::cuda_lazy_init(); + THCPModule_setDevice(device); + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_getDevice_wrap(PyObject *self, PyObject *noargs) +{ + HANDLE_TH_ERRORS + torch::utils::cuda_lazy_init(); + auto device = static_cast(c10::cuda::current_device()); + return THPUtils_packInt32(device); + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_canDeviceAccessPeer_wrap(PyObject *self, PyObject *args) +{ + HANDLE_TH_ERRORS + PyObject* arg1 = nullptr; + PyObject* arg2 = nullptr; + if(!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) { + THPUtils_invalidArguments( + args, + nullptr, + "can_device_peer_access", + 1, + "(int device, int peer_device);"); + return nullptr; + } + THPUtils_assert(THPUtils_checkLong(arg1), "invalid argument to canDeviceAccessPeer"); + THPUtils_assert(THPUtils_checkLong(arg2), "invalid argument to canDeviceAccessPeer"); + int64_t device = THPUtils_unpackLong(arg1); + int64_t peer_device = THPUtils_unpackLong(arg2); + + torch::utils::cuda_lazy_init(); + auto can_access = at::cuda::canDeviceAccessPeer(device, peer_device); + return PyBool_FromLong(can_access); + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_getDeviceCount_wrap(PyObject *self, PyObject *noargs) +{ + HANDLE_TH_ERRORS + poison_fork(); + return THPUtils_packUInt64(at::cuda::device_count()); + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_getArchFlags(PyObject *self, PyObject *noargs) +{ + HANDLE_TH_ERRORS + poison_fork(); +#ifdef CUDA_ARCH_FLAGS + static const char* flags = C10_STRINGIZE(CUDA_ARCH_FLAGS); + return THPUtils_packString(flags); +#else + Py_RETURN_NONE; +#endif + END_HANDLE_TH_ERRORS +} + +static PyObject * THCPModule_isInBadFork(PyObject *self, PyObject *noargs) { + HANDLE_TH_ERRORS + return PyBool_FromLong(in_bad_fork); + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_getCurrentStream_wrap( + PyObject * /* unused */, PyObject *device_index) { + HANDLE_TH_ERRORS + THPUtils_assert( + THPUtils_checkLong(device_index), "invalid argument to getCurrentStream"); + int64_t device = THPUtils_unpackLong(device_index); + return PyLong_FromUnsignedLongLong( + at::cuda::getCurrentCUDAStream(device).pack()); + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_getDefaultStream_wrap( + PyObject * /* unused */, PyObject *device_index) { + HANDLE_TH_ERRORS + THPUtils_assert( + THPUtils_checkLong(device_index), "invalid argument to getDefaultStream"); + int64_t device = THPUtils_unpackLong(device_index); + return PyLong_FromUnsignedLongLong( + at::cuda::getDefaultCUDAStream(device).pack()); + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_setStream_wrap(PyObject *self, PyObject *obj) +{ + HANDLE_TH_ERRORS + THPUtils_assert(PyLong_Check(obj), "invalid stream"); + uint64_t bits = PyLong_AsUnsignedLongLong(obj); + if (bits == static_cast(-1) && PyErr_Occurred()) { + throw python_error(); + } + auto stream = 
at::cuda::CUDAStream::unpack(bits); + auto device = static_cast(c10::cuda::current_device()); + if (device != stream.device_index()) { + THCPModule_setDevice(stream.device_index()); + } + at::cuda::setCurrentCUDAStream(stream); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_getCompiledVersion(PyObject *self, PyObject *noargs) +{ + return THPUtils_packInt64((int64_t) CUDA_VERSION); +} + +PyObject * THCPModule_cudaHostAllocator(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + c10::Allocator* allocator = THCState_getCudaHostAllocator(state); + return PyLong_FromVoidPtr(allocator); + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_cudaCachingAllocator_raw_alloc(PyObject *_unused, PyObject *args){ + HANDLE_TH_ERRORS + PyObject* size_o = nullptr; + PyObject* stream_o = nullptr; + if(!PyArg_ParseTuple(args, "OO", &size_o, &stream_o)) { + THPUtils_invalidArguments( + args, + nullptr, + "caching_allocator_alloc", + 1, + "(ssize_t size, intptr_t stream);"); + return nullptr; + } + ssize_t size = PyLong_AsSsize_t(size_o); + cudaStream_t stream = static_cast(PyLong_AsVoidPtr(stream_o)); + void* mem = c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(size, stream); + return PyLong_FromVoidPtr(mem); + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_cudaCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj){ + HANDLE_TH_ERRORS + void* mem_ptr = PyLong_AsVoidPtr(obj); + c10::cuda::CUDACachingAllocator::raw_delete(mem_ptr); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_cudaSynchronize(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + c10::cuda::device_synchronize(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_cudaIPCCollect(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + torch::CudaIPCCollect(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_cudaSleep(PyObject *_unused, PyObject *cycles) +{ + HANDLE_TH_ERRORS + THPUtils_assert(THPUtils_checkLong(cycles), "torch.cuda._sleep(): expected 'int'"); + THC_sleep(LIBRARY_STATE THPUtils_unpackLong(cycles)); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +// We need to ensure that as long as a thread will NEVER loose the GIL as long as +// it holds the CUDA mutex. Otherwise another thread might be scheduled and try to +// e.g. allocate a new tensor which will cause a deadlock. It's enough to have a +// single global, because it can be only set once (cudaMutex is not recursive) +// by the thread that owns the mutex (obviously there can be only one such thread). +static PyGILState_STATE cudaMutexGILState; + +PyObject * THCPModule_cudaLockMutex(PyObject *module, PyObject *noargs) +{ + auto mutex = c10::cuda::CUDACachingAllocator::getFreeMutex(); + // This has to be a busy loop because we **absolutely need to** hold the GIL + // or it's a recipe for a deadlock otherwise (if we let other Python threads + // run while we have the cudaMutex, but not the GIL, they might try to e.g. + // free a CUDA tensor and acquire the cudaMutex without giving up the GIL, + // because it happens deep within THC). 
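+// Concretely, the loop below only drops the GIL for the 10us sleep between
+// try_lock() attempts; once the allocator mutex is acquired, PyGILState_Ensure()
+// records the GIL state that THCPModule_cudaUnlockMutex later releases.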
+ while (true) { + if (mutex->try_lock()) + break; + { + pybind11::gil_scoped_release no_gil; + std::this_thread::sleep_for(std::chrono::microseconds(10)); + } + } + + cudaMutexGILState = PyGILState_Ensure(); + Py_RETURN_NONE; +} + +PyObject * THCPModule_cudaUnlockMutex(PyObject *module, PyObject *noargs) +{ + auto mutex = c10::cuda::CUDACachingAllocator::getFreeMutex(); + PyGILState_Release(cudaMutexGILState); + mutex->unlock(); + Py_RETURN_NONE; +} + +PyObject * THCPModule_hasPrimaryContext(PyObject *_unused, PyObject *arg) +{ + HANDLE_TH_ERRORS + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to has_primary_context"); + int64_t device_index = static_cast(THPUtils_unpackLong(arg)); + if (at::detail::getCUDAHooks().hasPrimaryContext(device_index)) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_setMemoryFraction(PyObject *_unused, PyObject *args) +{ + HANDLE_TH_ERRORS + PyObject* fraction_o = nullptr; + PyObject* device_o = nullptr; + if(!PyArg_ParseTuple(args, "OO", &fraction_o, &device_o)) { + THPUtils_invalidArguments( + args, + nullptr, + "set_memory_fraction", + 1, + "(double fraction, int device);"); + return nullptr; + } + double fraction = PyFloat_AsDouble(fraction_o); + int64_t device = PyLong_AsLongLong(device_o); + + c10::cuda::CUDACachingAllocator::setMemoryFraction(fraction, device); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +PyObject * THCPModule_emptyCache(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + c10::cuda::CUDACachingAllocator::emptyCache(); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +// CoGNN +PyObject * THCPModule_allocateSharedCache(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + c10::cuda::CUDACachingAllocator::allocateSharedCache(); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +// CoGNN +PyObject * THCPModule_sendSharedCache(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + c10::cuda::CUDACachingAllocator::sendSharedCache(); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +// CoGNN +PyObject * THCPModule_recvSharedCache(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + c10::cuda::CUDACachingAllocator::recvSharedCache(); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +// CoGNN +PyObject * THCPModule_insertSharedCacheForComputation(PyObject *_unused, PyObject *args) +{ + HANDLE_TH_ERRORS + PyObject* size_o = nullptr; + PyObject* offset_o = nullptr; + + if(!PyArg_ParseTuple(args, "OO", &size_o, &offset_o)) { + THPUtils_invalidArguments( + args, + nullptr, + "insert_shared_cache_for_computation", + 1, + "(size_t size, size_t offset);"); + return nullptr; + } + size_t size = PyLong_AsSize_t(size_o); + size_t offset = PyLong_AsSize_t(offset_o); + + c10::cuda::CUDACachingAllocator::insertSharedCache(size, offset); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +// CoGNN +PyObject * THCPModule_clearSharedCache(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + c10::cuda::CUDACachingAllocator::clearSharedCache(); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +PyObject * THCPModule_memoryStats(PyObject *_unused, PyObject *arg) +{ + HANDLE_TH_ERRORS + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to memory_allocated"); + const int device = (int) THPUtils_unpackLong(arg); + + using c10::cuda::CUDACachingAllocator::StatType; + using c10::cuda::CUDACachingAllocator::Stat; + using c10::cuda::CUDACachingAllocator::StatArray; + using c10::cuda::CUDACachingAllocator::DeviceStats; + + const auto statToDict = [](const Stat& stat) { + 
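+    // Each Stat is exposed as a {"current", "peak", "allocated", "freed"} dict;
+    // memory.py later flattens the nested result into dotted keys such as
+    // "allocated_bytes.all.current".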
py::dict dict; + + dict["current"] = stat.current; + dict["peak"] = stat.peak; + dict["allocated"] = stat.allocated; + dict["freed"] = stat.freed; + return dict; + }; + + const auto statArrayToDict = [=](const StatArray& statArray) { + const std::array(StatType::NUM_TYPES)> statTypeNames = { + "all", "small_pool", "large_pool" + }; + py::dict dict; + for (size_t i = 0; i < statTypeNames.size(); ++i) { + dict[statTypeNames[i]] = statToDict(statArray[i]); + } + return dict; + }; + + const DeviceStats stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device); + + py::dict result; + result["num_alloc_retries"] = stats.num_alloc_retries; + result["num_ooms"] = stats.num_ooms; + result["allocation"] = statArrayToDict(stats.allocation); + result["segment"] = statArrayToDict(stats.segment); + result["active"] = statArrayToDict(stats.active); + result["inactive_split"] = statArrayToDict(stats.inactive_split); + result["allocated_bytes"] = statArrayToDict(stats.allocated_bytes); + result["reserved_bytes"] = statArrayToDict(stats.reserved_bytes); + result["active_bytes"] = statArrayToDict(stats.active_bytes); + result["inactive_split_bytes"] = statArrayToDict(stats.inactive_split_bytes); + + return result.release().ptr(); + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_resetAccumulatedMemoryStats(PyObject *_unused, PyObject *arg) +{ + HANDLE_TH_ERRORS + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to reset_accumulated_memory_stats"); + const int device = (int) THPUtils_unpackLong(arg); + c10::cuda::CUDACachingAllocator::resetAccumulatedStats(device); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +PyObject * THCPModule_resetPeakMemoryStats(PyObject *_unused, PyObject *arg) +{ + HANDLE_TH_ERRORS + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to reset_peak_memory_stats"); + const int device = (int) THPUtils_unpackLong(arg); + c10::cuda::CUDACachingAllocator::resetPeakStats(device); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +PyObject * THCPModule_memorySnapshot(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS + + using c10::cuda::CUDACachingAllocator::SegmentInfo; + using c10::cuda::CUDACachingAllocator::BlockInfo; + + const auto segmentInfoToDict = [](const SegmentInfo& segmentInfo) { + py::dict segmentDict; + segmentDict["device"] = segmentInfo.device; + segmentDict["address"] = segmentInfo.address; + segmentDict["total_size"] = segmentInfo.total_size; + segmentDict["allocated_size"] = segmentInfo.allocated_size; + segmentDict["active_size"] = segmentInfo.active_size; + segmentDict["segment_type"] = (segmentInfo.is_large ? "large" : "small"); + + py::list blocks; + for (const auto& blockInfo : segmentInfo.blocks) { + py::dict blockDict; + blockDict["size"] = blockInfo.size; + blockDict["state"] = (blockInfo.allocated ? "active_allocated" : (blockInfo.active ? 
"active_pending_free" : "inactive")); + blocks.append(blockDict); + } + segmentDict["blocks"] = blocks; + + return segmentDict; + }; + + const std::vector& snapshot = c10::cuda::CUDACachingAllocator::snapshot(); + py::list result; + + for (const auto& segmentInfo : snapshot) { + result.append(segmentInfoToDict(segmentInfo)); + } + + return result.release().ptr(); + END_HANDLE_TH_ERRORS +} + +//////////////////////////////////////////////////////////////////////////////// +// Cuda module initialization +//////////////////////////////////////////////////////////////////////////////// + +static void registerCudaDeviceProperties(PyObject* module) { + // Add _cudaDevicePropertires class to torch._C + auto m = py::handle(module).cast(); + py::class_(m, "_CudaDeviceProperties") + .def_readonly("name", &cudaDeviceProp::name) + .def_readonly("major", &cudaDeviceProp::major) + .def_readonly("minor", &cudaDeviceProp::minor) + .def_readonly("is_multi_gpu_board", &cudaDeviceProp::isMultiGpuBoard) + .def_readonly("is_integrated", &cudaDeviceProp::integrated) + .def_readonly("multi_processor_count", &cudaDeviceProp::multiProcessorCount) + .def_readonly("total_memory", &cudaDeviceProp::totalGlobalMem) + .def("__repr__", [](const cudaDeviceProp &prop) { + std::ostringstream stream; + stream << "_CudaDeviceProperties(name='" << prop.name << "', major=" << prop.major + << ", minor=" << prop.minor << ", total_memory=" << prop.totalGlobalMem / (1024 * 1024) + << "MB, multi_processor_count=" << prop.multiProcessorCount << ")"; + return stream.str(); + }); +} + +static void bindGetDeviceProperties(PyObject* module) { + // Add method to torch.cuda + auto m = py::handle(module).cast(); + m.def("_get_device_properties", [](int device) -> cudaDeviceProp * { + return at::cuda::getDeviceProperties(device); + }, py::return_value_policy::reference); +} + +// Callback for python part. Used for additional initialization of python classes +static PyObject * THCPModule_initExtension(PyObject *self, PyObject *noargs) +{ +#if C10_ASAN_ENABLED + TORCH_WARN( + "torch.cuda: your pytorch binary has address sanitizer (asan) built in, " + "asan is currently not compatible with torch.cuda module, " + "you might get unexpected behavior (eg. out of memory, crash, etc.), " + "please rebuild pytorch without asan if you need to use this module"); +#endif + HANDLE_TH_ERRORS + TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level + poison_fork(); + state = at::globalContext().lazyInitCUDA(); + + auto m = THPObjectPtr(PyImport_ImportModule("torch.cuda")); + if (!m) throw python_error(); + + // Register Storage Python objects with DynamicTypes.cpp + THCPDoubleStorage_postInit(m); + THCPFloatStorage_postInit(m); + THCPHalfStorage_postInit(m); + THCPLongStorage_postInit(m); + THCPIntStorage_postInit(m); + THCPShortStorage_postInit(m); + THCPCharStorage_postInit(m); + THCPByteStorage_postInit(m); + THCPBoolStorage_postInit(m); + THCPBFloat16Storage_postInit(m); + THCPComplexDoubleStorage_postInit(m); + THCPComplexFloatStorage_postInit(m); + + bool has_half = true; + + auto set_module_attr = [&](const char* name, PyObject* v) { + // PyObject_SetAttrString doesn't steal reference. So no need to incref. + if (PyObject_SetAttrString(m, name, v) < 0) { + throw python_error(); + } + }; + + set_module_attr("has_magma", at::hasMAGMA() ? Py_True : Py_False); + set_module_attr("has_half", has_half ? 
Py_True : Py_False); + + auto _state_cdata = THPObjectPtr(PyLong_FromVoidPtr(state)); + if (!_state_cdata) throw python_error(); + set_module_attr("_state_cdata", _state_cdata.get()); + + auto num_gpus = c10::cuda::device_count(); + auto default_cuda_generators = PyTuple_New(static_cast(num_gpus)); + for(int i = 0; i < num_gpus; i++) { + auto gen = at::cuda::detail::getDefaultCUDAGenerator(i); + auto cast_gen = (THPGenerator*)THPGenerator_initDefaultGenerator(gen); + // This reference is meant to be given away, so no need to incref here. + PyTuple_SetItem(default_cuda_generators, i, (PyObject*)cast_gen); + } + set_module_attr("default_generators", default_cuda_generators); + bindGetDeviceProperties(m); + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject * THCPModule_getCurrentBlasHandle_wrap(PyObject *self, PyObject *noargs) +{ + HANDLE_TH_ERRORS + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + return PyLong_FromVoidPtr(handle); + END_HANDLE_TH_ERRORS +} + +static struct PyMethodDef _THCPModule_methods[] = { + {"_cuda_init", THCPModule_initExtension, METH_NOARGS, nullptr}, + {"_cuda_setDevice", THCPModule_setDevice_wrap, METH_O, nullptr}, + {"_cuda_getDevice", THCPModule_getDevice_wrap, METH_NOARGS, nullptr}, + {"_cuda_getDeviceCount", THCPModule_getDeviceCount_wrap, METH_NOARGS, nullptr}, + {"_cuda_canDeviceAccessPeer", THCPModule_canDeviceAccessPeer_wrap, METH_VARARGS, nullptr}, + {"_cuda_getArchFlags", THCPModule_getArchFlags, METH_NOARGS, nullptr}, + {"_cuda_isInBadFork", THCPModule_isInBadFork, METH_NOARGS, nullptr}, + {"_cuda_getCurrentStream", + THCPModule_getCurrentStream_wrap, METH_O, nullptr}, + {"_cuda_getDefaultStream", + THCPModule_getDefaultStream_wrap, METH_O, nullptr}, + {"_cuda_getCurrentBlasHandle", THCPModule_getCurrentBlasHandle_wrap, METH_NOARGS, nullptr}, + {"_cuda_setStream", THCPModule_setStream_wrap, METH_O, nullptr}, + {"_cuda_getCompiledVersion", THCPModule_getCompiledVersion, METH_NOARGS, nullptr}, + {"_cuda_hasPrimaryContext", THCPModule_hasPrimaryContext, METH_O, nullptr}, + {"_cuda_setMemoryFraction", THCPModule_setMemoryFraction, METH_VARARGS, nullptr}, + {"_cuda_emptyCache", THCPModule_emptyCache, METH_NOARGS, nullptr}, + {"_cuda_allocateSharedCache", THCPModule_allocateSharedCache, METH_NOARGS, nullptr}, // CoGNN + {"_cuda_sendSharedCache", THCPModule_sendSharedCache, METH_NOARGS, nullptr}, // CoGNN + {"_cuda_recvSharedCache", THCPModule_recvSharedCache, METH_NOARGS, nullptr}, // CoGNN + {"_cuda_insertSharedCacheForComputation", THCPModule_insertSharedCacheForComputation, METH_VARARGS, nullptr}, // CoGNN + {"_cuda_clearSharedCache", THCPModule_clearSharedCache, METH_NOARGS, nullptr}, // CoGNN + {"_cuda_memoryStats", THCPModule_memoryStats, METH_O, nullptr}, + {"_cuda_resetAccumulatedMemoryStats", THCPModule_resetAccumulatedMemoryStats, METH_O, nullptr}, + {"_cuda_resetPeakMemoryStats", THCPModule_resetPeakMemoryStats, METH_O, nullptr}, + {"_cuda_memorySnapshot", THCPModule_memorySnapshot, METH_NOARGS, nullptr}, + {"_cuda_cudaHostAllocator", THCPModule_cudaHostAllocator, METH_NOARGS, nullptr}, + {"_cuda_cudaCachingAllocator_raw_alloc", THCPModule_cudaCachingAllocator_raw_alloc, METH_VARARGS, nullptr}, + {"_cuda_cudaCachingAllocator_raw_delete", THCPModule_cudaCachingAllocator_raw_delete, METH_O, nullptr}, + {"_cuda_synchronize", THCPModule_cudaSynchronize, METH_NOARGS, nullptr}, + {"_cuda_ipc_collect", THCPModule_cudaIPCCollect, METH_NOARGS, nullptr}, + {"_cuda_sleep", THCPModule_cudaSleep, METH_O, nullptr}, + {"_cuda_lock_mutex", 
THCPModule_cudaLockMutex, METH_NOARGS, nullptr}, + {"_cuda_unlock_mutex", THCPModule_cudaUnlockMutex, METH_NOARGS, nullptr}, +#ifdef USE_NCCL + {"_nccl_version", THCPModule_nccl_version, METH_NOARGS, nullptr}, + {"_nccl_unique_id", THCPModule_nccl_unique_id, METH_NOARGS, nullptr}, + {"_nccl_init_rank", THCPModule_nccl_init_rank, METH_VARARGS, nullptr}, + {"_nccl_reduce", THCPModule_nccl_reduce, METH_VARARGS, nullptr}, + {"_nccl_all_reduce", THCPModule_nccl_all_reduce, METH_VARARGS, nullptr}, + {"_nccl_broadcast", THCPModule_nccl_broadcast, METH_VARARGS, nullptr}, + {"_nccl_all_gather", THCPModule_nccl_all_gather, METH_VARARGS, nullptr}, + {"_nccl_reduce_scatter", THCPModule_nccl_reduce_scatter, METH_VARARGS, nullptr}, +#endif + {nullptr} +}; + +PyMethodDef* THCPModule_methods() { + return _THCPModule_methods; +} + +namespace torch { namespace cuda { + +namespace shared { + +void initCudartBindings(PyObject* module); +void initNvtxBindings(PyObject* module); +#if defined(USE_CUDNN) || defined(__HIP_PLATFORM_HCC__) +void initCudnnBindings(PyObject* module); +#endif + +} // namespace shared + +void initModule(PyObject *module) { + python::initCommMethods(module); + // As weird as it seems, this file is also compiled for ROCm, + // so this condition might not always be true... + shared::initCudartBindings(module); + shared::initNvtxBindings(module); +#if defined(USE_CUDNN) || defined(__HIP_PLATFORM_HCC__) + shared::initCudnnBindings(module); +#endif + registerCudaDeviceProperties(module); +} + +}} diff --git a/software/plugin/README.md b/software/plugin/README.md new file mode 100644 index 0000000..64aa7d9 --- /dev/null +++ b/software/plugin/README.md @@ -0,0 +1,19 @@ +# Plugin + +This folder contains modifications to the PyTorch code (PyTorch v1.8.1). The modified PyTorch is for `CoGNN` or `PipeSwitch` server. + +# PyTorch files + +- memory.py: $PYTORCH\_PATH/torch/cuda/memory.py + +- Module.cpp: $PYTORCH\_PATH/torch/csrc/cuda/Module.cpp + +- CUDACachingAllocator.cpp: $PYTORCH\_PATH/c10/cuda/CUDACachingAllocator.cpp + +- CUDACachingAllocator.h: $PYTORCH\_PATH/c10/cuda/CUDACachingAllocator.h + +# Implementation + +All the modified functions are labeled with a comment of "CoGNN" around them. + +The code is implemented for NVIDIA V100, which have 32 GB GPU memory. Thus, some related parameters for memory management are directly written in the code. If you use GPUs with different GPU memory size, you need to change these parameters. diff --git a/software/plugin/memory.py b/software/plugin/memory.py new file mode 100644 index 0000000..50e5f1e --- /dev/null +++ b/software/plugin/memory.py @@ -0,0 +1,557 @@ +import collections +import contextlib +import warnings +from typing import Any, Dict, Union + +import torch +from . import is_initialized, _get_device_index, _lazy_init +from torch.types import Device + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Performs a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. 
+ device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError('Invalid type for stream argument, must be ' + '`torch.cuda.Stream` or `int` representing a pointer ' + 'to a exisiting stream') + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Deletes memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def set_per_process_memory_fraction(fraction, device: Union[Device, int] = None) -> None: + r"""Set memory fraction for a process. + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError('Invalid type for fraction argument, must be `float`') + if fraction < 0 or fraction > 1: + raise ValueError('Invalid fraction value: {}. ' + 'Allowed range: 0~1'.format(fraction)) + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def empty_cache() -> None: + r"""Releases all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + +# CoGNN +def allocate_shared_cache() -> None: + if is_initialized(): + torch._C._cuda_allocateSharedCache() + +# CoGNN +def send_shared_cache() -> None: + if is_initialized(): + torch._C._cuda_sendSharedCache() + +# CoGNN +def recv_shared_cache() -> None: + if is_initialized(): + torch._C._cuda_recvSharedCache() + +# CoGNN +def insert_shared_cache_for_computation(size, offset) -> None: + if is_initialized(): + torch._C._cuda_insertSharedCacheForComputation(size, offset) + +# CoGNN +def clear_shared_cache() -> None: + if is_initialized(): + torch._C._cuda_clearSharedCache() + +def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Returns a dictionary of CUDA memory allocator statistics for a + given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." 
+ for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Returns the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Resets the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Resets the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Resets the starting point in tracking maximum GPU memory occupied by + tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Resets the starting point in tracking maximum GPU memory managed by the + caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Returns the current GPU memory occupied by tensors in bytes for a given + device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Returns the maximum GPU memory occupied by tensors in bytes for a given + device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Returns the current GPU memory managed by the caching allocator in bytes + for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Returns the maximum GPU memory managed by the caching allocator in bytes + for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + warnings.warn( + "torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved", + FutureWarning) + return memory_reserved(device=device) + + +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + warnings.warn( + "torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved", + FutureWarning) + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Returns a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot() + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Returns a human-readable printout of the current memory allocator + statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KB", "MB", "GB", "TB", "PB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return "{:7d} {}".format(sz, prefix) + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return "{:7d} {} ".format(cnt, prefix) + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append(" {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} ") + lines.append("=" * 75) + lines.append(" Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed ") + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = None, None, None, None + + for submetric_key, 
submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append(" {:<21} | {} | {} | {} | {} ".format( + submetric_name, + formatter(current, current_prefval), + formatter(peak, peak_prefval), + formatter(allocated, allocated_prefval), + formatter(freed, freed_prefval)), + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Returns a human-readable printout of the running processes + and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + + try: + import pynvml # type: ignore + except ModuleNotFoundError: + return("pynvml module not found, please install pynvml") + from pynvml import NVMLError_DriverNotLoaded + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return ("cuda driver can't be loaded, is cuda enabled?") + device = _get_device_index(device, optional=True) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + mem = p.usedGpuMemory / (1024 * 1024) + lines.append(f"process {p.pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) diff --git a/software/task/GAT.py b/software/task/GAT.py new file mode 100644 index 0000000..a6888de --- /dev/null +++ b/software/task/GAT.py @@ -0,0 +1,133 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch_geometric.data import Data +from torch_geometric.nn import GATConv +from torch.nn import Sequential, Linear +from task.common import * + +class GAT(nn.Module): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + dropout): + super(GAT, self).__init__() + self.layers = nn.ModuleList() + self.layers.append(GATConv(in_feats, 8, heads=8, dropout=0.6)) # input layer + for i in range(n_layers - 2): # hidden layer + self.layers.append(GATConv(8 * 8, 8, heads=8, dropout=0.6)) + self.layers.append(GATConv(8 * 8, n_classes, heads=1, concat=False, dropout=0.6)) + + def forward(self, features, edge_index): + x = features + for i, layer in enumerate(self.layers): + x = layer(x, edge_index) + return F.log_softmax(x, dim=-1) + + +def partition_model(model): + group_list = [[child] for child in model.children()] + return group_list + + +def get_comp_size(graph, para_shape_list): + computation_peak_bytes, computation_active_bytes = 0, 0 + + num_nodes, num_edges, feat_length = graph.features.shape[0], graph.edge_index.shape[1], graph.features.shape[1] + + edge_index_size = graph.edge_index.nelement() * graph.edge_index.element_size() + feat_size = graph.features.nelement() * graph.features.element_size() + label_size = graph.labels.nelement() * 
graph.labels.element_size() + + graph_size = [edge_index_size, feat_size, label_size] # edge_index, features and labels + + for size in graph_size: + padded_size = calc_pad(size) + computation_active_bytes += padded_size + computation_peak_bytes += padded_size + + # add self loops from PyG + mask = graph.edge_index[0] == graph.edge_index[1] + add_loop_num_edges = num_nodes + num_edges - torch.count_nonzero(mask).item() + padded_loop_edge_index_size = calc_pad(edge_index_size * add_loop_num_edges / num_edges) + + for i in range(len(para_shape_list[0])): + shape = para_shape_list[0][i] + heads = 8 # hidden layers + if i == len(para_shape_list[0]) - 1: # output layer + back_lin_output_size = padded_lin_output_size + back_edge_alpha_size = padded_edge_alpha_size + heads = 1 + + # calculate lin and alpha output + padded_lin_output_size, padded_alpha_size = calc_pad(num_nodes * shape[0] * 4), calc_pad(num_nodes * heads * 4) + + computation_active_bytes += 2 * (padded_lin_output_size + padded_alpha_size) + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + computation_active_bytes -= padded_lin_output_size + + # add self loop to edge_index + computation_active_bytes += padded_loop_edge_index_size + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + + # propagate node features to edges + edge_feat_size, edge_alpha_size = add_loop_num_edges * shape[0] * 4, add_loop_num_edges * heads * 4 + padded_edge_feat_size, padded_edge_alpha_size = calc_pad(edge_feat_size), calc_pad(edge_alpha_size) + computation_active_bytes += padded_edge_feat_size + 2 * padded_edge_alpha_size + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + + # process alpha and generate message (leaky_relu) + computation_active_bytes += 2 * padded_edge_alpha_size + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + + # softmax details + padded_add_alpha_size = calc_pad((add_loop_num_edges - num_edges) * heads * 4) + + computation_active_bytes += 2 * padded_edge_alpha_size # scatter/index_select + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + computation_active_bytes -= padded_add_alpha_size + + computation_active_bytes += 2 * padded_edge_alpha_size # exp + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + computation_active_bytes -= padded_edge_alpha_size + + computation_active_bytes += padded_edge_alpha_size # scatter/index_select + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + computation_active_bytes -= (padded_edge_alpha_size - padded_add_alpha_size) + computation_active_bytes += padded_edge_alpha_size + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + computation_active_bytes -= padded_add_alpha_size + + computation_active_bytes += 2 * padded_edge_alpha_size # softmax return + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + computation_active_bytes -= 3 * padded_edge_alpha_size + + # aggregation and update + computation_active_bytes += padded_edge_feat_size + padded_lin_output_size + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + 
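+        # Same bookkeeping pattern as above: each transient buffer first grows
+        # computation_active_bytes, the running maximum is folded into
+        # computation_peak_bytes, and buffers freed before the next step are
+        # subtracted again.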
computation_active_bytes -= (padded_edge_feat_size + 2 * (padded_edge_alpha_size + padded_alpha_size)) + + # output gradient - backward + gradient_output_size = 2 * back_lin_output_size + 16 * back_edge_alpha_size + + computation_active_bytes += gradient_output_size + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + computation_active_bytes -= gradient_output_size # release gradients + + # multiply the threshold to ensure safe memory + computation_peak_bytes = calc_pad(int(computation_peak_bytes * comp_ratio)) + + return computation_peak_bytes diff --git a/software/task/GAT_training.py b/software/task/GAT_training.py new file mode 100644 index 0000000..464f9ee --- /dev/null +++ b/software/task/GAT_training.py @@ -0,0 +1,76 @@ +import time +import threading +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchsummary import summary +from torch_geometric.nn import GATConv + +from task.common import * +from task.GAT import * + +def import_func(model, graph): + def train(model, graph): + print(threading.currentThread().getName(), + 'GAT training {} >>>>>>>>>>'.format(graph.name), time.time()) + before_time = time.time() + model = model.cuda() + features = graph.features.cuda(non_blocking=True) + labels = graph.labels.cuda(non_blocking=True) + edge_index = graph.edge_index.to(gpu, non_blocking=True) + + optimizer = torch.optim.Adam(model.parameters(), lr=1e-2) + loss = None + print('before training') + for epoch in range(num_epochs): + model.train() + output = model(features, edge_index) + loss = F.nll_loss(output, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() +# print('GAT {} Epoch {:05d}'.format(graph.name, epoch)) +# training_time = time.time() - before_time +# return training_time * 1000 + return loss.item() + + return train + + +def import_model(data, num_layers): + # feats and classes of graph data + feat_dim, num_classes = 32, 2 + for idata, ifeat, iclasses in gnndatasets: + if idata == data: + feat_dim, num_classes = ifeat, iclasses + break + + # load graph data + name, edge_index, features, labels = load_graph_data(data) + graph = GraphSummary(name, edge_index, features, labels) + + model = GAT(feat_dim, hidden_dim, num_classes, num_layers, 0.5) + + # set full name to disguish + FULL_NAME = 'GAT_{}'.format(data) + set_fullname(model, FULL_NAME) + + return model, graph + + +def import_task(data, num_layers): + model, graph = import_model(data, num_layers) + func = import_func(model, graph) + group_list = partition_model(model) + shape_summary_list = [group_to_shape(group) for group in group_list] + return model, func, shape_summary_list + + +def import_parameters(data, num_layers): + model, graph = import_model(data, num_layers) + group_list = partition_model(model) + batch_list = [group_to_batch(group) for group in group_list] + para_shape_list = [group_to_para_shape(group) for group in group_list] + comp_total_bytes = get_comp_size(graph, para_shape_list) + return batch_list, comp_total_bytes diff --git a/software/task/GCN.py b/software/task/GCN.py new file mode 100644 index 0000000..6ed4768 --- /dev/null +++ b/software/task/GCN.py @@ -0,0 +1,77 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch_geometric.data import Data +from torch_geometric.nn import GCNConv +from task.common import * + + +class GCN(nn.Module): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + dropout): + super(GCN, self).__init__() 
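+        # n_layers GCNConv layers in total: in_feats -> n_hidden, then
+        # (n_layers - 2) hidden layers, then n_hidden -> n_classes.
+        # Note: the dropout argument is currently not applied in forward().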
+ self.layers = nn.ModuleList() + self.layers.append(GCNConv(in_feats, n_hidden)) # input layer + for i in range(n_layers - 2): # hidden layers + self.layers.append(GCNConv(n_hidden, n_hidden)) + self.layers.append(GCNConv(n_hidden, n_classes)) # output layer + + def forward(self, features, edge_index): + x = features + for i, layer in enumerate(self.layers): + x = F.relu(layer(x, edge_index)) + + return F.log_softmax(x, dim=-1) + + +def partition_model(model): + group_list = [[child] for child in model.children()] + return group_list + + +def get_comp_size(graph, para_shape_list): + computation_peak_bytes, computation_active_bytes = 0, 0 + + num_nodes, num_edges, feat_length = graph.features.shape[0], graph.edge_index.shape[1], graph.features.shape[1] + + edge_index_size = graph.edge_index.nelement() * graph.edge_index.element_size() + edge_weight_size = num_edges * 4 # float32 + feat_size = graph.features.nelement() * graph.features.element_size() + label_size = graph.labels.nelement() * graph.labels.element_size() + + graph_size = [edge_index_size, edge_weight_size, feat_size, label_size] # edge_index, features and labels + + for size in graph_size: + padded_size = calc_pad(size) + computation_active_bytes += padded_size + computation_peak_bytes += padded_size + + # add self loops from PyG + mask = graph.edge_index[0] == graph.edge_index[1] + add_loop_num_edges = num_nodes + num_edges - torch.count_nonzero(mask).item() + loop_edge_index_size = edge_index_size * add_loop_num_edges / num_edges + loop_edge_weight_size = edge_weight_size * add_loop_num_edges / num_edges + padded_loop_edge_index_size, padded_loop_edge_weight_size = calc_pad(loop_edge_index_size), calc_pad(loop_edge_weight_size) + + for shape in para_shape_list[0]: # assuming float32 type (output/features) + if len(shape) == 2: # 2-D weights + padded_output_size = calc_pad(num_nodes * shape[1] * 4) + computation_active_bytes += padded_output_size + padded_loop_edge_index_size + padded_loop_edge_weight_size # after GCN layer + + # propagate node features to edges + edge_feat_size = (feat_size * add_loop_num_edges * 2 / num_nodes) * shape[1] / feat_length + padded_edge_feat_size = calc_pad(edge_feat_size) + + computation_active_bytes += padded_edge_feat_size + padded_output_size # during propagation + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + computation_active_bytes -= padded_edge_feat_size # only save features + + # multiply the threshold to ensure safe memory + computation_peak_bytes = calc_pad(int(computation_peak_bytes * comp_ratio)) + + return computation_peak_bytes diff --git a/software/task/GCN_training.py b/software/task/GCN_training.py new file mode 100644 index 0000000..74b9fb9 --- /dev/null +++ b/software/task/GCN_training.py @@ -0,0 +1,76 @@ +import time +import threading +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchsummary import summary +from torch_geometric.nn import GCNConv + +from task.common import * +from task.GCN import * + +def import_func(model, graph): + def train(model, graph): + print(threading.currentThread().getName(), + 'GCN training {} >>>>>>>>>>'.format(graph.name), time.time()) + before_time = time.time() + model = model.cuda() + features = graph.features.cuda(non_blocking=True) + labels = graph.labels.cuda(non_blocking=True) + edge_index = graph.edge_index.to(gpu, non_blocking=True) + + optimizer = torch.optim.Adam(model.parameters(), lr=1e-2) + loss = None + for epoch in 
range(num_epochs): + model.train() + output = model(features, edge_index) + loss = F.nll_loss(output, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() +# print('GCN {} Epoch {:05d}'.format(graph.name, epoch)) +# training_time = time.time() - before_time +# return training_time * 1000 + + return loss.item() + + return train + + +def import_model(data, num_layers): + # feats and classes of graph data + feat_dim, num_classes = 32, 2 + for idata, ifeat, iclasses in gnndatasets: + if idata == data: + feat_dim, num_classes = ifeat, iclasses + break + + # load graph data + name, edge_index, features, labels = load_graph_data(data) + graph = GraphSummary(name, edge_index, features, labels) + + model = GCN(feat_dim, hidden_dim, num_classes, num_layers, 0.5) + + # set full name to distinguish models + FULL_NAME = 'GCN_{}'.format(data) + set_fullname(model, FULL_NAME) + + return model, graph + + +def import_task(data, num_layers): + model, graph = import_model(data, num_layers) + func = import_func(model, graph) + group_list = partition_model(model) + shape_summary_list = [group_to_shape(group) for group in group_list] + return model, func, shape_summary_list + + +def import_parameters(data, num_layers): + model, graph = import_model(data, num_layers) + group_list = partition_model(model) + batch_list = [group_to_batch(group) for group in group_list] + para_shape_list = [group_to_para_shape(group) for group in group_list] + comp_total_bytes = get_comp_size(graph, para_shape_list) + return batch_list, comp_total_bytes diff --git a/software/task/GIN.py b/software/task/GIN.py new file mode 100644 index 0000000..72da81e --- /dev/null +++ b/software/task/GIN.py @@ -0,0 +1,115 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch_geometric.data import Data +from torch.nn import Sequential, Linear, ReLU +from torch_geometric.nn import GINConv +from task.common import * + + +class GIN(nn.Module): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + dropout): + super(GIN, self).__init__() + self.n_layers = n_layers + self.convlayers = nn.ModuleList() + self.bnlayers = nn.ModuleList() + + # input layer + self.convlayers.append(GINConv(Sequential(Linear(in_feats, n_hidden), ReLU(), Linear(n_hidden, n_hidden)))) + self.bnlayers.append(torch.nn.BatchNorm1d(n_hidden)) + + for i in range(n_layers - 2): # hidden layers + self.convlayers.append(GINConv(Sequential(Linear(n_hidden, n_hidden), ReLU(), Linear(n_hidden, n_hidden)))) + self.bnlayers.append(torch.nn.BatchNorm1d(n_hidden)) + + # output layer + self.fc1 = Linear(n_hidden, n_hidden) + self.fc2 = Linear(n_hidden, n_classes) + + def forward(self, features, edge_index): + x = features + for i, layer in enumerate(self.convlayers): + x = F.relu(layer(x, edge_index)) + x = self.bnlayers[i](x) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + + return F.log_softmax(x, dim=-1) + + +def partition_model(model): + group_list = [] + group_list = [[child] for child in model.children()] + return group_list + + +def get_comp_size(graph, para_shape_list): + computation_peak_bytes, computation_active_bytes = 0, 0 + + num_nodes, num_edges, feat_length = graph.features.shape[0], graph.edge_index.shape[1], graph.features.shape[1] + + edge_index_size = graph.edge_index.nelement() * graph.edge_index.element_size() + feat_size = graph.features.nelement() * graph.features.element_size() + label_size = graph.labels.nelement() * graph.labels.element_size() + + # padded size + padded_edge_index_size, padded_feat_size,
padded_label_size = calc_pad(edge_index_size), calc_pad(feat_size), calc_pad(label_size) + padded_graph_size = [padded_edge_index_size, padded_feat_size, padded_label_size] # edge_index, features and labels + + for padded_size in padded_graph_size: + computation_active_bytes += padded_size + computation_peak_bytes += padded_size + + gin_counter = 0 + for shape in para_shape_list[0]: # GINConv Layers + if len(shape) == 2 and gin_counter % 4 == 0: # 2-D weights + padded_aggr_output_size = calc_pad(num_nodes * shape[1] * 4) # features after aggregation + + # propagate node features to edges + edge_feat_size = (feat_size * num_edges / num_nodes) * shape[1] / feat_length + padded_edge_feat_size = calc_pad(edge_feat_size) + computation_active_bytes += padded_edge_feat_size + padded_aggr_output_size + + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + computation_active_bytes -= padded_edge_feat_size # only save features + + # outputs - Sequential(Linear, Relu, Linear) + padded_lin_output_size = calc_pad(num_nodes * shape[0] * 4) + + if gin_counter == 0: + computation_active_bytes += padded_lin_output_size * 4 + else: computation_active_bytes += padded_lin_output_size * 3 + + # BatchNorm output + computation_active_bytes += padded_lin_output_size + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + + gin_counter += 1 + + # output - fc1 layer + padded_fc1_output_size = calc_pad(num_nodes * para_shape_list[2][0][0] * 4) + + # output - fc2 layer + padded_fc2_output_size = calc_pad(num_nodes * para_shape_list[3][0][0] * 4) + + computation_active_bytes += (padded_fc1_output_size + padded_fc2_output_size) * 2 + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + + # output gradient - backward + computation_active_bytes += padded_fc1_output_size + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + computation_active_bytes -= padded_fc1_output_size # release gradients + + # multiply the threshold to ensure safe memory + computation_peak_bytes = calc_pad(int(computation_peak_bytes * comp_ratio)) + + return computation_peak_bytes diff --git a/software/task/GIN_training.py b/software/task/GIN_training.py new file mode 100644 index 0000000..33c7f24 --- /dev/null +++ b/software/task/GIN_training.py @@ -0,0 +1,77 @@ +import time +import threading +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchsummary import summary +from torch_geometric.nn import GINConv + +from task.common import * +from task.GIN import * + +def import_func(model, graph): + def train(model, graph): + print(threading.currentThread().getName(), + 'GIN training {} >>>>>>>>>>'.format(graph.name), time.time()) + before_time = time.time() + model = model.cuda() + features = graph.features.cuda(non_blocking=True) + labels = graph.labels.cuda(non_blocking=True) + edge_index = graph.edge_index.to(gpu, non_blocking=True) + + loss_fcn = torch.nn.CrossEntropyLoss() + optimizer = torch.optim.Adam(model.parameters(), lr=1e-2) + loss = None + print('before training') + for epoch in range(num_epochs): + model.train() + output = model(features, edge_index) + loss = F.nll_loss(output, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + training_time = time.time() - before_time +# print('GIN {} Epoch {:05d}'.format(graph.name, epoch)) +# return training_time * 1000 
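+        # note: training_time above feeds the commented-out per-epoch timing path; the active path returns the final loss instead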
+ return loss.item() + + return train + + +def import_model(data, num_layers): + # feats and classes of graph data + feat_dim, num_classes = 32, 2 + for idata, ifeat, iclasses in gnndatasets: + if idata == data: + feat_dim, num_classes = ifeat, iclasses + break + + # load graph data + name, edge_index, features, labels = load_graph_data(data) + graph = GraphSummary(name, edge_index, features, labels) + + model = GIN(feat_dim, hidden_dim, num_classes, num_layers, 0.5) + + # set full name to distinguish models + FULL_NAME = 'GIN_{}'.format(data) + set_fullname(model, FULL_NAME) + + return model, graph + + +def import_task(data, num_layers): + model, graph = import_model(data, num_layers) + func = import_func(model, graph) + group_list = partition_model(model) + shape_summary_list = [group_to_shape(group) for group in group_list] + return model, func, shape_summary_list + + +def import_parameters(data, num_layers): + model, graph = import_model(data, num_layers) + group_list = partition_model(model) + batch_list = [group_to_batch(group) for group in group_list] + para_shape_list = [group_to_para_shape(group) for group in group_list] + comp_total_bytes = get_comp_size(graph, para_shape_list) + return batch_list, comp_total_bytes diff --git a/software/task/GraphSAGE.py b/software/task/GraphSAGE.py new file mode 100644 index 0000000..ac8c738 --- /dev/null +++ b/software/task/GraphSAGE.py @@ -0,0 +1,78 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch_geometric.data import Data +from torch_geometric.nn import SAGEConv +from task.common import * + + +class GraphSAGE(nn.Module): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + dropout): + super(GraphSAGE, self).__init__() + self.layers = nn.ModuleList() + self.layers.append(SAGEConv(in_feats, n_hidden)) # input layer + for i in range(n_layers - 2): # hidden layers + self.layers.append(SAGEConv(n_hidden, n_hidden)) + self.layers.append(SAGEConv(n_hidden, n_classes)) # output layer + + def forward(self, features, edge_index): + x = features + for i, layer in enumerate(self.layers): + x = F.relu(layer(x, edge_index)) + + return F.log_softmax(x, dim=-1) + + +def partition_model(model): + group_list = [[child] for child in model.children()] + return group_list + + +def get_comp_size(graph, para_shape_list): + computation_peak_bytes, computation_active_bytes = 0, 0 + + num_nodes, num_edges, feat_length = graph.features.shape[0], graph.edge_index.shape[1], graph.features.shape[1] + + edge_index_size = graph.edge_index.nelement() * graph.edge_index.element_size() + feat_size = graph.features.nelement() * graph.features.element_size() + label_size = graph.labels.nelement() * graph.labels.element_size() + + # padded size + padded_edge_index_size, padded_feat_size, padded_label_size = calc_pad(edge_index_size), calc_pad(feat_size), calc_pad(label_size) + padded_graph_size = [padded_edge_index_size, padded_feat_size, padded_label_size] # edge_index, features and labels + + for padded_size in padded_graph_size: + computation_active_bytes += padded_size + computation_peak_bytes += padded_size + + sage_counter = 0 + for shape in para_shape_list[0]: # assuming float32 type (output/features) + if len(shape) == 2 and sage_counter % 3 == 0: # 2-D weights + padded_aggr_output_size = calc_pad(num_nodes * shape[1] * 4) # features after aggregation + + # propagate node features to edges + edge_feat_size = (feat_size * num_edges / num_nodes) * shape[1] / feat_length + padded_edge_feat_size = calc_pad(edge_feat_size) +
computation_active_bytes += padded_edge_feat_size + padded_aggr_output_size + + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + computation_active_bytes -= padded_edge_feat_size # only save features + + # lin_r and lin_l (linear layers) output + padded_lin_output_size = calc_pad(num_nodes * shape[0] * 4) + + computation_active_bytes += padded_lin_output_size * 2 + if computation_active_bytes > computation_peak_bytes: + computation_peak_bytes = computation_active_bytes + sage_counter += 1 + + # multiply the threshold to ensure safe memory + computation_peak_bytes = calc_pad(int(computation_peak_bytes * comp_ratio)) + + return computation_peak_bytes diff --git a/software/task/GraphSAGE_training.py b/software/task/GraphSAGE_training.py new file mode 100644 index 0000000..a260b01 --- /dev/null +++ b/software/task/GraphSAGE_training.py @@ -0,0 +1,76 @@ +import time +import threading +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchsummary import summary +from torch_geometric.nn import SAGEConv + +from task.common import * +from task.GraphSAGE import * + + +def import_func(model, graph): + def train(model, graph): + print(threading.currentThread().getName(), + 'GraphSAGE training {} >>>>>>>>>>'.format(graph.name), time.time()) + before_time = time.time() + model = model.cuda() + features = graph.features.cuda(non_blocking=True) + labels = graph.labels.cuda(non_blocking=True) + edge_index = graph.edge_index.to(gpu, non_blocking=True) + + optimizer = torch.optim.Adam(model.parameters(), lr=1e-2) + loss = None + for epoch in range(num_epochs): + model.train() + output = model(features, edge_index) + loss = F.nll_loss(output, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() +# print('GraphSAGE {} Epoch {:05d}'.format(graph.name, epoch)) +# training_time = time.time() - before_time +# return training_time * 1000 + return loss.item() + + return train + + +def import_model(data, num_layers): + # feats and classes of graph data + feat_dim, num_classes = 32, 2 + for idata, ifeat, iclasses in gnndatasets: + if idata == data: + feat_dim, num_classes = ifeat, iclasses + break + + # load graph data + name, edge_index, features, labels = load_graph_data(data) + graph = GraphSummary(name, edge_index, features, labels) + + model = GraphSAGE(feat_dim, hidden_dim, num_classes, num_layers, 0.5) + + # set full name to distinguish models + FULL_NAME = 'GraphSAGE_{}'.format(data) + set_fullname(model, FULL_NAME) + + return model, graph + + +def import_task(data, num_layers): + model, graph = import_model(data, num_layers) + func = import_func(model, graph) + group_list = partition_model(model) + shape_summary_list = [group_to_shape(group) for group in group_list] + return model, func, shape_summary_list + + +def import_parameters(data, num_layers): + model, graph = import_model(data, num_layers) + group_list = partition_model(model) + batch_list = [group_to_batch(group) for group in group_list] + para_shape_list = [group_to_para_shape(group) for group in group_list] + comp_total_bytes = get_comp_size(graph, para_shape_list) + return batch_list, comp_total_bytes diff --git a/software/task/README.md b/software/task/README.md new file mode 100644 index 0000000..de650bd --- /dev/null +++ b/software/task/README.md @@ -0,0 +1,23 @@ +# Task + +This folder contains the code for the PyG-based models; the graph datasets are loaded through the DGL interface.
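+
+A minimal standalone sketch of the import functions in this folder (the `cora` dataset and the 2-layer setting are illustrative choices; a CUDA GPU is assumed, and the graph files must be reachable at `../data/graph/` relative to the working directory):
+
+```python
+from task.GCN_training import import_model, import_func
+
+# build a 2-layer GCN and load the 'cora' graph (tensors stay on the CPU at this point)
+model, graph = import_model('cora', num_layers=2)
+
+# import_func returns a closure that moves the model and graph to the GPU,
+# trains for num_epochs epochs, and returns the final loss
+train = import_func(model, graph)
+final_loss = train(model, graph)
+```
+
+The GAT, GIN, and GraphSAGE modules follow the same `import_model` / `import_func` / `import_task` pattern.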
+ +## Content + +- common.py: functions for loading model structure and graph datasets. + +- GCN + - GCN.py: classes for network design and memory estimation. + - GCN\_training.py: functions for importing training tasks and parameters. + +- GIN + - GIN.py: classes for network structure and memory estimation. + - GIN\_training.py: functions for importing training tasks and parameters. + +- GAT + - GAT.py: classes for network structure and memory estimation. + - GAT\_training.py: functions for importing training tasks and parameters. + +- SAGE + - GraphSAGE.py: classes for network structure and memory estimation. + - GraphSAGE\_training.py: functions for importing training tasks and parameters. diff --git a/software/task/common.py b/software/task/common.py new file mode 100644 index 0000000..9d87bfc --- /dev/null +++ b/software/task/common.py @@ -0,0 +1,157 @@ +import os +import dgl +import torch + +# gnn architecture configuration +gpu = 0 +num_epochs = 20 +hidden_dim = 64 + +# gnn threshold +comp_ratio = 1.15 + +gnndatasets = [ + ('citeseer', 3703, 6), # small datasets (< 64MB) + ('cora', 1433, 7), + ('pubmed', 500, 3), + ('ppi', 50, 121), + ('PROTEINS_full', 29, 2), + ('artist', 100, 12), + ('soc-BlogCatalog', 128, 39), + ('ddi', 512, 32), + + ('OVCAR-8H', 66, 2), # medium datasets (64MB ~ 1GB) + ('Yeast', 74, 2), + ('DD', 89, 2), + ('SW-620H', 66, 2), + ('com-amazon', 96, 22), + ('amazon0601', 96, 22), + ('arxiv', 128, 40), + ('collab', 128, 32), + ('ppa', 58, 16), + + ('reddit.dgl', 602, 50), # large datasets (> 1GB) + ('products', 100, 47), + ('protein', 8, 2), + ('citation', 1433, 7), + ('TWITTER-Real-Graph-Partial', 1323, 2) +] + + +class GraphSummary(): # graph data summary + def __init__(self, name, edge_index, features, labels): + self.name = name + self.edge_index = edge_index + self.features = features + self.labels = labels + + +def load_graph_data(data): + g = dgl.load_graphs("../data/graph/{}.graph".format(data))[0][0] + edge_index = torch.stack((g.edges()[0], g.edges()[1]), 0).long() + features = g.ndata['feat'] + labels = g.ndata['label'] + + return data, edge_index, features, labels + +## get the shapes of tensors and parameters +kMinBlockSize = 512 + +def calc_pad(tsize): + return kMinBlockSize * ((tsize + kMinBlockSize - 1) // kMinBlockSize) + +def set_fullname(mod, fullname): + mod.fullname = fullname + if len(list(mod.children())) == 0: + for index, p in enumerate(mod.parameters()): + p.reserved_name = '%s->p%d' % (fullname, index) + for child_name, child in mod.named_children(): + child_fullname = '%s->%s' % (fullname, child_name) + set_fullname(child, child_fullname) + + +def group_to_shape(group): + shape_list = [] + param_list = [] + buf_list = [] + mod_list = [] + + def travel_layer(mod): + if len(list(mod.children())) == 0: + mod_list.append(mod) + else: + for child in mod.children(): + travel_layer(child) + for mod in group: + travel_layer(mod) + + for mod in mod_list: + for p in mod.parameters(): + shape_list.append(p.shape) + param_list.append(p) + for mod in mod_list: + for key, buf in mod._buffers.items(): + if buf is not None and buf.dtype is torch.float32: + shape_list.append(buf.shape) + buf_list.append((mod, key)) + + return shape_list, param_list, buf_list, mod_list + + +def group_to_batch(group): + mod_list = [] + def travel_layer(mod): + if len(list(mod.children())) == 0: + mod_list.append(mod) + else: + for child in mod.children(): + travel_layer(child) + for mod in group: + travel_layer(mod) + + def pad(t, blockSize): + length = t.nelement() + size 
= length * t.element_size() + padded_size = blockSize * ((size + blockSize - 1) // blockSize) + padded_length = padded_size // t.element_size() + t_padded = torch.zeros(padded_length) + t_padded[:length] = t + return t_padded + + tensor_list = [] + for mod in mod_list: + for p in mod.parameters(): + tensor_list.append(pad(p.view(-1), kMinBlockSize)) + for mod in mod_list: + for _, buf in mod._buffers.items(): + if buf is not None and buf.dtype is torch.float32: + tensor_list.append(pad(buf.view(-1), kMinBlockSize)) + + if len(tensor_list) > 0: + batched_tensor = torch.cat(tensor_list) + else: + batched_tensor = None + + modname_list = [mod.fullname for mod in mod_list] + + return batched_tensor, modname_list + + +def group_to_para_shape(group): + mod_list = [] + shape_list = [] + + def travel_layer(mod): + if len(list(mod.children())) == 0: + mod_list.append(mod) + else: + for child in mod.children(): + travel_layer(child) + for mod in group: + travel_layer(mod) + + for mod in mod_list: + for p in mod.parameters(): + shape_list.append(p.shape) + + return shape_list diff --git a/software/util/README.md b/software/util/README.md new file mode 100644 index 0000000..8f67773 --- /dev/null +++ b/software/util/README.md @@ -0,0 +1,3 @@ +# Util + +This folder contains code that simplifies the TCP interface and prints timestamps. diff --git a/software/util/util.py b/software/util/util.py new file mode 100644 index 0000000..37815fd --- /dev/null +++ b/software/util/util.py @@ -0,0 +1,46 @@ +import sys +import time +import socket + +def timestamp(name, stage): + print('TIMESTAMP, %s, %s, %f' % (name, stage, time.time()), file=sys.stderr) + +class TcpServer(): + def __init__(self, address, port): + self.address = address + self.port = port + self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self.sock.bind((self.address, self.port)) + self.sock.listen(1) + + def __del__(self): + self.sock.close() + + def accept(self): + conn, address = self.sock.accept() + return conn, address + + +class TcpAgent: + def __init__(self, conn): + self.conn = conn + + def __del__(self): + self.conn.close() + + def send(self, msg): + self.conn.sendall(msg) + + def recv(self, msg_len): + return self.conn.recv(msg_len, socket.MSG_WAITALL) + + def settimeout(self, t): + self.conn.settimeout(t) + + +class TcpClient(TcpAgent): + def __init__(self, address, port): + super().__init__(None) + self.conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.conn.connect((address, port)) \ No newline at end of file
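A minimal sketch of how the `TcpServer`, `TcpAgent`, `TcpClient`, and `timestamp` helpers from `software/util/util.py` can be combined. The address, port, and 8-byte payloads here are illustrative assumptions, and the two halves run in separate processes:

```python
from util.util import TcpServer, TcpAgent, TcpClient, timestamp

# --- in the server process ---
server = TcpServer('localhost', 12345)   # bind + listen(1)
timestamp('server', 'listening')
conn, _ = server.accept()                # blocks until a client connects
agent = TcpAgent(conn)
request = agent.recv(8)                  # fixed-length read with MSG_WAITALL
agent.send(b'done\x00\x00\x00\x00')      # 8-byte illustrative reply

# --- in the client process ---
client = TcpClient('localhost', 12345)
timestamp('client', 'connected')
client.send(b'train\x00\x00\x00')        # 8-byte illustrative request
reply = client.recv(8)
```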